In [17]:
import numpy as np
import pandas as pd
import cv2
import os
import requests
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.preprocessing import LabelEncoder
import base64

# Function to download images
def download_image(url, folder, timeout=10):
    """Save the image referenced by ``url`` into ``folder``.

    Two kinds of ``url`` are handled:

    * ``data:image...;base64,<payload>`` URIs are decoded and written to
      ``image_<n>.jpg``, where ``<n>`` is the number of entries currently
      in ``folder``.
    * Anything else is fetched with ``requests.get`` and, on HTTP 200,
      written under the last path segment of the URL (query string removed).

    Problems with a single image (a data URI that is not base64, a corrupt
    payload, a network/URL error) are reported with ``print`` and skipped so
    one bad ``<img>`` does not abort a whole scraping run.

    Args:
        url: Image location; either a ``data:image`` URI or a fetchable URL.
        folder: Existing directory the file is written into.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        None.
    """
    if url.startswith('data:image'):
        _, marker, payload = url.partition(';base64,')
        if not marker:
            # Data URIs may also be URL-encoded (typical for inline SVG).
            # Base64-decoding those would silently produce a garbage file.
            print(f"Skipping data URI without base64 payload: {url[:40]}")
            return
        try:
            image_data = base64.b64decode(payload)
        except ValueError:  # binascii.Error is a ValueError subclass
            print(f"Skipping data URI with invalid base64 payload: {url[:40]}")
            return

        # Save the decoded bytes under a sequential name
        filename = os.path.join(folder, f'image_{len(os.listdir(folder))}.jpg')
        with open(filename, 'wb') as f:
            f.write(image_data)
        return

    # For regular image URLs
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Covers timeouts, connection errors and schema-less (relative) URLs.
        print(f"Error downloading {url}: {exc}. Skipping...")
        return
    if response.status_code != 200:
        return

    name = url.split("/")[-1].split("?")[0]
    if not name:
        # URL ended with "/": without a fallback the path would be the
        # folder itself and open() would fail.
        name = f'image_{len(os.listdir(folder))}'
    with open(os.path.join(folder, name), 'wb') as f:
        f.write(response.content)

# Create directories for saving images
os.makedirs('ads', exist_ok=True)
os.makedirs('non-ads', exist_ok=True)


def _scrape_images(page_url, folder):
    """Download every ``<img>`` with a ``src`` found on ``page_url``.

    Args:
        page_url: Address of the HTML page to scan.
        folder: Existing directory handed to ``download_image``.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urljoin

    page = requests.get(page_url, timeout=10)
    parsed = BeautifulSoup(page.text, 'html.parser')
    for tag in parsed.find_all('img'):
        # Lazy-loaded images often carry no ``src``; indexing the tag
        # directly would raise KeyError and stop the scrape.
        src = tag.get('src')
        if not src:
            continue
        # Resolve relative and protocol-relative sources against the page;
        # absolute URLs and data: URIs pass through unchanged.
        download_image(urljoin(page_url, src), folder)


# Scraping ad images from Google Display Network Gallery
ad_url = 'https://www.google.com/ads/gallery/'
_scrape_images(ad_url, 'ads')

# Scraping non-ad images from Unsplash
non_ad_url = 'https://unsplash.com/'
_scrape_images(non_ad_url, 'non-ads')

# Build the labelled file index: one record per downloaded file, tagged
# with the class implied by the folder it lives in.
ad_files = os.listdir('ads')
non_ad_files = os.listdir('non-ads')

ad_data = [
    dict(filename=os.path.join('ads', name), label='ad')
    for name in ad_files
]
non_ad_data = [
    dict(filename=os.path.join('non-ads', name), label='non-ad')
    for name in non_ad_files
]

# Persist the index so the exact file list can be inspected later.
df = pd.DataFrame([*ad_data, *non_ad_data])
df.to_csv('dataset.csv', index=False)

# Data preprocessing: load every indexed file, resize it to the network's
# input size and collect it together with its label.
X = []
y = []
for row in df.itertuples(index=False):
    img = cv2.imread(row.filename)
    if img is not None:  # Check if the image is loaded successfully
        img = cv2.resize(img, (224, 224))  # Resize images to a fixed size
        X.append(img)
        y.append(row.label)
    else:
        print(f"Error loading image: {row.filename}. Skipping...")

if not X:
    # Fail here with a clear message instead of deep inside train_test_split.
    raise ValueError("No images could be loaded from 'ads' / 'non-ads'.")

# Convert lists to numpy arrays.
# float32 halves the memory of the default float64 result.
X = np.array(X, dtype=np.float32) / 255.0  # Normalize pixel values to [0, 1]
y = np.array(y)

# Define class labels
class_labels = ['ad', 'non-ad']

# Encode labels as integers
label_encoder = LabelEncoder()
label_encoder.fit(class_labels)
y_encoded = label_encoder.transform(y)

# Split dataset into training and testing sets (80 % / 20 %, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Model building: four Conv2D + MaxPooling2D stages followed by a dense
# head with a single sigmoid output for the two-class problem.
model = models.Sequential([
    # Declare the input with an explicit Input layer; passing
    # ``input_shape=`` to the first Conv2D makes Keras emit a warning.
    layers.Input(shape=(224, 224, 3)),
    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dense(512, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

# Binary cross-entropy matches the single sigmoid output unit.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model; 20 % of the training data is held out for validation
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model on the untouched test split
test_loss, test_acc = model.evaluate(X_test, y_test)
print("Test Accuracy:", test_acc)

# Generate classification report: threshold the sigmoid outputs at 0.5
y_pred = (model.predict(X_test) > 0.5).astype("int32").flatten()

# Map numerical labels back to original class names
class_names = label_encoder.inverse_transform([0, 1])

# Pass ``labels`` explicitly: if the test split or the predictions contain
# only one class, the report would otherwise raise because the number of
# detected classes no longer matches ``target_names``.
print(classification_report(y_test, y_pred, labels=[0, 1], target_names=class_names))

Error loading image: non-ads\1pixel.gif. Skipping...
Error loading image: non-ads\m. Skipping...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 532ms/step - accuracy: 0.8125 - loss: 0.7711 - val_accuracy: 0.8889 - val_loss: 0.4724
Epoch 2/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 423ms/step - accuracy: 0.8543 - loss: 0.4621 - val_accuracy: 0.8889 - val_loss: 0.2802
Epoch 3/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 428ms/step - accuracy: 0.8960 - loss: 0.3439 - val_accuracy: 0.9722 - val_loss: 0.2076
Epoch 4/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 421ms/step - accuracy: 0.9271 - loss: 0.2824 - val_accuracy: 0.9444 - val_loss: 0.2055
Epoch 5/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 417ms/step - accuracy: 0.9213 - loss: 0.2862 - val_accuracy: 0.9722 - val_loss: 0.1680
Epoch 6/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 421ms/step - accuracy: 0.9635 - loss: 0.1495 - val_accuracy: 0.9722 - val_loss: 0.1632
Epoch 7/10
[1m5/5[0m [32m━━━━━━━━━━━━