# Import Packages

In [None]:
# Pin Keras and TensorFlow to matching 2.6.0 releases. Each package is
# uninstalled first and then reinstalled at the pinned version.
!pip uninstall keras --yes 
!pip install keras==2.6.0
!pip uninstall tensorflow --yes
!pip install tensorflow==2.6.0 
print("pip installs complete")

In [None]:
from IPython.core.display import HTML
# Emit a <script> tag that asks the notebook front end to restart the kernel.
# NOTE(review): this depends on the front end exposing a `Jupyter.notebook`
# JavaScript object — confirm it works in the environment you run in.
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
import plotly
import sklearn
from sklearn import preprocessing

from tensorflow.keras.layers import IntegerLookup
from tensorflow.keras.layers import Normalization
from tensorflow.keras.layers import StringLookup

In [None]:
# Both libraries should report version 2.6.0
for library in (tf, keras):
    print(library.__version__)

# **Import Data**

In [None]:
# Read the advertising CSV (relative path) and preview the first rows
raw_data = pd.read_csv("../input/advertising-missingrecords/advertising_missingdata.csv")
raw_data.head()

In [None]:
# Summary statistics, transposed so each described column is shown as a row
raw_data.describe().transpose()

In [None]:
# Work on a copy so that raw_data itself is never modified
df = raw_data.copy()

In [None]:
# Preview the working copy
df.head()

**The goal of this project is to use the advertising dataset to build a classification model that predicts whether or not someone clicks on the ad.**

- First we will do some quick exploration of the data

In [None]:
# Column dtypes and non-null counts (reveals which columns have missing values)
df.info()

- The `Area Income` feature is missing 250 values.
- Let's explore the data to see which imputation method may work best.

# **Data Exploration**

In [None]:
# Distribution of "Daily Time Spent on Site", coloured by the "Male" column
import plotly.express as px

fig = px.histogram(
    df,
    x="Daily Time Spent on Site",
    color="Male",
    opacity=0.65,
    width=1200,
)
fig.show()

In [None]:
# Distribution of "Daily Internet Usage", coloured by the "Male" column
fig2 = px.histogram(
    df,
    x="Daily Internet Usage",
    color="Male",
    opacity=0.65,
    width=1200,
)
fig2.show()

**In this data, the `Area Income` distributions by gender are similar. Replacing missing `Area Income` values with a group-level measure of central tendency appears to be more representative of the sample.**
- We will explore other options after further exploration.

In [None]:
# Distribution of "Area Income", coloured by the "Male" column
fig3 = px.histogram(
    df,
    x="Area Income",
    color="Male",
    opacity=0.70,
    width=1200,
)
fig3.show()

**Based on the plot below, `Daily Time Spent on Site` and `Daily Internet Usage` appear to be important in determining whether or not a person clicks on the ad.**
- There is a clear linear separation between clicking on an ad and not clicking
- A simple model will likely be suitable for the classification task

In [None]:
import plotly.express as px

# Cast the two flag columns to strings so plotly colours/facets them as discrete categories
for flag_column in ("Clicked on Ad", "Male"):
    df[flag_column] = df[flag_column].astype(str)

fig4 = px.scatter(
    df,
    x="Daily Internet Usage",
    y="Daily Time Spent on Site",
    color="Clicked on Ad",
    facet_col="Male",
    opacity=0.75,
)
fig4.show()

**Many variables appear to have a clear linear separation**
- As aforementioned, this task should not require a sophisticated model to perform well.

In [None]:
# Pairwise relationships between the plotted columns, coloured by the target
sns.set_theme(style="ticks")
sns.pairplot(data=df, hue="Clicked on Ad")

**Let's explore the target distribution to see which model evaluation measures will be best:**

- The target variable, `Clicked on Ad` classes are balanced.
  - Accuracy will be a suitable performance metric for our model.

In [None]:
# Class counts of the target, with a gap between the bars
fig = px.histogram(
    df,
    x="Clicked on Ad",
    opacity=0.65,
    width=600,
)
fig.update_layout(bargap=0.2)
fig.show()

# **Data Cleaning**

## **Missing Values: Area Income**

- I will impute missing values in `Area Income` with means from Age Group

In [None]:
# Split Age into five equal-frequency bins; keep the interval labels as strings
# on the frame and display the binned Series itself
age_quintiles = pd.qcut(df['Age'], 5)
df['Age_bracket'] = age_quintiles.astype(str)
age_quintiles

In [None]:
# Mean Area Income per age bracket. The column is selected before aggregating
# so the mean is never attempted on the frame's non-numeric columns.
df.groupby('Age_bracket')['Area Income'].mean()

In [None]:
# Label the five age quintiles 1-5 straight from the data instead of matching
# hard-coded interval strings, which silently stop matching if the quantile
# edges change. The labels are stored as strings.
df['Age_bracket'] = pd.qcut(df['Age'], 5, labels=[1, 2, 3, 4, 5]).astype(str)
df.head()

In [None]:
# Median Area Income per age bracket (shown for inspection only)
Age_bracket = df.groupby('Age_bracket')['Area Income'].median()

# Iterate over the bracket labels themselves: the index holds string labels,
# so integer lookups would be positional and would mislabel the groups.
for bracket, median_income in Age_bracket.items():
    print('Median Area Income of age group {}: {}'.format(bracket, median_income))
print('Median Area Income overall: {}'.format(df['Area Income'].median()))

# Fill missing Area Income with the MEAN of the row's age bracket.
# transform('mean') returns a Series aligned to df's index, so the assignment
# does not depend on how groupby.apply shapes its result index.
df['Area Income'] = df['Area Income'].fillna(
    df.groupby('Age_bracket')['Area Income'].transform('mean'))

In [None]:
# Preview the frame after imputation
df.head()

In [None]:
# Rows still lacking Area Income — an empty result means every gap was imputed
df.loc[df['Area Income'].isna()]

## **Timestamp Data**
- I will extract some additional features from the Timestamp feature to make it more useful for our model
- From the Timestamp I will create:
  - Hour of Day
  - Month of Year
  - Day of Week
  - Day of Month

In [None]:
# Parse the Timestamp column once, then derive the calendar features from it
parsed_timestamp = pd.to_datetime(df['Timestamp'])
df['Time of Day'] = parsed_timestamp.dt.hour
df['Day of Month'] = parsed_timestamp.dt.day
df['Day Name'] = parsed_timestamp.dt.day_name()
df['Month of Year'] = parsed_timestamp.dt.month_name()
df.head()

In [None]:
# Cast the target to integers (it was converted to strings for plotting earlier)
df['Clicked on Ad'] = df['Clicked on Ad'].astype(int)

# **Preprocessing & Model Pipeline**

In [None]:
# Load packages to handle the Pipeline
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [None]:
# Column order for the modelling frame; the target comes last
features = [
    'Daily Time Spent on Site',
    'Age',
    'Area Income',
    'Daily Internet Usage',
    'Ad Topic Line',
    'City',
    'Male',
    'Country',
    'Timestamp',
    'Age_bracket',
    'Time of Day',
    'Day of Month',
    'Day Name',
    'Month of Year',
    'Clicked on Ad',
]
df = df.reindex(columns=features)

# Independent copy of the reordered frame, kept separate from later edits to df
data = df.copy()

In [None]:
# Separate the target, then discard the columns the models will not use
target = df.pop('Clicked on Ad')
df = df.drop(columns=['Age_bracket', 'Timestamp', 'Ad Topic Line', 'City'])

# Treat hour-of-day and day-of-month as categorical by casting them to strings
for calendar_column in ('Time of Day', 'Day of Month'):
    df[calendar_column] = df[calendar_column].astype(str)

## **Split data into training and validation sets**

In [None]:
# First hold out 20% of all rows as the validation set ...
X_train, X_val, y_train, y_val = train_test_split(
    df, target, test_size=0.2, random_state=0
)
# ... then hold out 20% of the remaining rows as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=0.2, random_state=0
)

In [None]:
# Report the number of rows in each split
for split_label, split_frame in (
    ("Training set shape: ", X_train),
    ("Validation set shape: ", X_val),
    ("Testing set shape: ", X_test),
):
    print(split_label, split_frame.shape[0])

## **Logistic Regression - Model Pipeline**

In [None]:
# Column groups handled by the preprocessing step
numeric_features = ['Daily Time Spent on Site', 'Age', 'Area Income', 'Daily Internet Usage']
categorical_features = ['Male', 'Country', 'Time of Day', 'Day of Month', 'Day Name', 'Month of Year']

# Numeric columns: median-impute (a safeguard; Area Income was already imputed), then standardise
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: one-hot encode, ignoring categories not seen during fit
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Apply each transformer to its own column group
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Preprocessing followed by a logistic-regression classifier
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

# Fit on the training split and report accuracy on the validation split
clf.fit(X_train, y_train)
validation_accuracy = clf.score(X_val, y_val)
print("Model Score: %.3f" % validation_accuracy)

## **Random Forest Classifier**

In [None]:
# Column groups handled by the preprocessing step
numeric_features = ['Daily Time Spent on Site', 'Age', 'Area Income', 'Daily Internet Usage']
categorical_features = ['Male', 'Country', 'Time of Day', 'Day of Month', 'Day Name', 'Month of Year']

# Numeric columns: median-impute (a safeguard; Area Income was already imputed), then standardise
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: one-hot encode, ignoring categories not seen during fit
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Fresh ColumnTransformer applying each transformer to its own column group
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Preprocessing followed by a random forest. The forest is seeded (same seed
# as the train/validation split) so the reported scores are reproducible
# across runs.
clf_rf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier(random_state=0))])

# Fit on the training split and report accuracy on the validation split
clf_rf.fit(X_train, y_train)
print("Model Score: %.3f" % clf_rf.score(X_val, y_val))

- **Another way to construct Pipeline by specifying data types to handle**

In [None]:
from sklearn.compose import make_column_selector as selector
from sklearn.linear_model import LogisticRegressionCV 

# Route columns to transformers by dtype instead of by name.
# The categorical columns of X_train are plain string (object) columns, not
# pandas "category" columns, so the encoder must select dtype "object";
# selecting "category" matches nothing and the ColumnTransformer would drop
# every categorical feature.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, selector(dtype_exclude="object")),
    ('cat', categorical_transformer, selector(dtype_include="object"))])

clf2 = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression())])


# Fit on the training split and report accuracy on the validation split
clf2.fit(X_train, y_train)
print("model score: %.3f" % clf2.score(X_val, y_val))

## Evaluation Metrics

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the logistic-regression pipeline on the test split
target_names = ['0', '1']
y_pred = clf.predict(X_test)
report_text = classification_report(y_test, y_pred, target_names=target_names)
print(report_text)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
np.set_printoptions(precision=2)

class_names = clf.classes_

# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use ConfusionMatrixDisplay.from_estimator where it exists and fall back to
# the legacy helper only on older releases.
if hasattr(ConfusionMatrixDisplay, "from_estimator"):
    ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test,
                                          display_labels=class_names,
                                          cmap=plt.cm.Blues)
else:
    from sklearn.metrics import plot_confusion_matrix
    plot_confusion_matrix(clf, X_test, y_test,
                          display_labels=class_names,
                          cmap=plt.cm.Blues)

## ROC Curve

In [None]:
# Create ROC Graph
from sklearn.metrics import roc_curve, roc_auc_score

# Positive-class probabilities for each model on the test split
lr_scores = clf.predict_proba(X_test)[:, 1]
rf_scores = clf_rf.predict_proba(X_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, lr_scores)
rf_fpr, rf_tpr, rf_thresholds = roc_curve(y_test, rf_scores)

# AUC must be computed from the same probability scores as the curves.
# Scoring hard 0/1 predictions collapses the ranking to a single threshold
# and yields an area that does not match the plotted curve.
lr_auc = roc_auc_score(y_test, lr_scores)
rf_roc_auc = roc_auc_score(y_test, rf_scores)


plt.figure(figsize=(9,7))

# Plot Logistic Regression ROC
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % lr_auc)

# Plot Random Forest ROC
plt.plot(rf_fpr, rf_tpr, label='Random Forest Classifier (area = %0.2f)' % rf_roc_auc)


# Plot Base Rate ROC (the diagonal of a random classifier)
plt.plot([0,1], [0,1],label='Base',color = "black", linestyle='dashed')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Graph')
plt.legend(loc="lower right")
plt.show()

**Found this nice classification function by [Fares Sayah](https://www.kaggle.com/faressayah)**

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    """Print accuracy, classification report and confusion matrix for one split.

    Args:
        clf: Fitted estimator exposing ``predict``.
        X_train, y_train: Features and labels of the training split.
        X_test, y_test: Features and labels of the test split.
        train: Truthy to report on the training split; a value equal to
            ``False`` to report on the test split. Any other value prints
            nothing.
    """
    if train:
        X_eval, y_eval = X_train, y_train
    elif train == False:
        X_eval, y_eval = X_test, y_test
    else:
        return

    predictions = clf.predict(X_eval)
    clf_report = pd.DataFrame(classification_report(y_eval, predictions, output_dict=True))

    # Only the training report is framed by the wide rules
    if train:
        print("_______________________________________________________________________")
        print("Train Result:\n================================================")
    else:
        print("Test Result:\n================================================")

    print(f"Accuracy Score: {accuracy_score(y_eval, predictions) * 100:.2f}%")
    print("_______________________________________________")
    print(f"CLASSIFICATION REPORT:\n{clf_report}")
    print("_______________________________________________")
    print(f"Confusion Matrix: \n {confusion_matrix(y_eval, predictions)}\n")

    if train:
        print("_______________________________________________________________________")


# Train report followed by test report, first for logistic regression, then for the random forest
for fitted_model in (clf, clf_rf):
    for report_on_train in (True, False):
        print_score(fitted_model, X_train, y_train, X_test, y_test, train=report_on_train)

In [None]:
from sklearn.metrics import average_precision_score

# Score the test rows with the pipeline's decision function and summarise
# the resulting precision-recall trade-off as a single number
y_score = clf.decision_function(X_test)
average_precision = average_precision_score(y_test, y_score)

summary_line = 'Average precision-recall score: {0:0.2f}'.format(average_precision)
print(summary_line)

# **Neural Network**
- I will train a neural network to predict Ad clicks based on the Advertising Dataset

In [None]:
# Remove the columns the neural network will not consume
unused_columns = ['Ad Topic Line', 'City', 'Timestamp', 'Age_bracket']
data = data.drop(columns=unused_columns)

In [None]:
# Hold out a reproducible 20% sample for validation; train on every other row
val_dataframe = data.sample(frac=0.2, random_state=10)
train_dataframe = data.drop(index=val_dataframe.index)

split_sizes = (len(train_dataframe), len(val_dataframe))
print("Using %d samples for training and %d for validation" % split_sizes)

In [None]:
def dataframe_to_dataset(dataframe):
    """Turn a DataFrame into a shuffled ``tf.data.Dataset`` of (features, label) pairs.

    The "Clicked on Ad" column becomes the label; the remaining columns are
    passed as a dict keyed by column name. The input frame is not modified.
    The shuffle buffer is as large as the frame.
    """
    feature_frame = dataframe.copy()
    labels = feature_frame.pop("Clicked on Ad")
    pairs = tf.data.Dataset.from_tensor_slices((dict(feature_frame), labels))
    return pairs.shuffle(buffer_size=len(feature_frame))


# Build the training dataset first, then the validation dataset
train_ds, val_ds = (
    dataframe_to_dataset(train_dataframe),
    dataframe_to_dataset(val_dataframe),
)

In [None]:
# Show one (features, label) example to confirm the dataset structure
single_example = train_ds.take(1)
for x, y in single_example:
    print("Input:", x)
    print("Target:", y)

In [None]:
# Group rows into mini-batches of 50 for training and evaluation
BATCH_SIZE = 50
train_ds, val_ds = train_ds.batch(BATCH_SIZE), val_ds.batch(BATCH_SIZE)

In [None]:
from tensorflow.keras.layers import IntegerLookup
from tensorflow.keras.layers import Normalization
from tensorflow.keras.layers import StringLookup


def encode_numerical_feature(feature, name, dataset):
    """Standardise one numeric input with a Normalization layer adapted on ``dataset``.

    Args:
        feature: Keras tensor (model input) to normalise.
        name: Key of the feature inside each element's feature dict.
        dataset: Dataset of (features_dict, label) elements used to learn
            the normalisation statistics.

    Returns:
        The normalised Keras tensor.
    """
    # Keep only this feature and give it a trailing axis of size 1
    column_ds = dataset.map(lambda features, label: tf.expand_dims(features[name], -1))

    # Learn the feature's statistics, then apply them to the model input
    scaler = Normalization()
    scaler.adapt(column_ds)
    return scaler(feature)

def encode_categorical_feature(feature, name, dataset, is_string):
    """Encode one categorical input as a multi-hot vector via a lookup layer.

    Args:
        feature: Keras tensor (model input) to encode.
        name: Key of the feature inside each element's feature dict.
        dataset: Dataset of (features_dict, label) elements used to learn
            the vocabulary.
        is_string: True to use ``StringLookup``, False to use ``IntegerLookup``.

    Returns:
        The encoded Keras tensor.
    """
    lookup_class = StringLookup if is_string else IntegerLookup
    # "multi_hot" is the current name of this output mode; "binary" is only
    # a deprecated alias for it.
    lookup = lookup_class(output_mode="multi_hot")

    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])
    feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))

    # Learn the set of possible values and assign each a fixed index
    lookup.adapt(feature_ds)

    # Map the raw input to its encoded vector
    encoded_feature = lookup(feature)
    return encoded_feature

In [None]:
# Cast the text-valued categorical columns of `data` to pandas' "string" dtype.
# NOTE(review): train_dataframe / val_dataframe and the tf.data datasets were
# already derived from `data` in earlier cells, so these casts do not reach
# them — confirm whether this cell was meant to run before the split.
data['Male'] = data['Male'].astype('string')
data['Country'] = data['Country'].astype('string')
data['Day Name'] = data['Day Name'].astype('string')
data['Month of Year'] = data['Month of Year'].astype('string')

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
from functools import partial

## **Compile Model**

In [None]:
# Build and compile the Keras model: one Input per feature, per-feature
# encoding adapted on train_ds, then a small regularised dense network.
tf.keras.backend.clear_session() # Resets all keras states

# Fix TensorFlow's global random seed
tf.random.set_seed(45)

# Each Input's name equals the column name used as the key in the dataset's
# feature dicts.

# Categorical features encoded as integers
time = keras.Input(shape=(1,), name="Time of Day", dtype="int64")
day_month = keras.Input(shape=(1,), name="Day of Month", dtype="int64")


# Categorical feature encoded as string
male = keras.Input(shape=(1,), name="Male", dtype="string")
country = keras.Input(shape=(1,), name="Country", dtype="string")
day_name = keras.Input(shape=(1,), name="Day Name", dtype="string")
month = keras.Input(shape=(1,), name="Month of Year", dtype="string")

# Numerical features
internet_usage = keras.Input(shape=(1,), name="Daily Internet Usage")
income = keras.Input(shape=(1,), name="Area Income")
time_on_site = keras.Input(shape=(1,), name="Daily Time Spent on Site")
age = keras.Input(shape=(1,), name="Age")


# Every raw input of the model, passed to keras.Model below
all_inputs = [
    time,
    day_month,
    male,
    country,
    day_name,
    month,
    internet_usage,
    income,
    time_on_site,
    age
]

# Vocabularies and normalisation statistics below are learned from train_ds only

# Integer categorical features
time_encoded = encode_categorical_feature(time, "Time of Day", train_ds, False)
day_month_encoded = encode_categorical_feature(day_month, "Day of Month", train_ds, False)


# String categorical features
male_encoded = encode_categorical_feature(male, "Male", train_ds, True)
country_encoded = encode_categorical_feature(country, "Country", train_ds, True)
day_name_encoded = encode_categorical_feature(day_name, "Day Name", train_ds, True)
month_encoded = encode_categorical_feature(month, "Month of Year", train_ds, True)

# Numerical features
internet_usage_encoded = encode_numerical_feature(internet_usage, "Daily Internet Usage", train_ds)
income_encoded = encode_numerical_feature(income, "Area Income", train_ds)
time_on_site_encoded = encode_numerical_feature(time_on_site, "Daily Time Spent on Site", train_ds)
age_encoded = encode_numerical_feature(age, "Age", train_ds)


# Join all encoded features into a single feature vector per example
all_features = layers.concatenate(
    [
        time_encoded,
        day_month_encoded,
        male_encoded,
        country_encoded,
        day_name_encoded,
        month_encoded,
        internet_usage_encoded,
        income_encoded,
        time_on_site_encoded,
        age_encoded
    ]
)


# Create thin wrapper for Dense layer
RegularizedDense = partial(keras.layers.Dense,
                           activation = "relu",
                           # Add regularization to improve generalization
                           kernel_regularizer = keras.regularizers.l2(0.03))

# Model Structure: three ReLU layers (64 -> 32 -> 16), dropout, sigmoid output
x = RegularizedDense(64)(all_features)
x = RegularizedDense(32)(x)
x = RegularizedDense(16)(x)
x = layers.Dropout(0.50)(x) # Help prevent overfitting
output = layers.Dense(1, activation="sigmoid")(x)


# Compile Model (positional arguments are the optimizer and the loss)
model = keras.Model(all_inputs, output)
model.compile("adam", "binary_crossentropy", metrics=["accuracy"])

## **Plot Model Architecture**

In [None]:
# Draw the layer graph left-to-right with tensor shapes shown
keras.utils.plot_model(model, show_shapes=True, rankdir="LR")

## **Train Model**

In [None]:
# Early Stopping Callback: stop once val_loss has not improved for 4 epochs
# and restore the weights of the best epoch
es_callback = keras.callbacks.EarlyStopping(monitor="val_loss", min_delta=0, patience=4,
                                            restore_best_weights=True)

# Callback to control learning rate: cut it to 20% after 3 stagnant epochs.
# The floor must lie below the optimizer's starting rate (Adam defaults to
# 0.001). With min_lr=0.001 the rate already sits at the floor, so the
# callback can never lower it.
reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.20,
                              patience=3, min_lr=1e-5)

In [None]:
# Train for up to 50 epochs, validating on val_ds each epoch with both callbacks attached
history = model.fit(train_ds, epochs=50, validation_data=val_ds, callbacks=[reduce_lr, es_callback])

## **Evaluate**

In [None]:
# Evaluate the trained model on the validation batches
model.evaluate(x = val_ds)

In [None]:
def visualize_accuracy(history, title):
    """Plot training and validation accuracy per epoch.

    Args:
        history: Object whose ``history`` dict holds the "accuracy" and
            "val_accuracy" series (as returned by ``model.fit``).
        title: Title placed above the figure.
    """
    recorded = history.history
    train_curve = recorded["accuracy"]
    val_curve = recorded["val_accuracy"]
    epoch_axis = range(len(train_curve))

    plt.figure(figsize=(9, 7))
    for curve, line_format, legend_label in (
        (train_curve, "b", "Training accuracy"),
        (val_curve, "r", "Validation accuracy"),
    ):
        plt.plot(epoch_axis, curve, line_format, label=legend_label)
    plt.title(title)
    plt.xlabel("Epochs")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.show()


visualize_accuracy(history, "Training and Validation Accuracy")

In [None]:
def visualize_loss(history, title):
    """Plot training and validation loss per epoch.

    Args:
        history: Object whose ``history`` dict holds the "loss" and
            "val_loss" series (as returned by ``model.fit``).
        title: Title placed above the figure.
    """
    recorded = history.history
    train_curve = recorded["loss"]
    val_curve = recorded["val_loss"]
    epoch_axis = range(len(train_curve))

    plt.figure(figsize=(9, 7))
    for curve, line_format, legend_label in (
        (train_curve, "b", "Training loss"),
        (val_curve, "r", "Validation loss"),
    ):
        plt.plot(epoch_axis, curve, line_format, label=legend_label)
    plt.title(title)
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()


visualize_loss(history, "Training and Validation Loss")