# Ad Click Prediction Classification

Going to take the following approach:

1. Problem definition
2. Data
3. Evaluation
4. Features
5. Modelling
6. Model Evaluation
7. Experimentation / Improvements

# 1. Problem Definition

How can we use various Python-based machine learning models and the given features to predict whether a customer purchased or not?

# 2. Data

Data from: https://www.kaggle.com/jahnveenarang/cvdcvd-vd

# 3. Evaluation

As this is a classification problem, we will use classification metrics for evaluating the model.

# 4. Features

## inputs / features
    1. User ID - Customer Unique Id
    2. Gender - Gender of a customer - M/F
    3. Age - Age of a customer
    4. EstimatedSalary - Estimated salary of a customer

## Output / label
    5. Purchased - Whether they purchased or not after Ad click 1/0

## Standard Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import os

# Print the full path of every file found under /kaggle/input.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

## Reading the Dataset

In [None]:
# Kaggle location of the dataset. When running locally, point this at
# 'Social_Network_Ads.csv' instead.
csv_path = '/kaggle/input/cvdcvd-vd/Social_Network_Ads.csv'

df = pd.read_csv(csv_path)
df.head()

## Data Exploration

In [None]:
df

In [None]:
# Number of distinct values in the User ID column.
df['User ID'].unique().size

As User ID is simply a unique identifier per customer, it carries no predictive information, so we will drop the User ID column.

In [None]:
# Remove the identifier column; the remaining columns are kept unchanged.
df = df.drop(columns=['User ID'])

In [None]:
df

In [None]:
# Bar chart of how many rows fall into each Purchased class.
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title('Value count of Labels')
sns.countplot(data=df, x='Purchased', ax=ax);

As we can see, the data is imbalanced.

In [None]:
# Bar chart of how many rows there are per Gender value.
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title('Value count of gender')
sns.countplot(data=df, x='Gender', ax=ax);

In [None]:
# Gender counts split by the Purchased label (one bar colour per class).
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title('Value count of gender who Purchase or not')
sns.countplot(data=df, x='Gender', hue='Purchased', ax=ax);

In [None]:
# Distribution of Age: 25-bin histogram with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title('Histogram of age')
sns.histplot(data=df, x='Age', bins=25, kde=True, ax=ax);

In [None]:
# Distribution of EstimatedSalary: 25-bin histogram with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title('Histogram of EstimatedSalary')
sns.histplot(data=df, x='EstimatedSalary', bins=25, kde=True, ax=ax);

In [None]:
# One box of EstimatedSalary values for every distinct Age.
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title('Plot of Age vs EstimatedSalary')
sns.boxplot(data=df, x='Age', y='EstimatedSalary', ax=ax);

In [None]:
# Age against EstimatedSalary, coloured by the Purchased label.
# Large, semi-transparent markers keep overlapping points visible.
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title('Plot of Age vs Estimated Salary vs Purchased or not')
sns.scatterplot(
    data=df,
    x='Age',
    y='EstimatedSalary',
    hue='Purchased',
    s=150,
    alpha=0.5,
    ax=ax,
);

In [None]:
# Pairwise correlation (pandas default method) of the one-hot encoded frame,
# drawn as an annotated heatmap.
pearson_corr = pd.get_dummies(df).corr()

fig, ax = plt.subplots(figsize=(20, 20))
ax.set_title('Heatmap of Pearson corrlation')
sns.heatmap(data=pearson_corr, annot=True, ax=ax);

In [None]:
# Spearman rank correlation of the one-hot encoded frame, drawn as an
# annotated heatmap.
spearman_corr = pd.get_dummies(df).corr(method='spearman')

fig, ax = plt.subplots(figsize=(20, 20))
ax.set_title('Heatmap of Spearman corrlation')
sns.heatmap(data=spearman_corr, annot=True, ax=ax);

# 5. Modelling

In [None]:
# Target: the Purchased column.
y = df['Purchased']

# Features: every other column, one-hot encoded with the first level of each
# categorical dropped.
X = pd.get_dummies(df.drop(columns='Purchased'), drop_first=True)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
split = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split

In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Model Imports

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

## Baseline Model Scores

In [None]:
from warnings import filterwarnings

In [None]:
filterwarnings('ignore')

In [None]:
def fit_and_score(models, X_train, X_test, y_train, y_test):
    """Fit every model on the training split and score it on the test split.

    Args:
        models: Mapping of display name -> estimator exposing ``fit(X, y)``
            and ``score(X, y)``. Estimators are fitted in place.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        A DataFrame indexed by model name with a single ``'Score'`` column,
        sorted by score in ascending order.
    """
    # Seed NumPy's global generator before any model is fitted.
    np.random.seed(42)

    def _fit_then_score(estimator):
        estimator.fit(X_train, y_train)
        return estimator.score(X_test, y_test)

    # Dict comprehension keeps the original order: fit, then score, one
    # model at a time.
    results = {label: _fit_then_score(estimator)
               for label, estimator in models.items()}

    score_table = pd.DataFrame(results, index=['Score']).T
    return score_table.sort_values('Score')

In [None]:
# Baseline candidates. All use their library defaults except
# LogisticRegression, which is given a larger iteration budget.
baseline_estimators = [
    LogisticRegression(max_iter=10000),
    KNeighborsClassifier(),
    SVC(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    XGBClassifier(),
    XGBRFClassifier(),
    LGBMClassifier(),
]

# Key each estimator by its class name.
models = {type(estimator).__name__: estimator
          for estimator in baseline_estimators}

In [None]:
baseline_model_scores = fit_and_score(models, X_train, X_test, y_train, y_test)

In [None]:
baseline_model_scores

In [None]:
# Bar chart of the baseline scores, one bar per model.
# The values come from each model's `.score()` on the test split, which for
# classifiers is accuracy (not precision), so the title is labelled to match.
plt.figure(figsize=(20,10))
# Transposing the sorted frame turns every model into its own column/bar.
sns.barplot(data=baseline_model_scores.sort_values('Score').T)
plt.title('Baseline Model Accuracy Score')
plt.xticks(rotation=90);

From the baseline modelling we can see that the top models are:
1. SVC 	0.933333
1. XGBRFClassifier 	0.933333

we can try tuning the hyperparams to check if the model improves

## Random Search CV

As the data is imbalanced, we will use the F1 score as the scoring metric.

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
def randomsearch_cv_scores(models, params, X_train, X_test, y_train, y_test):
    """Run a randomized hyper-parameter search for every model.

    Each search samples 20 candidates, uses 5-fold cross-validation on the
    training split and F1 as the scoring metric.

    Args:
        models: Mapping of name -> estimator to tune.
        params: Mapping of the same names -> parameter distributions passed
            to ``RandomizedSearchCV``.
        X_train: Training features used for the search.
        X_test: Test features used for the reported score.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        A ``(scores, best_params)`` tuple of dicts keyed by model name:
        ``scores`` holds the fitted search's ``score`` on the test split and
        ``best_params`` holds its ``best_params_``.
    """
    # Seed NumPy's global generator before any search is run.
    np.random.seed(42)

    # Settings shared by every search.
    search_options = {
        'scoring': 'f1',
        'cv': 5,
        'n_iter': 20,
        'n_jobs': 1,
        'verbose': 0,
    }

    test_scores, best_params = {}, {}
    for label, estimator in models.items():
        search = RandomizedSearchCV(estimator,
                                    param_distributions=params[label],
                                    **search_options)
        search.fit(X_train, y_train)
        test_scores[label] = search.score(X_test, y_test)
        best_params[label] = search.best_params_

    return test_scores, best_params

In [None]:
# The two best baseline models, to be tuned with the randomized search.
models = {'SVC': SVC(),
         'XGBRFClassifier': XGBRFClassifier()}

# Search space per model, keyed by the same names as `models`.
# NOTE(review): the `gamma` and `learning_rate` grids below start at 0 --
# confirm that 0 is an intended candidate value for each estimator.
params = {'SVC':{'C' : np.linspace(0.1,0.9, 9),
                # 'poly' is the polynomial kernel; the previous spelling
                # 'ploy' is not a valid SVC kernel name.
                'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
                'gamma': np.linspace(0,1,11),
                },
          'XGBRFClassifier':{'n_estimators': [2,5,10,20,50,100,200],
                             'learning_rate':np.linspace(0,1,11),
                             'gamma': np.linspace(0,1,11)}
         }

### RS model 1

In [None]:
model_rs_scores_1, model_rs_best_param_1 = randomsearch_cv_scores(models, params, X_train, X_test, y_train, y_test)

In [None]:
model_rs_scores_1

In [None]:
model_rs_best_param_1

From the random search (5-fold CV), we found that the SVC model performs the best, with a CV F1 mean score of 91.67%.
We will base the model evaluation on the SVC.

# 6. Model Evaluation

In [None]:
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

# `plot_confusion_matrix` and `plot_roc_curve` were removed from scikit-learn
# in 1.2. On newer versions, bind the same names to the replacement
# `*Display.from_estimator` constructors, which take the same
# (estimator, X, y) arguments, so the plotting cells work either way.
try:
    from sklearn.metrics import plot_confusion_matrix, plot_roc_curve
except ImportError:
    from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay

    plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator
    plot_roc_curve = RocCurveDisplay.from_estimator

In [None]:
# Hyper-parameters for the final SVC.
svc_settings = {'kernel': 'rbf', 'gamma': 0.7, 'C': 0.6}
model = SVC(**svc_settings)

In [None]:
# Train on the training split, then predict labels for the test split.
fitted_model = model.fit(X_train, y_train)
y_preds = fitted_model.predict(X_test)

## Classification Report

In [None]:
print(classification_report(y_test,y_preds))

## Confusion Matrix

In [None]:
plot_confusion_matrix(model,X_test,y_test)

## ROC Curve

In [None]:
plot_roc_curve(model,X_test,y_test)

## Evaluation using cross-validation

In [None]:
def get_cv_score(model, X, y, cv=5):
    """Cross-validate ``model`` on accuracy, precision, recall and F1.

    One ``cross_val_score`` run is made per metric. The per-fold scores and
    their mean are printed for each metric, and the means are returned.

    Args:
        model: Estimator to evaluate.
        X: Feature matrix.
        y: Labels.
        cv: Cross-validation setting forwarded to ``cross_val_score``
            (defaults to 5).

    Returns:
        A one-row DataFrame (index ``0``) with the columns ``'Accuracy'``,
        ``'Precision'``, ``'Recall'`` and ``'f1'`` holding the mean score of
        each metric.
    """
    # (summary column name, scikit-learn scoring name), in reporting order.
    metrics = (('Accuracy', 'accuracy'),
               ('Precision', 'precision'),
               ('Recall', 'recall'),
               ('f1', 'f1'))

    mean_scores = {}
    for column, scoring in metrics:
        # Forward the caller's `cv` instead of a hard-coded fold count.
        fold_scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
        print(f'Cross Validaion {scoring} Scores: {fold_scores}')
        print(f'Cross Validation {scoring} Mean Score: {fold_scores.mean()}')
        # Each column is filled from its own metric, so 'f1' holds the F1
        # mean rather than a copy of the recall mean.
        mean_scores[column] = fold_scores.mean()

    cv_merics = pd.DataFrame(mean_scores, index=[0])

    return cv_merics

In [None]:
cv_merics = get_cv_score(model, X_train, y_train, cv=5)

In [None]:
cv_merics

With the SVC model, we are able to get the following:

    Accuracy 	0.903571 
    Precision 	0.841834
    Recall 	    0.895789
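    f1          0.895789

Note: this f1 figure is identical to Recall because, when these numbers were produced, the summary table's `f1` column was filled from the recall scores (`cv_recall.mean()`); refer to the printed "Cross Validation f1 Mean Score" line for the actual F1.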
	 	 	