In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV , RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
import warnings
warnings.simplefilter('ignore')
%matplotlib inline

In [None]:
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and later
# dropped the old names, so pick whichever spelling this installation provides
# and fall back to the default style if neither exists.
style_name = next(
    (name for name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
     if name in plt.style.available),
    'default',
)
plt.style.use(style_name)
palette = plt.get_cmap('Set2')

In [None]:
# Load the advertising data set from the Kaggle input directory and preview
# its first five rows (the default for head()).
df = pd.read_csv('../input/advertising/advertising.csv')
df.head()

# Exploratory Data Analysis (EDA)

In [None]:
# Display the column labels of the loaded frame.
df.columns

In [None]:
# Print per-column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Summary statistics for the numeric columns (describe() default).
df.describe()

In [None]:
# Summary of the object-dtype (text) columns: count, unique, top, freq.
# The builtin `object` replaces the `np.object` alias, which NumPy has removed.
df.describe(include=[object])

In [None]:
# Parse the 'Timestamp' column into datetimes so that date parts (hour, weekday)
# can be derived from it in later cells.
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

In [None]:
# Histogram of daily time spent on the site; axes are labelled so the figure
# can be read on its own.
plt.figure(figsize=(10 , 5))
df['Daily Time Spent on Site'].hist()
plt.xlabel('Daily Time Spent on Site' , fontsize=15)
plt.ylabel('Count' , fontsize=15)
plt.show()

In [None]:
# Age distribution: density-normalised histogram with a KDE overlay.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is the
# supported way to draw the same kind of figure.
plt.figure(figsize=(10 , 5))
sns.histplot(data=df , x='Age' , bins=35 , kde=True , stat='density')
plt.xlabel('Age' , fontsize=15)
plt.ylabel('Density' , fontsize=15)
plt.show()

In [None]:
# Hour of day (0-23) of each impression, taken with the vectorized .dt accessor
# instead of a per-row lambda.
df['hour'] = df['Timestamp'].dt.hour
# Total number of clicks in that hour, broadcast back onto every row of the hour.
df['clicks_hour'] = df.groupby('hour')['Clicked on Ad'].transform('sum')

In [None]:
# Clicks per hour of day. x/y are passed by keyword: seaborn >= 0.12 no longer
# accepts them positionally, and the keyword form also works on older releases.
plt.figure(figsize=(10 , 5))
sns.lineplot(x='hour' , y='clicks_hour' , data=df)
plt.xlabel('Hours' , fontsize=15)
plt.ylabel('click counts' , fontsize=15)
plt.xticks(list(range(0,26 , 2)))
plt.show()

**The number of clicks is highest at hours 0, 7 and 9 (24-hour clock).**

In [None]:
# Day of week of each impression via the vectorized .dt accessor.
# pandas numbers the days Monday=0 ... Sunday=6.
df['day'] = df['Timestamp'].dt.dayofweek
# Total number of clicks on that weekday, broadcast back onto every row of the day.
df['clicks_day'] = df.groupby('day')['Clicked on Ad'].transform('sum')

In [None]:
# Translate weekday numbers into labels. pandas uses 0=Monday ... 6=Sunday, so
# the mapping starts at 'Mon'; starting at 'Sun' would shift every label by a day.
temp = df['day'].map({0:'Mon' , 1:'Tue' , 2:'Wed' , 3:'Thurs' , 4:'Fri' , 5:'Sat' , 6:'Sun'})
plt.figure(figsize=(10 , 5))
# x/y must be keywords for seaborn >= 0.12.
sns.lineplot(x=temp , y=df['clicks_day'])
plt.xlabel('Week days' , fontsize=15)
plt.ylabel('Click Counts' , fontsize=15)
plt.show()

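**The number of clicks is highest on Saturday, Wednesday and Tuesday, and lowest on Monday.**
*(Note: pandas numbers weekdays with Monday = 0 and Sunday = 6, so check these day names against the label mapping used for the plot above.)*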

In [None]:
# Age vs. area income, coloured by whether the ad was clicked.
# Columns are passed by keyword; seaborn >= 0.12 rejects positional x/y.
plt.figure(figsize=(20 , 6))
sns.scatterplot(data=df , x='Age' , y='Area Income' , hue='Clicked on Ad')
plt.xlabel('Age' , fontsize=15)
plt.ylabel('Area Income' , fontsize=15)
plt.show()

**People aged 20-40 with an area income between 45k and 80k click on ads more than others.**

In [None]:
# Age vs. daily time spent on the site, coloured by whether the ad was clicked.
# Columns are passed by keyword; seaborn >= 0.12 rejects positional x/y.
plt.figure(figsize=(20 , 6))
sns.scatterplot(data=df , x='Age' , y='Daily Time Spent on Site' , hue='Clicked on Ad')
plt.xlabel('Age' , fontsize=15)
# Axis label typo ("Tme") corrected.
plt.ylabel('Daily Time Spent on Site' , fontsize=15)
plt.show()

**People aged 20-40 who spend more than 60 minutes on the internet click on the ads more often.**

In [None]:
# Daily internet usage vs. daily time spent on the site, coloured by click outcome.
# Columns are passed by keyword; seaborn >= 0.12 rejects positional x/y.
plt.figure(figsize=(20 , 6))
sns.scatterplot(data=df , x='Daily Internet Usage' , y='Daily Time Spent on Site' , hue='Clicked on Ad')
plt.xlabel('Daily Internet Usage' , fontsize=15)
# Axis label typo ("Tme") corrected.
plt.ylabel('Daily Time Spent on Site' , fontsize=15)
plt.show()

**People who spend more time on the internet and more time browsing the site click on the ads more often.**

In [None]:
# Number of rows per value of the 'Male' column.
# The column is named explicitly: in seaborn >= 0.12 the first positional
# argument of countplot is `data`, not `x`, so a bare Series is misinterpreted.
plt.figure(figsize=(20 , 6))
sns.countplot(x='Male' , data=df)
plt.xlabel('Male' , fontsize=15)
plt.ylabel('Count' , fontsize=15)
plt.show()

In [None]:
# Exact row counts behind the bar chart above, one per distinct 'Male' value.
df['Male'].value_counts()

**We can say that male and female users surf about equally.**

In [None]:
# Pairwise relationships between the numeric columns, coloured by click outcome.
# pairplot is a figure-level function that builds its own figure, so a preceding
# plt.figure(...) call would only leave an extra empty figure in the output.
sns.pairplot(df , hue='Clicked on Ad')
plt.show()

# Preprocessing

In [None]:
# Feature matrix: five hand-picked columns; target: the 'Clicked on Ad' column.
feature_columns = [
    'Daily Time Spent on Site',
    'Age',
    'Area Income',
    'Daily Internet Usage',
    'Male',
]
target_column = 'Clicked on Ad'
X, y = df[feature_columns], df[target_column]

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=0,
)

In [None]:
# Learn each feature's min/max from the training rows only, then rescale both
# splits with those statistics so nothing about the test rows leaks into the fit.
scaler = MinMaxScaler(feature_range=(0, 1)).fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Logistic Regression Model

In [None]:
# Baseline: logistic regression with scikit-learn's default hyperparameters.
logmodel = LogisticRegression().fit(X_train, y_train)
y_pred = logmodel.predict(X_test)

In [None]:
# Score the current y_pred against the held-out labels; `lg` keeps the accuracy
# for the model comparison table.
lg = accuracy_score(y_test, y_pred)
print(
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    lg,
    sep='\n',
)

# Random Forest

In [None]:
# Random forest with default hyperparameters. The seed is fixed (same value as
# the train/test split) so the reported scores are identical after
# Restart & Run All; without it every run trains a different forest.
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)

In [None]:
# Score the current y_pred against the held-out labels; `rfs` keeps the accuracy
# for the model comparison table.
rfs = accuracy_score(y_test, y_pred)
print(
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    rfs,
    sep='\n',
)

# XGBoost

In [None]:
# Gradient-boosted trees with XGBoost's default hyperparameters.
xgb = XGBClassifier().fit(X_train, y_train)
y_pred = xgb.predict(X_test)

In [None]:
# Score the current y_pred against the held-out labels; `xgbs` keeps the accuracy
# for the model comparison table.
xgbs = accuracy_score(y_test, y_pred)
print(
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    xgbs,
    sep='\n',
)

# SVM

In [None]:
# Support vector classifier with scikit-learn's default hyperparameters.
svm = SVC().fit(X_train, y_train)
y_pred = svm.predict(X_test)

In [None]:
# Score the current y_pred against the held-out labels; `svcs` keeps the accuracy
# for the model comparison table.
svcs = accuracy_score(y_test, y_pred)
print(
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    svcs,
    sep='\n',
)

# Model Comparison

In [None]:
# Collect the held-out accuracy of each baseline model into one table.
accuracy_by_model = {
    'Logistic Regression': lg,
    'Random Forest': rfs,
    'XGBoost': xgbs,
    'SVC': svcs,
}
models = pd.DataFrame(
    list(accuracy_by_model.items()),
    columns=['Model', 'Accuracy_score'],
)
# Bar chart in insertion order; the displayed table below is ranked best-first.
sns.barplot(data=models, x='Accuracy_score', y='Model')
models.sort_values(by='Accuracy_score', ascending=False)

# Hyperparameter tuning of the logistic regression model

In [None]:
# Candidate hyperparameters for the logistic regression grid search.
# These names are read by the next cell when it builds the parameter grid.
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
# Values for the C parameter (scikit-learn: inverse regularisation strength).
c_values = [100, 10, 1.0, 0.1, 0.01]

In [None]:
# Exhaustive search over solver / penalty / C with 10-fold stratified CV repeated 3 times.
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
# Tune on the scaled training split only. Fitting on the full, unscaled X / y
# would let the held-out test rows influence the chosen hyperparameters and
# would not match the scaled data the final model is trained on.
grid_result = grid_search.fit(X_train, y_train)

In [None]:
# Report the winning configuration, then mean (std) CV accuracy for every candidate.
print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")
cv_results = grid_result.cv_results_
means, stds, params = (
    cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params')
)
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean:f} ({stdev:f}) with: {param!r}")

In [None]:
# Candidate hyperparameters for the SVC grid search.
# These names are read by the next cell when it builds the parameter grid.
model = SVC()
kernel = ['poly', 'rbf', 'sigmoid']
# Values for the SVC C parameter.
C = [50, 10, 1.0, 0.1, 0.01]
gamma = ['scale']

In [None]:
# Exhaustive search over kernel / C / gamma, scored by accuracy with
# 10-fold stratified CV repeated 3 times, fitted on the training split.
grid = {'kernel': kernel, 'C': C, 'gamma': gamma}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score=0,
)
grid_result = grid_search.fit(X_train, y_train)

In [None]:
# Report the winning configuration, then mean (std) CV accuracy for every candidate.
print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")
cv_results = grid_result.cv_results_
means, stds, params = (
    cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params')
)
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean:f} ({stdev:f}) with: {param!r}")

# Let's create our best model

In [None]:
# Refit logistic regression with fixed hyperparameters.
# NOTE(review): presumably these are the winners printed by the logistic grid
# search; confirm against that cell's output.
logmodel = LogisticRegression(C=10, penalty='l2', solver='newton-cg').fit(X_train, y_train)
y_pred = logmodel.predict(X_test)

In [None]:
# Score the current y_pred against the held-out labels; `lg` is overwritten with
# the new accuracy.
lg = accuracy_score(y_test, y_pred)
print(
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    lg,
    sep='\n',
)

### We have created a model with about 97% accuracy on the held-out test set