In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Build the full path of every file found below the Kaggle input directory,
# then print the paths one per line (same order os.walk yields them).
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the advertising CSV from the Kaggle input directory and preview the first rows
ADVERTISING_CSV = '../input/advertising/advertising.csv'
df = pd.read_csv(ADVERTISING_CSV)
df.head()

In [None]:
# Print the distinct values of the two text columns to see how many categories each holds
for text_column in ('Country', 'Ad Topic Line'):
    print(df[text_column].unique())

We can see that the two columns above are of little use for the classification problem because they contain too many distinct categories, so it is better to drop them from the dataset. We will also drop the "City" and "Timestamp" columns.

In [None]:
# Drop the text columns and the raw timestamp that will not be used as features.
# `df` is rebound instead of mutated with inplace=True, and errors='ignore' skips
# columns that are already gone, so re-running this cell is a no-op rather than a KeyError.
df = df.drop(columns=['Country', 'Ad Topic Line', 'Timestamp', 'City'], errors='ignore')

In [None]:
# Preview the frame again to confirm that only the remaining columns are left after the drop
df.head()

In [None]:
# Check the type of each remaining column (df.info() also lists the non-null count per column)
df.info()

In [None]:
# (number of rows, number of columns) of the frame at this point
df.shape

In [None]:
# Count the null entries in each column; a column of zeros means nothing is missing
df.isnull().sum()

Our data has no null values.

In [None]:
# Count NA entries per column.
# NOTE(review): pandas documents isnull() as an alias of isna(), so this is expected to
# repeat the counts from the isnull() check in the previous cell.
df.isna().sum()

Our data doesn't have any NA values.

## Let's visualize the data


In [None]:
# Heatmap of the boolean df.isnull() mask: any missing entry would show up as a
# contrasting mark against an otherwise uniform background.

# Apply the style BEFORE creating the figure so the figure itself is built with it
# (styling applied afterwards only affects artists created later).
plt.style.use('fivethirtyeight')
plt.figure(figsize=(12,8))
sns.heatmap(df.isnull(), yticklabels=False, cmap='RdBu', cbar=False)
plt.title('Missing Value')

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of df

plt.figure(figsize=(12, 8))
plt.style.use('fivethirtyeight')
corr_matrix = df.corr()
corr_axes = sns.heatmap(corr_matrix, annot=True, cmap='RdBu')
corr_axes.set_title('Correlation Matrix')

In [None]:
# Count plot of the 'Male' column: one bar per distinct value, bar height = number of rows.

plt.figure(figsize=(12,8))
plt.style.use('fivethirtyeight')
# Pass the column by keyword: recent seaborn releases treat the first positional
# argument as `data`, so a bare Series is no longer interpreted as the x variable.
sns.countplot(x='Male', data=df)

In [None]:
# Pairwise scatter plots of every column, coloured by the 'Clicked on Ad' label.
# sns.pairplot is a figure-level function that creates its own figure, so no
# plt.figure() call is made here (it would only leave an extra empty figure behind).
plt.style.use('fivethirtyeight')
sns.pairplot(df, hue='Clicked on Ad')

Here we can see that there is no way to draw a straight line that separates the two classes from one another. So rather than applying Logistic Regression, we should go for KNN or other algorithms.

In [None]:
# Look at the columns once more before separating the features from the 'Clicked on Ad' label
df.head()

In [None]:
# Separate the features (X) from the label (y).
# The label is selected by NAME in both statements, so X and y stay consistent even if
# 'Clicked on Ad' is not the last column of the frame.
TARGET_COLUMN = 'Clicked on Ad'
X = df.drop(columns=TARGET_COLUMN)
y = df[TARGET_COLUMN]

## Split the dataset first

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=32)
X_train, X_test, y_train, y_test = split_parts

## KNN Method with X_train, y_train

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Baseline KNN with k=5, scored on the held-out test split.
# KNN classifies by distance, so features measured on larger numeric scales would
# dominate the neighbour search. Standardizing inside a pipeline puts every feature on
# a comparable scale, and the scaler is fitted on the training data only.
knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5))
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# 10-fold cross-validation of the current `knn` estimator on the full data set;
# the cell displays the mean accuracy over the folds.

from sklearn.model_selection import cross_val_score

fold_accuracies = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
score = fold_accuracies.mean()
score

## Now let's see a graphical evaluation of our model for different values of k

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Mean 10-fold cross-validated accuracy for every k from 1 to 30.
# `score` ends up holding one mean accuracy per value in `k_range`, in the same order.
k_range = range(1,31)
score = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    # KNN is distance based, so the features are standardized first. Doing it inside a
    # pipeline means each CV fold fits the scaler on its own training part only.
    # `knn` itself is left as a plain KNeighborsClassifier.
    scaled_knn = make_pipeline(StandardScaler(), knn)
    mean_accuracy = cross_val_score(scaled_knn, X, y, cv=10, scoring='accuracy').mean()
    score.append(mean_accuracy)

In [None]:
# Line plot of the cross-validated score against the number of neighbours
plt.figure(figsize=(12, 8))
plt.style.use('fivethirtyeight')
score_axes = plt.gca()
score_axes.plot(k_range, score)
score_axes.set_xlabel('Value of K')
score_axes.set_ylabel('Scores')
plt.show()

### A small improvement for our model

In [None]:
# Grid search for KNN over the number of neighbours (1..30) and the weighting scheme,
# scored by 10-fold cross-validated accuracy.

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

k_range = range(1,31)
weight_options = ['uniform','distance']
# Build a fresh estimator here so the cell does not depend on whatever `knn` an earlier
# cell left behind. Features are standardized inside the pipeline because KNN is
# distance based; the scaler is re-fitted on the training part of every CV fold.
knn_pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier())
# make_pipeline names the step after the lower-cased class name, hence the prefix.
param_grid = dict(kneighborsclassifier__n_neighbors=k_range,
                  kneighborsclassifier__weights=weight_options)
grid = GridSearchCV(knn_pipeline, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
# Tune on the training split only: X_test is used afterwards to score the tuned model,
# so it must not take part in choosing the hyper-parameters.
grid.fit(X_train, y_train)
print(grid.best_score_)
print('\n')
# Show the five best-ranked combinations instead of dumping the whole cv_results_ dict
cv_table = pd.DataFrame(grid.cv_results_).sort_values('rank_test_score')
print(cv_table[['params', 'mean_test_score', 'std_test_score']].head())
print('\n')
print(grid.best_params_)
print('\n')
print(grid.best_estimator_)

In [None]:
from sklearn.base import clone

# Re-train the configuration chosen by the grid search on the training split and score it
# on the held-out test split.
# clone() copies the winning estimator's hyper-parameters without its fitted state, so the
# values always match the search that was just run instead of being copied by hand.
knn = clone(grid.best_estimator_)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy_score(y_test,y_pred)

# Decision Tree Method

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Grid search for a decision tree, scored by 5-fold cross-validated accuracy on the
# training split. The seed matches the one used for the final tree so repeated searches
# are comparable.
model = DecisionTreeClassifier(random_state=12)
max_depth = [2,4,6]
min_samples_split = [4,8,12,16]
min_samples_leaf = [4,8,12,16]
max_leaf_nodes = [15,20,25,30]
# 'chi2' was removed: it is not a split criterion that DecisionTreeClassifier accepts,
# so every candidate using it failed to fit and only wasted search time.
criterion = ['gini', 'entropy']

param_grid = dict(max_depth=max_depth,
                  min_samples_split=min_samples_split,
                  min_samples_leaf=min_samples_leaf,
                  max_leaf_nodes=max_leaf_nodes,
                  criterion=criterion
                 )

grid = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, scoring='accuracy')
grid.fit(X_train,y_train)

In [None]:
# Parameter combination that achieved the best mean cross-validated accuracy in the decision-tree search
grid.best_params_

In [None]:
# Mean cross-validated accuracy of that best combination (5 folds on the training split)
grid.best_score_

In [None]:
# Train a decision tree with the parameters selected by the grid search and score it on
# the held-out test split.
# The parameters are read from grid.best_params_ rather than typed in by hand, so this
# cell always matches the search that was just run. The seed keeps the tree repeatable.
model = DecisionTreeClassifier(random_state=12, **grid.best_params_)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test,y_pred)

#Now our model is much more accurate than the KNN model

## Random Forest Classification

**Now apply RandomForestClassifier with GridSearchCV**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Grid search for a random forest, scored by 5-fold cross-validated accuracy on the
# training split. Only the number of trees and the fraction of features tried per split
# are varied; the tree-shape parameters are fixed to single values.
# A seed is set so the bootstrap samples and feature subsets, and with them the ranking
# of the candidates, are repeatable between runs.
model = RandomForestClassifier(random_state=12)
max_depth = [6]
n_estimators = [100,200,300]
min_samples_split = [4]
min_samples_leaf = [4]
max_leaf_nodes = [15]
max_features = [.3,.4,.5]
# The `criterion` and `max_samples` lists that used to be defined here were never added
# to param_grid, so they had no effect on the search and have been removed.

param_grid = dict(max_depth=max_depth,
                  n_estimators=n_estimators,
                  min_samples_split=min_samples_split,
                  min_samples_leaf=min_samples_leaf,
                  max_leaf_nodes=max_leaf_nodes,
                  max_features=max_features,
                 )

grid = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, scoring='accuracy')
grid.fit(X_train,y_train)

In [None]:
# Parameter combination that achieved the best mean cross-validated accuracy in the random-forest search
grid.best_params_

In [None]:
# Mean cross-validated accuracy of that best combination (5 folds on the training split)
grid.best_score_

In [None]:
# The estimator object the search built with the best parameter combination
grid.best_estimator_

In [None]:
# Train a random forest with the parameters selected by the grid search and score it on
# the held-out test split.
# Reading grid.best_params_ keeps every tuned value, including n_estimators, in sync with
# the search. The seed makes the reported accuracy repeatable.
model = RandomForestClassifier(random_state=12, **grid.best_params_)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

**Using RandomForestClassifier, we can improve accuracy by 1% compared to the decision tree**