In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then echo each one so the available datasets are visible in the cell output.
input_paths = (
    os.path.join(folder, entry)
    for folder, _, entries in os.walk('/kaggle/input')
    for entry in entries
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# A simple and direct walkthrough of classification with the KNN classifier.

In this notebook we will see how quickly, with only minimal data exploration, we can build a model that makes predictions for a classification dataset.

Note: we will import each library at the point where it is first needed, so you can understand what each library is used for.

Let's get Started...

# First Step : Data 

In [None]:
# Load the advertising dataset; `df` is the working frame used by every later cell.
df = pd.read_csv('../input/advertising/advertising.csv')
# describe must be *called*: the bare attribute only echoes the bound method
# rather than computing the summary statistics of the numeric columns.
df.describe()

Data looks good but we require few more information before creating our prediction model.

What are data types ?

In [None]:
# Show the dtype of every column so columns needing conversion can be spotted.
df.dtypes

There is a Timestamp column which contains date and time data, so we will convert it to the correct data type.

In [None]:
# Parse the raw Timestamp values once, then store each one as the elapsed
# time since the earliest record in the dataset.
parsed_ts = pd.to_datetime(df['Timestamp'])
df['Timestamp'] = parsed_ts - parsed_ts.min()
df.dtypes

Now data seems Good....
But Are there any null value in the data ? Let us check....

In [None]:
# Count the missing values in each column.
df.isna().sum()

Perfect! No null values.
Hey wait, we cannot build a model with object-dtype columns like City and Country.
We need to convert them...

In [None]:
# One-hot encode both categorical columns in a single pass; the City
# indicator columns are appended first, followed by the Country ones.
categorical_columns = ['City', 'Country']
df = pd.get_dummies(df, columns=categorical_columns, dtype=float)

# Second Step : Data Preparation (Inputs & Outputs)

Now that the dataset is prepared, we will divide the data into parts for further analysis.

Now we will create two variables: X, containing the features for our model, and y, containing the outcome the model has to learn from. Simply put, y holds the values we want to predict.

In [None]:
# Target: the label the model has to predict.
y = df['Clicked on Ad']

# Features: everything except the target, the free-text ad headline and the timestamp.
excluded_columns = ['Clicked on Ad', 'Ad Topic Line', 'Timestamp']
X = df.drop(excluded_columns, axis=1)
X.dtypes

If we analyse the Feature (X),

It is easy to see that having 1211 features for only 1000 examples is not a good scenario for training a classification model.

Here we need to identify the features that have the greatest impact on our target classification output. For this, we will use SelectKBest from scikit-learn.

We will look at the 10 highest-scoring features to decide which ones to use for training our model.

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Number of top-ranked features to keep and to report. A single constant keeps
# the selector and the printed table in agreement (previously k=100 vs. a top-10 table).
TOP_K = 10

# Score every candidate feature against the target with the chi-squared statistic.
# NOTE(review): the scores are fitted on the full dataset, i.e. before the
# train/test split performed later in the notebook.
best = SelectKBest(score_func=chi2, k=TOP_K)
fit1 = best.fit(X, y)

# Pair each feature name with its score and show the strongest ones.
scores = pd.DataFrame({'Feature': X.columns, 'Score': fit1.scores_})
print(scores.nlargest(TOP_K, 'Score'))

Wow, we worked hard on the countries and cities, but in comparison to the other features we can neglect them.

Why? You may ask...

The answer is that training a model on features that carry little importance for the prediction can have adverse effects on our predictions: the precision and F1 score of the final result will be affected.

So we will limit our features (X) to first Four. 

In [None]:
# Keep only the four highest-scoring numeric features for modelling.
selected_features = [
    'Daily Time Spent on Site',
    'Age',
    'Area Income',
    'Daily Internet Usage',
]
X = X.loc[:, selected_features]
X.head()

Before creating a model, we must divide the data into training and testing datasets so that, after creating the model, we can check its performance.

train_test_split is the most common option for that, Right !

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 37% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.37, random_state=1)
X_train, X_test, y_train, y_test = split_parts

As you can see, our features span very different ranges of values:
1. Area Income - five digits
2. Daily Time Spent on Site - two digits
3. Age - two digits
4. Daily Internet Usage - three digits

So we first have to scale them to the range 0 to 1, based on the maximum and minimum values found in each particular column of the dataset.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn each column's min/max from the training rows only, then apply that
# same mapping to both the training and the test features.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Third Step : Create a First KNN Model 

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, classification_report

# Baseline: a 1-nearest-neighbour classifier fitted on the scaled training data.
knn = KNeighborsClassifier(n_neighbors=1)
model = knn.fit(X_train, y_train)  # fit() returns the estimator itself
knn_predict = model.predict(X_test)

# Evaluate the predictions on the held-out rows.
knn_acc_score = accuracy_score(y_test, knn_predict)
knn_conf_matrix = confusion_matrix(y_test, knn_predict)

# Heading, matrix and a blank-line spacer, each on its own line.
print("confusion matrix", knn_conf_matrix, "\n", sep="\n")
accuracy_pct = knn_acc_score*100
print("Accuracy of k-NN Classification:", accuracy_pct, '\n')
print(classification_report(y_test, knn_predict))

Yes, we have successfully created a KNN model with good accuracy, precision and F1 score.

Question is, Can we improve ?

Answer is, yea we can...

# Fourth Step : Improving the Model / Tuning the Model

Here, we will only tune the Model based on K neighbour numbers..

What if we change the value of the K neighbours ?

To answer this, we will write the code below to measure how the error rate changes for K neighbour values from 1 to 19.

We will append each error value to a list, so that we end up with one error rate per value of K.

In [None]:
# For every candidate K from 1 to 19, fit a classifier and record the share
# of test rows it misclassifies.
error_rate = []

for k in range(1, 20):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    misclassified = knn.predict(X_test) != y_test
    error_rate.append(np.mean(misclassified))

Now, we will plot the data Error rate to identify the best suitable values of the K neighbour to have best results.

In [None]:
import matplotlib.pyplot as plt

# Plot the recorded error rate against each candidate K (1 through 19),
# using explicit Figure/Axes objects.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(1, 20), error_rate, color='green', linestyle='dashed', marker='o',
        markerfacecolor='red', markersize=5)
ax.set_title('Error Rate vs. K Value')
ax.set_xlabel('K')
ax.set_ylabel('Error Rate')

By looking at the above graph, 

We can see that the minimum error occurs at a K neighbour value of 7; after that, the error just oscillates and does not go any lower.

So we will select 7 as our lucky number for the KNN classifier.

# Fifth Step : Recreating a KNN Model

In [None]:
# Refit with the K picked from the error-rate plot and report the same
# metrics as for the baseline model.
knn = KNeighborsClassifier(n_neighbors=7)
model = knn.fit(X_train, y_train)  # fit() returns the estimator itself
knn_predict = model.predict(X_test)

knn_acc_score = accuracy_score(y_test, knn_predict)
knn_conf_matrix = confusion_matrix(y_test, knn_predict)

print("confusion matrix")
# The matrix is followed by the same blank-line spacer as a separate print("\n").
print(knn_conf_matrix, end="\n\n\n")
accuracy_pct = knn_acc_score*100
print("Accuracy of k-NN Classification:", accuracy_pct, '\n')
print(classification_report(y_test, knn_predict))

So final Conclusion is,

Before tuning: the Model Accuracy of k-NN Classification: 94.05405405405406

After tuning: the Model Accuracy of k-NN Classification: 95.4054054054054

# So we gain about 1.35 percentage points of accuracy just by tuning one parameter.

Thank You for reaching this far.

If you require any further information please give it in Comment.
