# Hello Guys. I have performed Logistic Regression & K-Nearest Neighbors (KNN) classification on online ad purchase data
* First I loaded the Dataset
* then I checked for any missing values and cleaned the data.
* Changed some of the datatypes to make it more readable for the regression
* Performed some graphical plotting to see the relations between variables

# Performed Logistic Regression
* Scaled the Data
* Performed Evaluation Report on the Model

# K Nearest Classifier
* Elbow Method for K Nearest Neighbor Model
* Evaluation Report on the Model

# Loading the Dataset

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

In [None]:
# Load the Social Network Ads CSV into a DataFrame.
df = pd.read_csv('../input/logistic-regression/Social_Network_Ads.csv')

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Summarise column dtypes and non-null counts.
df.info()

# Checking for Null Values and Changing Data Types to Clean the Data

In [None]:
# Bar chart of per-column completeness, used to spot missing values.
msno.bar(df)

In [None]:
# One-hot encode Gender; drop_first=True drops the first category so only
# one indicator column remains (used below under the name 'Male').
Gender = pd.get_dummies(data=df['Gender'], drop_first=True)

# Append the indicator column next to the original columns.
df = pd.concat(objs=[df, Gender], axis='columns')

In [None]:
# Confirm the new indicator column was appended.
df.head()

In [None]:
# Convert the 'Male' indicator to float so it is a plain numeric column for modelling.
df['Male'] = df['Male'].astype(float)

In [None]:
# Re-check the dtypes after the conversion.
df.info()

# Graphical Plots to analyze relation of variables in Dataset

In [None]:
# Number of users per gender.
plt.figure(figsize=(10, 6))
sns.countplot(
    data=df,
    x='Gender',
    palette='magma',
    alpha=0.4,
)
# left=True removes the left spine in addition to the default top/right ones.
sns.despine(left=True)

In [None]:
# Distribution of estimated salary for each gender.
plt.figure(figsize=(10, 6))
sns.boxplot(
    data=df,
    x='Gender',
    y='EstimatedSalary',
    palette='plasma',
)
sns.despine(left=True)

In [None]:
# Age against estimated salary, coloured by whether the user purchased.
plt.figure(figsize=(14, 10))
sns.scatterplot(
    data=df,
    x='EstimatedSalary',
    y='Age',
    hue=df['Purchased'],
    palette='GnBu',
    s=100,
    alpha=0.9,
)
sns.despine(left=True)

We observe that people aged 30 and above with a salary of 80,000 and above tend to purchase more from online advertisements.

In [None]:
# Mean of every numeric column per Purchased class.
# numeric_only=True skips the string 'Gender' column; without it, mean()
# raises a TypeError on pandas >= 2.0. as_index=False keeps 'Purchased' as a
# regular column so it can be used directly as the x variable below.
df_purc = df.groupby('Purchased', as_index=False).mean(numeric_only=True)

# One panel per feature, side by side.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(14, 6))
age_ax = sns.barplot(x='Purchased', y='Age', data=df_purc, palette='GnBu', ax=ax[0])
salary_ax = sns.barplot(x='Purchased', y='EstimatedSalary', data=df_purc, palette='magma', ax=ax[1])
# The mean of the 0/1 'Male' indicator is the share of male users in each class.
male_ax = sns.barplot(x='Purchased', y='Male', data=df_purc, palette='GnBu', ax=ax[2])
sns.despine(left=True)

age_ax.set_title('Purchased or Not: Average Age')
age_ax.set_ylabel('Age')

salary_ax.set_title('Purchased or Not: Average Estimated Salary')
salary_ax.set_ylabel('Estimated Salary')

male_ax.set_title('Purchased or Not: Male = 1, Female = 0')
male_ax.set_ylabel('Gender')


# Scaling the Data for Regression

In [None]:
# Target: the Purchased label.
y = df['Purchased']
# Features: everything except the identifier, the raw string Gender column
# and the target itself.
X = df.drop(columns=['User ID', 'Gender', 'Purchased'])

In [None]:
# Preview the feature matrix.
X.head()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column (StandardScaler defaults: zero mean,
# unit variance).
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split further down, so test-set statistics leak into the
# scaling. Consider fitting on the training rows only.
scaler = StandardScaler().fit(X)
scaled_data = scaler.transform(X)

In [None]:
# Wrap the scaled values in a DataFrame carrying X's column names.
scaled_data = pd.DataFrame(scaled_data, columns= X.columns)

In [None]:
# Preview the standardised features.
scaled_data.head()

# Logistic Regression

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Score each feature of the unscaled X against the target with the
# chi-squared statistic. Only the per-feature scores are printed here.
test = SelectKBest(chi2, k=2)
fit = test.fit(X, y)
print(fit.scores_)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. random_state pins the shuffle so the
# reported metrics and the elbow plot are reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_data, y, test_size=0.3, random_state=42
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default settings on the training split.
logreg = LogisticRegression()
logreg.fit(X=X_train, y=y_train)

In [None]:
# Predict the Purchased label for the held-out test rows.
prediction = logreg.predict(X_test)

# Evaluation

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the logistic regression model.
print(classification_report(y_true=y_test, y_pred=prediction))

# K Nearest Neighbor
* We will now classify the same data with KNN Classifier
* First we will perform Elbow Method to find out the right value of K
* then we will run KNN Analysis on the Dataset
* Finally we will evaluate the model

# Elbow Method

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Test-set misclassification rate for each candidate K from 1 to 15.
error_rate = []
for i in range(1, 16):
    knn = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    prediction_k = knn.predict(X_test)
    # Fraction of test rows whose predicted label differs from the true one.
    error_rate.append((prediction_k != y_test).mean())

In [None]:
# Tabulate the error rates against their K values (1..15) for plotting.
error_rate = pd.DataFrame(
    {
        'K Value': range(1, 16),
        'Error Rate': error_rate,
    }
)

In [None]:
# Apply the white theme before the figure is created so it covers the whole
# plot. sns.set() returns None, so its result is not kept.
sns.set(style="white")
plt.figure(figsize=(12, 7))

# Green trend line with a red marker on each K value, drawn on the same axes.
g = sns.lineplot(x='K Value', y='Error Rate', data=error_rate, color='green')
sns.scatterplot(x='K Value', y='Error Rate', data=error_rate, color='red', s=100)

g.set_title('Error Rate Analysis')
g.set_ylabel('Error Rate')
g.set_xlabel('K Value')
# despine() also returns None; call it for its effect only so `g` keeps
# pointing at the Axes.
sns.despine(left=True)

* We will choose the K value to be 5 because after that there is no significant drop in the error rate, so this seems optimal

# K Nearest Neighbor Classification

* We have already scaled the data for Logistic regression
* We have already separated the train and test data before
* We will now directly proceed by running the model on the same data set


In [None]:
# Refit KNN with the K chosen from the elbow plot and predict the test rows.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
prediction_5 = knn.predict(X_test)

# Evaluation

In [None]:
from sklearn.metrics import classification_report

# Evaluate the K=5 model. prediction_k holds the output of the last
# elbow-loop iteration (K=15), so the report must use prediction_5.
print(classification_report(y_test, prediction_5))