# Import Libraries and Load the Dataset

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# note: the libraries used for classification are imported later, where they are needed.

# print the full path of every file found under the Kaggle input folder
import os
input_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# load the advertising dataset and preview its first rows
DATA_PATH = '/kaggle/input/advertising/advertising.csv'
ad = pd.read_csv(DATA_PATH)
ad.head()

In [None]:
# column dtypes and non-null counts; memory_usage='deep' asks pandas to measure
# the actual memory held by object (string) columns instead of estimating it
ad.info(memory_usage='deep')

In [None]:
# parse the 'Timestamp' column into proper datetime values
ad['Timestamp'] = pd.to_datetime(ad['Timestamp'])

# missing-value count per column (all zeros means nothing is missing)
ad.isnull().sum()

In [None]:
# summary statistics of the dataset
ad.describe()

## Main ideas:
* explore the dataset
* preprocess the data
* create a **logistic regression model**, use it for predictions and evaluate its performance
* scale the dataset, build a **KNN model** and evaluate its performance. Find the best k-value based on the error rate, then retrain the model with that value and evaluate it again
* compare the performance of both models

# Exploratory Data Analysis

In [None]:
sns.set_style('whitegrid')

# Correlate the numeric columns only: the frame also holds text columns
# (e.g. 'Ad Topic Line') and the datetime 'Timestamp', and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0 instead of silently skipping it.
numeric_corr = ad.select_dtypes(include='number').corr()

plt.figure(figsize=(10,8))
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm');
plt.title('Dataset Correlation', loc='left', pad=20, fontsize=15);

In [None]:
# distribution of user ages: 30-bin histogram with a KDE curve on top
fig, ax = plt.subplots(figsize=(10, 7))
sns.histplot(ad['Age'], bins=30, kde=True, color='b', ax=ax)
ax.set_title('Age Distribution', loc='left', fontsize=15, pad=20);

In [None]:
plt.figure(figsize=(10, 7));

# turn the 0/1 'Male' flag into readable labels (0 -> 'Female', anything else -> 'Male')
gender = ad['Male'].apply(lambda flag: 'Female' if flag == 0 else 'Male')

# Pass the labels explicitly as `x=`: seaborn >= 0.12 interprets the first
# positional argument as `data`, so a positional Series is no longer counted.
sns.countplot(x=gender, palette='coolwarm_r');
plt.xlabel('Gender');
plt.title('Distribution by Gender', loc='left', fontsize=15, pad=20);

In [None]:
# 20 most frequent words across all ad topic lines
# (the author chose pandas value_counts over collections.Counter for performance)
topic_words = ' '.join(ad['Ad Topic Line']).lower().split()
top_words = pd.Series(topic_words).value_counts().head(20)

fig, ax = plt.subplots(figsize=(18, 9))
top_words.plot(kind='bar', ax=ax)
ax.set_title('Most common words in Ad Topic Line', loc='left', pad=20, fontsize=15)
plt.xticks(rotation=45);

In [None]:
# Weekday name of every record, kept as a standalone Series so that `ad` is not
# modified and the name `y` stays reserved for the model target defined later.
day_of_week = ad['Timestamp'].dt.day_name().rename('Day of Week')

# groupby sorts its keys alphabetically (Friday, Monday, Saturday, ...), which
# would draw the weekly trend across a scrambled week; force calendar order.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']
usage_by_day = (ad.groupby(day_of_week)['Daily Internet Usage']
                  .mean()
                  .reindex(weekday_order))
clicks_by_day = (ad.groupby(day_of_week)['Clicked on Ad']
                   .sum()
                   .reindex(weekday_order))

fig, (ax1, ax2) = plt.subplots(ncols=2, sharex=True, figsize=(15, 8));

usage_by_day.plot(kind='bar', ax=ax1)
clicks_by_day.plot(kind='line', color='g', ax=ax2)
ax1.title.set_text('Average Internet Usage by Day of the Week');
ax2.title.set_text('Clicks per Day of the Week');
# fixed y-range carried over from the original cell to zoom in on the daily totals
ax2.set_ylim([50,85]);

In [None]:
# hexbin joint distribution of age and area income
sns.jointplot(
    data=ad,
    x='Age',
    y='Area Income',
    kind='hex',
    color='blue',
);

In [None]:
# age vs. daily time spent on the website (filled 2-D density)
# `fill=True` alone is enough: `shade` is the deprecated spelling of the same
# option and passing it triggers a deprecation warning / future error in seaborn.
sns.jointplot(kind='kde', x='Age', y='Daily Time Spent on Site', data=ad,
              color='darkcyan', fill=True);

In [None]:
# time on site against overall internet usage, coloured by whether the ad was clicked
sns.jointplot(
    data=ad,
    x='Daily Time Spent on Site',
    y='Daily Internet Usage',
    hue='Clicked on Ad',
    palette='inferno_r',
);

In [None]:
# pairwise relationships between the features, coloured by click outcome
sns.pairplot(data=ad, hue='Clicked on Ad', palette='inferno_r');

# Preprocessing Data

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Fixed seed so the split - and therefore every metric and the K-selection
# curve further down - is identical on each Restart & Run All.
SEED = 42

feature_cols = ['Daily Time Spent on Site', 'Age', 'Area Income',
                'Daily Internet Usage', 'Male']
X = ad[feature_cols]
y = ad['Clicked on Ad']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED
)

# fit the scaler on the training rows only, then apply the same transform to
# the test rows, so no test-set statistics leak into training
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# train a default logistic regression and predict the held-out rows
lr = LogisticRegression().fit(X_train, y_train)
lr_pred = lr.predict(X_test)

# evaluation: overall accuracy, confusion matrix and per-class report
lr_acc = accuracy_score(y_test, lr_pred)
lr_matrix = confusion_matrix(y_test, lr_pred)
lr_report = classification_report(y_test, lr_pred)

print(f'Model Accuracy: {lr_acc}')
print(f'\nConfusion Matrix: \n{lr_matrix}')
print(f'\nClassification Report: \n{lr_report}')

# K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# baseline KNN using a single neighbour
knn = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
knn_pred = knn.predict(X_test)

# evaluation: overall accuracy, confusion matrix and per-class report
knn_acc = accuracy_score(y_test, knn_pred)
knn_matrix = confusion_matrix(y_test, knn_pred)
knn_report = classification_report(y_test, knn_pred)

print(f'Model Accuracy: {knn_acc}')
print(f'\nConfusion Matrix: \n{knn_matrix}')
print(f'\nClassification Report: \n{knn_report}')

## Choosing a K-Value

In [None]:
# Error rate on the held-out rows for every K from 1 to 59.
# NOTE(review): K is chosen on the same test split that is later used to report
# accuracy, so the tuned score is optimistic; cross-validating on the training
# data would give an unbiased choice.
k_values = range(1, 60)
error_rate = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    pred_k = knn.predict(X_test)
    # share of test rows whose predicted label differs from the true one
    error_rate.append(np.mean(pred_k != y_test))

# plotting the results
plt.figure(figsize=(10, 6))
plt.plot(k_values, error_rate, color='darkcyan', linestyle='--',
         marker='o', markersize=10, markerfacecolor='red')
plt.title('Error Rate vs K. Value')
# Call the label setters. Assigning to them (plt.xlabel = 'K') replaces the
# pyplot functions with strings and leaves the axes unlabelled.
plt.xlabel('K')
plt.ylabel('Error Rate')
plt.grid(False)
plt.show()

## Retrain with the New K-Value

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Take K from the error rates computed above instead of a hard-coded number,
# so the choice always matches the curve of the current run.
# error_rate[0] belongs to K=1, hence the +1; ties resolve to the smallest K.
best_k = int(np.argmin(error_rate)) + 1
print(f'Best K: {best_k}')

knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)

knn_acc = accuracy_score(y_test, knn_pred)

print(f'Model Accuracy: {knn_acc}')
print(f'\nConfusion Matrix: \n{confusion_matrix(y_test, knn_pred)}')
print(f'\nClassification Report: \n{classification_report(y_test, knn_pred)}')

# Model Comparison

In [None]:
# one row per model with its test accuracy, best model first
models = pd.DataFrame(
    [('Logistic Regression', lr_acc), ('KNN', knn_acc)],
    columns=['Model', 'Accuracy Score'],
)

models.sort_values(by='Accuracy Score', ascending=False)

### Thank you for your time!
#### I appreciate you reading this far. If you have any comments and/or tips for improvement, **please leave a comment!**
#### Cheers! :)