In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
df=pd.read_csv("C:\\Users\\suhan\\Downloads\\ad_click_dataset.csv")

In [5]:
df.head()

Unnamed: 0,id,full_name,age,gender,device_type,ad_position,browsing_history,time_of_day,click
0,670,User670,22.0,,Desktop,Top,Shopping,Afternoon,1
1,3044,User3044,,Male,Desktop,Top,,,1
2,5912,User5912,41.0,Non-Binary,,Side,Education,Night,1
3,5418,User5418,34.0,Male,,,Entertainment,Evening,1
4,9452,User9452,39.0,Non-Binary,,,Social Media,Morning,0


In [7]:
df.isnull().sum()  

id                     0
full_name              0
age                 4766
gender              4693
device_type         2000
ad_position         2000
browsing_history    4782
time_of_day         2000
click                  0
dtype: int64

In [9]:
df.dtypes

id                    int64
full_name            object
age                 float64
gender               object
device_type          object
ad_position          object
browsing_history     object
time_of_day          object
click                 int64
dtype: object

In [11]:
# age is numerical therefore use mean or median
df['age']=df['age'].fillna(df['age'].median())

In [13]:
# others categorical therefore use mode
for col in ['gender', 'device_type', 'ad_position','browsing_history', 'time_of_day']:
    df[col]=df[col].fillna(df[col].mode()[0])

In [15]:
df.isnull().sum()

id                  0
full_name           0
age                 0
gender              0
device_type         0
ad_position         0
browsing_history    0
time_of_day         0
click               0
dtype: int64

In [17]:
df['gender'].value_counts()

gender
Female        6527
Male          1810
Non-Binary    1663
Name: count, dtype: int64

In [19]:
df['device_type'].value_counts()

device_type
Desktop    4754
Mobile     2649
Tablet     2597
Name: count, dtype: int64

In [21]:
df['ad_position'].value_counts()

ad_position
Bottom    4817
Top       2597
Side      2586
Name: count, dtype: int64

In [23]:
df['browsing_history'].value_counts()

browsing_history
Entertainment    5957
Social Media     1054
Education        1029
Shopping          984
News              976
Name: count, dtype: int64

In [25]:
df['time_of_day'].value_counts()

time_of_day
Morning      4126
Afternoon    2016
Evening      1958
Night        1900
Name: count, dtype: int64

In [27]:
from sklearn.preprocessing import LabelEncoder

In [29]:
le=LabelEncoder()

In [31]:
cols_to_encode=['gender','device_type', 'ad_position','browsing_history', 'time_of_day']

In [33]:
for col in cols_to_encode:
    df[col]=le.fit_transform(df[col])

In [35]:
df.head()

Unnamed: 0,id,full_name,age,gender,device_type,ad_position,browsing_history,time_of_day,click
0,670,User670,22.0,0,0,2,3,0,1
1,3044,User3044,39.5,1,0,2,1,2,1
2,5912,User5912,41.0,2,0,1,0,3,1
3,5418,User5418,34.0,1,0,0,1,1,1
4,9452,User9452,39.0,2,0,0,4,2,0


In [37]:
# name and id not relevant 
df = df.drop(['id', 'full_name'], axis=1)

In [39]:
X=df.drop('click',axis=1)

In [41]:
y=df['click']

In [43]:
from sklearn.model_selection import train_test_split

In [45]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [47]:
from sklearn.preprocessing import StandardScaler

In [49]:
scaler=StandardScaler()

In [51]:
X_train_scaled=scaler.fit_transform(X_train)

In [53]:
X_test_scaled=scaler.fit_transform(X_test)

In [57]:
from sklearn.neighbors import KNeighborsClassifier

In [59]:
knn = KNeighborsClassifier(n_neighbors=5)

In [61]:
knn.fit(X_train_scaled, y_train)

In [63]:
y_pred = knn.predict(X_test_scaled)

In [65]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [67]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.667
[[ 257  448]
 [ 218 1077]]
              precision    recall  f1-score   support

           0       0.54      0.36      0.44       705
           1       0.71      0.83      0.76      1295

    accuracy                           0.67      2000
   macro avg       0.62      0.60      0.60      2000
weighted avg       0.65      0.67      0.65      2000



In [71]:
from sklearn.model_selection import GridSearchCV

params = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

grid = GridSearchCV(KNeighborsClassifier(), param_grid=params, cv=5, scoring='accuracy')
grid.fit(X_train_scaled, y_train)

print("Best params:", grid.best_params_)
print("Best accuracy:", grid.best_score_)


Best params: {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'distance'}
Best accuracy: 0.6841250000000001


In [73]:
from sklearn.metrics import roc_auc_score

y_proba = grid.best_estimator_.predict_proba(X_test_scaled)[:, 1]
print("ROC AUC Score:", roc_auc_score(y_test, y_proba))


ROC AUC Score: 0.6913245160053672


In [77]:
with open('knn.pkl', 'wb') as f:
    pickle.dump(knn, f)

In [79]:
with open('knn.pkl', 'rb') as file: 
    loaded_data = pickle.load(file)