# Classifier
*Sreehari P S*
***

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

### Fetch processed dataframe from pickle file
***

In [2]:
file_name = 'social-media-adds.pkl'
pickle_file = open('./pickles/'+ file_name, 'rb')
data = pickle.load(pickle_file)
pickle_file.close()
data.head()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased,Gender_Female,Gender_Male
0,15624510,19,19000,0,0,1
1,15810944,35,20000,0,0,1
2,15668575,26,43000,0,1,0
3,15603246,27,57000,0,1,0
4,15804002,19,76000,0,0,1


### Extracting features

In [3]:
data.columns

Index(['User ID', 'Age', 'EstimatedSalary', 'Purchased', 'Gender_Female',
       'Gender_Male'],
      dtype='object')

In [4]:
X = data.drop(columns=['Purchased', 'User ID'], axis = 1)
y = data['Purchased']

In [5]:
# Standard Scaler
from sklearn.preprocessing import StandardScaler
def standard_scale_data(dataframe):
    return StandardScaler().fit_transform(data)

### Split data sets

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

### Check Model Accuracy

In [7]:
# Function to check model performances
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import classification_report, confusion_matrix
def check_model_metrices(y_test, y_pred):
    print(classification_report(y_test, y_pred))
    print('Confusion Matrix = \n', confusion_matrix(y_test, y_pred))

### Build Models
***

#### 1. Multi Logistic Rregression

In [8]:
from sklearn.linear_model import LogisticRegression
rf = LogisticRegression(multi_class='multinomial', solver='lbfgs').fit(X_train, y_train)
rf_pred = rf.predict(X_test)
check_model_metrices(y_test, rf_pred)

              precision    recall  f1-score   support

           0       0.65      1.00      0.79        52
           1       0.00      0.00      0.00        28

    accuracy                           0.65        80
   macro avg       0.33      0.50      0.39        80
weighted avg       0.42      0.65      0.51        80

Confusion Matrix = 
 [[52  0]
 [28  0]]


#### 2. Linear SVM
***

In [None]:
from sklearn.svm import SVC
linear = SVC(kernel='linear', decision_function_shape='ovo') # decision_function_shape is set to One-vs_One for multi-class
linear.fit(X_train, y_train)
linear_pred = linear.predict(X_test)
check_model_metrices(y_test, linear_pred)

#### 5. kNN Classifier
***

In [None]:
from sklearn.neighbors import KNeighborsClassifier
def generate_kNN_model(x_train, y_train, x_test, k):
    knn_model = KNeighborsClassifier(n_neighbors=k, metric='minkowski')
    knn_model.fit(x_train, y_train)
    return knn_model.predict(x_test)

# Optimizing 'k' or 'n-neighbers' value
def find_optimal_k(x_train, y_train, x_test):
    accur_dict = dict()
    for k in np.arange(3,16):
        y_pred = generate_kNN_model(x_train, y_train, x_test, k)
        accur_dict[k] = accuracy_score(y_test, y_pred)
    #Plot
    plt.plot(list(accur_dict.keys()),list(accur_dict.values()), marker ='o')
    plt.title('k-Values vs Accuracy')
    plt.show()
    optimal_k = max(accur_dict, key = lambda x: accur_dict[x])
    print('Best k value = ', optimal_k)
    return optimal_k
k_optimal = find_optimal_k(X_train, y_train, X_test)
knn_y_pred = generate_kNN_model(X_train, y_train, X_test, k_optimal)
print(f'kNN classifier with k = {k_optimal} has :: \n')
check_model_metrices(y_test, knn_y_pred)

#### 6. Decision Trees
***

In [None]:
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)
check_model_metrices(y_test, linear_pred)

#### 7. Random Forest
***

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
check_model_metrices(y_test, linear_pred)