# Classification

## problem definition
- predict a person's hearing test result (`test_result`, 0 or 1) from their `age` and `physical_score`

### import required packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### load the data

In [2]:
# Read the hearing-test CSV into a DataFrame and preview the first three rows.
df = pd.read_csv(filepath_or_buffer='hearing_test.csv')
df.head(n=3)

Unnamed: 0,age,physical_score,test_result
0,33.0,40.7,1
1,50.0,37.2,1
2,52.0,24.7,0


### EDA

In [3]:
# Print the column names, dtypes, non-null counts and memory usage of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             5000 non-null   float64
 1   physical_score  5000 non-null   float64
 2   test_result     5000 non-null   int64  
dtypes: float64(2), int64(1)
memory usage: 117.3 KB


In [4]:
# Number of missing values per column (`isnull` is the pandas alias of `isna`).
df.isnull().sum()

age               0
physical_score    0
test_result       0
dtype: int64

### pre-processing

In [6]:
# Show the first five rows before choosing features and target.
df.head(n=5)

Unnamed: 0,age,physical_score,test_result
0,33.0,40.7,1
1,50.0,37.2,1
2,52.0,24.7,0
3,56.0,31.0,0
4,35.0,42.9,1


In [7]:
# Pairwise covariance between the columns of the frame.
df.cov()

Unnamed: 0,age,physical_score,test_result
age,127.396398,-72.123723,-3.777956
physical_score,-72.123723,66.74566,3.173059
test_result,-3.777956,3.173059,0.240048


In [8]:
# Pairwise Pearson correlation between the columns (Pearson is the pandas default).
df.corr(method='pearson')

Unnamed: 0,age,physical_score,test_result
age,1.0,-0.782146,-0.683171
physical_score,-0.782146,1.0,0.792716
test_result,-0.683171,0.792716,1.0


In [9]:
# Separate the label column (y) from the predictor columns (x).
y = df['test_result']
x = df.drop(columns=['test_result'])

### split the data into train and test sets

In [10]:
from sklearn.model_selection import train_test_split

# Keep 70% of the rows for training and hold out the rest for testing;
# the fixed random_state makes the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.7,
    random_state=123456,
)

### train the model

In [14]:
from sklearn.svm import SVC

# Support-vector classifier with the library's default hyper-parameters.
model = SVC()

# Fit on the training split; the fitted estimator is displayed as the cell output.
model.fit(X=x_train, y=y_train)

### find the optimal values of hyper-parameters

In [None]:
from sklearn.model_selection import GridSearchCV, KFold

# 5-fold cross-validation, applied to the training split only.
k_fold = KFold(n_splits=5)


# Candidate hyper-parameter values to search over.
# Both numeric grids start at 0.1 instead of 0.0:
#   * SVC requires C to be strictly positive, so C=0 makes every fit for that
#     candidate fail and be scored as NaN.
#   * gamma=0 removes the kernel's dependence on the inputs, so those
#     candidates cannot learn anything useful.
# NOTE(review): gamma is ignored by the 'linear' kernel, so the linear
# candidates are repeated once per gamma value; a list of grids would avoid that.
parameters = {
    "C": np.arange(1, 10) * 0.1,
    "kernel": ['linear', 'poly', 'rbf', 'sigmoid'],
    "gamma": np.arange(1, 10) * 0.1
}

# GridSearchCV clones `model` for every candidate, so the already-fitted
# instance is not modified. n_jobs=-1 runs the fits on all available cores;
# it does not change the scores.
grid_search_cv = GridSearchCV(estimator=model, param_grid=parameters, cv=k_fold, n_jobs=-1)
grid_search_cv.fit(x_train, y_train)

In [43]:
# Mean cross-validated score of the best hyper-parameter combination found.
grid_search_cv.best_score_

0.5

In [44]:
# Hyper-parameter combination that achieved the best cross-validated score.
grid_search_cv.best_params_

{'C': 0.1, 'gamma': 0.1, 'kernel': 'poly'}

### model evaluation

In [15]:
# Ground-truth labels of the held-out rows.
y_true = y_test

# Predicted labels for the held-out rows.
# NOTE(review): this uses `model` (the SVC fitted with default settings),
# not the best estimator found by the grid search.
y_pred = model.predict(X=x_test)

In [16]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Standard binary-classification metrics on the held-out predictions.
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

# Stored under `roc_auc`, not `roc_auc_score`: assigning the result to the
# imported function's own name would rebind it to a float, and any later call
# to roc_auc_score(...) would raise "TypeError: ... object is not callable".
# NOTE(review): the score is computed from hard 0/1 predictions rather than
# continuous decision scores, so it reflects a single operating point.
roc_auc = roc_auc_score(y_true, y_pred)

# Rows of the confusion matrix are true classes, columns are predicted classes.
print(confusion_matrix(y_true, y_pred))
print(f"accuracy = {accuracy}")
print(f"precision = {precision}")
print(f"recall = {recall}")
print(f"f1 = {f1}")
print(f"roc_auc_score = {roc_auc}")

[[501  84]
 [ 30 885]]
accuracy = 0.924
precision = 0.913312693498452
recall = 0.9672131147540983
f1 = 0.9394904458598726
roc_auc_score = 0.9118116855821774
