# Classification

## problem definition
- predict if a customer will purchase the product

### import required packages

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### load the data

In [14]:
df = pd.read_csv('Data.csv')
df.head(3)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44,72000,No
1,Spain,27,48000,Yes
2,Germany,30,54000,No


### EDA

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Country    10 non-null     object
 1   Age        10 non-null     int64 
 2   Salary     10 non-null     int64 
 3   Purchased  10 non-null     object
dtypes: int64(2), object(2)
memory usage: 452.0+ bytes


In [16]:
df.isna().sum()

Country      0
Age          0
Salary       0
Purchased    0
dtype: int64

In [17]:
df['Purchased'].value_counts()

Purchased
No     5
Yes    5
Name: count, dtype: int64

### pre-processing

In [18]:
from sklearn.preprocessing import LabelEncoder

# create encoder
encoder = LabelEncoder()

# encode features
df['Country'] = LabelEncoder().fit_transform(df['Country'])
df['Purchased'] = LabelEncoder().fit_transform(df['Purchased'])

In [19]:
df.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,0,44,72000,0
1,2,27,48000,1
2,1,30,54000,0
3,2,38,61000,0
4,1,40,50000,1


In [20]:
df.cov()

Unnamed: 0,Country,Age,Salary,Purchased
Country,0.766667,-0.277778,-5844.444,-0.166667
Age,-0.277778,82.277778,54777.78,-1.722222
Salary,-5844.444444,54777.777778,152711100.0,-1111.111111
Purchased,-0.166667,-1.722222,-1111.111,0.277778


In [21]:
df.corr()

Unnamed: 0,Country,Age,Salary,Purchased
Country,1.0,-0.034975,-0.540138,-0.361158
Age,-0.034975,1.0,0.488684,-0.360246
Salary,-0.540138,0.488684,1.0,-0.170598
Purchased,-0.361158,-0.360246,-0.170598,1.0


In [22]:
# split the data into x and y
x = df.drop(['Purchased', 'Salary'], axis=1)
y = df['Purchased']

### split the data into train and test sets

In [23]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, random_state=123456)

### train the model

In [51]:
from sklearn.svm import SVC

# create a model
model = SVC(C=0.1, gamma=0.1, kernel='poly')

# train the model
model.fit(x_train, y_train)

### find the optimal values of hyper-parameters

In [41]:
from sklearn.model_selection import GridSearchCV, KFold

# create cross validator
k_fold = KFold(n_splits=5)


# create an object 
parameters = {
    "C": np.arange(10) * 0.1,
    "kernel": ['linear', 'poly', 'rbf', 'sigmoid'],
    "gamma": np.arange(10) * 0.1
}
grid_search_cv = GridSearchCV(estimator=model, param_grid=parameters, cv=k_fold)
grid_search_cv.fit(x_train, y_train)

200 fits failed out of a total of 2000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
200 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/homebrew/lib/python3.11/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/opt/homebrew/lib/python3.11/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/opt/homebrew/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParam

In [43]:
grid_search_cv.best_score_

0.5

In [44]:
grid_search_cv.best_params_

{'C': 0.1, 'gamma': 0.1, 'kernel': 'poly'}

### model evaluation

In [52]:
y_pred = model.predict(x_test)
y_true = y_test

In [54]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
roc_auc_score = roc_auc_score(y_true, y_pred)

print(confusion_matrix(y_true, y_pred))
print(f"accuracy = {accuracy}")
print(f"precision = {precision}")
print(f"recall = {recall}")
print(f"f1 = {f1}")
print(f"roc_auc_score = {roc_auc_score}")

[[2 0]
 [1 0]]
accuracy = 0.6666666666666666
precision = 0.0
recall = 0.0
f1 = 0.0
roc_auc_score = 0.5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
