# Raisin -  SVM Classification
## Raisin Dataset; 2 Class: Kecimen and Besni Raisin

https://www.kaggle.com/datasets/muratkokludataset/raisin-dataset

### Import Libraries

In [129]:
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
from statsmodels.stats.outliers_influence import variance_inflation_factor


from sklearn.model_selection import train_test_split

#import dtale

### Read Data

In [130]:
raisin = pd.read_excel("Raisin_Dataset.xlsx")
raisin = pd.get_dummies(data=raisin, columns=['Class'],drop_first=True)

profile = ProfileReport(raisin, title = 'Raisin',html = {'style':{'full_width' : True}})
profile.to_widgets()

### Define Features and Target

In [131]:
target = ['Class_Kecimen']
features = [x for x in raisin.columns if x not in target]

### Normalize Data

In [132]:
for i in features:    
    raisin[i] = (raisin[i] - raisin[i].mean()) / raisin[i].std()

### Specific check correlation with target

In [133]:
raisin.corrwith(raisin['Class_Kecimen'])*100

Area               -62.571543
MajorAxisLength    -67.319368
MinorAxisLength    -50.310200
Eccentricity       -43.850000
ConvexArea         -62.556662
Extent              15.468850
Perimeter          -66.598058
Class_Kecimen      100.000000
dtype: float64

### Check for Multicollinearity

In [134]:
vif_data = pd.DataFrame()
vif_data["feature"] = raisin.columns

vif_data["VIF"] = [variance_inflation_factor(raisin.values, i) for i in range(len(raisin.columns))]
  
print(vif_data)

           feature         VIF
0             Area  410.834063
1  MajorAxisLength  129.326090
2  MinorAxisLength   40.166949
3     Eccentricity    5.228764
4       ConvexArea  456.639340
5           Extent    1.605388
6        Perimeter  187.216827
7    Class_Kecimen    1.344321


### Split train/test

In [135]:
X_train, X_test, y_train, y_test = train_test_split(raisin[features], raisin[target], test_size=0.33, random_state=42)

In [136]:
from sklearn import svm

In [137]:
clf = svm.SVC()
clf.fit(X_train, y_train.values.ravel())

In [138]:
y_pred = clf.predict(X_test)

In [139]:
from sklearn.metrics import *
print('SVM with all features')
print('accuracy_score:  ', accuracy_score(y_pred=y_pred,y_true=y_test.values.ravel()))
print('precision_score: ', precision_score(y_pred=y_pred,y_true=y_test.values.ravel()))
print('recall:          ', recall_score(y_pred=y_pred,y_true=y_test.values.ravel()))
print('F1:              ', f1_score(y_pred=y_pred,y_true=y_test.values.ravel()))
print('AUC Score :      ', roc_auc_score(y_score=y_pred,y_true=y_test.values.ravel()))

SVM with all features
accuracy_score:   0.8619528619528619
precision_score:  0.845679012345679
recall:           0.8954248366013072
F1:               0.8698412698412699
AUC Score :       0.8609068627450979


### Take off Multicollinearity

In [140]:
features_col = [x for x in raisin.columns if x not in ['Class_Kecimen','Area','MajorAxisLength','MinorAxisLength','Eccentricity','ConvexArea','Extent']]

vif_data = pd.DataFrame()
vif_data["feature"] = features_col

vif_data["VIF"] = [variance_inflation_factor(raisin.values, i) for i in range(len(features_col))]
  
print(vif_data)

     feature         VIF
0  Perimeter  410.834063


In [141]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(raisin[features_col], raisin[target], test_size=0.33, random_state=42)
clf2 = svm.SVC(kernel='rbf', gamma=0.7)
clf2.fit(X_train2, y_train2.values.ravel())

In [142]:
y_pred2 = clf2.predict(X_test2)

In [143]:
from sklearn.metrics import *
print('SVM without Multicollinearity')
print('accuracy_score:  ', accuracy_score(y_pred=y_pred2,y_true=y_test2.values.ravel()))
print('precision_score: ', precision_score(y_pred=y_pred2,y_true=y_test2.values.ravel()))
print('recall:          ', recall_score(y_pred=y_pred2,y_true=y_test2.values.ravel()))
print('F1:              ', f1_score(y_pred=y_pred2,y_true=y_test2.values.ravel()))
print('AUC Score :      ', roc_auc_score(y_score=y_pred2,y_true=y_test2.values.ravel()))

SVM without Multicollinearity
accuracy_score:   0.8653198653198653
precision_score:  0.8553459119496856
recall:           0.8888888888888888
F1:               0.8717948717948718
AUC Score :       0.8645833333333333
