In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
from sklearn.model_selection import validation_curve

In [2]:
df=pd.read_csv('diabetes_2_features.csv')
df.head(2)

Unnamed: 0,Glucose,BloodPressure,Outcome
0,131,109,diabetic
1,114,52,non-diabetic


In [5]:
# features and target sets
X, y=df.iloc[:,:2], df['Outcome']
display(X.head(2))
display(y.head(2))

Unnamed: 0,Glucose,BloodPressure
0,131,109
1,114,52


0        diabetic
1    non-diabetic
Name: Outcome, dtype: object

In [7]:
# std is relatively high, that shows high variability in my only feature (balance). SVM is sensiive to this, 
### so we transorm our features set
X.describe()

Unnamed: 0,Glucose,BloodPressure
count,54.0,54.0
mean,117.277778,90.092593
std,36.758989,23.448272
min,56.0,50.0
25%,84.5,71.5
50%,113.0,89.5
75%,154.25,108.75
max,177.0,130.0


# SVM

In [9]:
# define and fit (and transform) the scaler
scaler = MinMaxScaler()
X_trns = scaler.fit_transform(X)
X_trns

array([[0.61983471, 0.7375    ],
       [0.47933884, 0.025     ],
       [0.42975207, 0.        ],
       [0.23140496, 0.3125    ],
       [0.10743802, 0.15      ],
       [0.10743802, 0.075     ],
       [0.39669421, 0.2625    ],
       [0.85950413, 0.875     ],
       [0.46280992, 0.3125    ],
       [0.70247934, 0.85      ],
       [0.82644628, 0.15      ],
       [0.91735537, 0.6625    ],
       [0.83471074, 0.525     ],
       [0.        , 0.4375    ],
       [0.97520661, 0.0125    ],
       [0.54545455, 0.35      ],
       [0.46280992, 0.7125    ],
       [0.04132231, 0.3375    ],
       [0.61157025, 0.925     ],
       [0.81818182, 0.3875    ],
       [1.        , 0.65      ],
       [0.95867769, 0.8       ],
       [0.9338843 , 0.55      ],
       [0.99173554, 0.575     ],
       [0.0661157 , 0.7375    ],
       [0.79338843, 0.975     ],
       [0.21487603, 0.4       ],
       [0.26446281, 0.4375    ],
       [0.7107438 , 0.6125    ],
       [0.03305785, 0.8       ],
       [0.

AttributeError: 'numpy.ndarray' object has no attribute 'table'

In [None]:
# Splitting to train and test (Default 75/25)
x_train, x_test, y_train, y_test = train_test_split(X_trns, y_default, test_size=.2, random_state=0)

In [10]:
# define a LinearSVC(random_state=0)
svm1 = LinearSVC(random_state=0)
svm1.fit(X, y)



LinearSVC(random_state=0)

In [11]:
# Evaluate performance of the model
print('Linear SVC acc on train data: {:.2%}'.format(svm1.score(X, y)))
# print('Linear SVC on test data: {:.2%}'.format(svm1.score(x_test, y_test)))

Linear SVC acc on train data: 42.59%


In [15]:
# Make predictions
b = [[800],[100]]
# svm1.predict(b)

In [16]:
b_trns=scaler.transform(b)
b_trns



ValueError: X has 1 features, but MinMaxScaler is expecting 2 features as input.

In [17]:
# the correct prediction
svm1.predict(b_trns)

NameError: name 'b_trns' is not defined

In [18]:
# get the linear equation for the decision boundry
# intercept
svm1.intercept_.round(2)

array([0.42])

In [19]:
# Coef
svm1.coef_.round(2)

array([[-0.01, -0.  ]])

In [20]:
LinearSVC()

LinearSVC()

In [34]:
# validation_curve has two outputs:
## 1- scores on train 2- scores on tests
### the order is important
train_scores, test_scores = validation_curve(LinearSVC(random_state=0), 
                                             X_trns, y, 
                                             param_name = 'C', 
                                             param_range=[10, 5, 1, 0.1, 0.01, 0.001], 
                                             cv = 5)

In [35]:
train_scores.round(4)

array([[0.8837, 0.8605, 0.907 , 0.9767, 0.9091],
       [0.8837, 0.8605, 0.907 , 0.9767, 0.9091],
       [0.8605, 0.8837, 0.8837, 0.9767, 0.9091],
       [0.8605, 0.8605, 0.9302, 0.9767, 0.9091],
       [0.6744, 0.8372, 0.7442, 0.907 , 0.8864],
       [0.5814, 0.5814, 0.5814, 0.8837, 0.9091]])

In [36]:
test_scores.round(4)

array([[1.    , 0.9091, 1.    , 0.6364, 0.8   ],
       [1.    , 0.8182, 1.    , 0.6364, 0.8   ],
       [1.    , 0.8182, 1.    , 0.6364, 0.8   ],
       [1.    , 0.7273, 1.    , 0.6364, 0.8   ],
       [0.6364, 0.8182, 0.9091, 0.6364, 0.7   ],
       [0.5455, 0.5455, 0.5455, 0.6364, 0.7   ]])

In [37]:
# axis = 1 calculates mean for each row
print('ave cross val scores on train:', train_scores.mean(axis = 1).round(4))

ave cross val scores on train: [0.9074 0.9074 0.9027 0.9074 0.8098 0.7074]


In [38]:
# C=10 has the highest test score
print('ave cross val scores on test:', test_scores.mean(axis = 1).round(4))

ave cross val scores on test: [0.8691 0.8509 0.8509 0.8327 0.74   0.5945]


In [43]:
print('average score in 5 folds')
print('training:', train_scores.mean(axis = 1).round(3))
print('test:', test_scores.mean(axis = 1).round(3))

average score in 5 folds
training: [0.907 0.907 0.903 0.907 0.81  0.707]
test: [0.869 0.851 0.851 0.833 0.74  0.595]


In [40]:
# returns mean of all levels of C
train_scores.mean()

0.8570295983086683

In [41]:
test_scores.mean()

0.7896969696969697