# SVM with Bank Marketing dataset


# Importing

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, accuracy_score, confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
import seaborn as sns
import warnings
# Suppress every Python warning for the rest of the session (this also hides deprecation/convergence notices).
warnings.filterwarnings('ignore')

In [None]:
# Load the semicolon-separated bank marketing CSV and remove the 'duration' column.
dataAll = pd.read_csv('../input/bank-additional-full.csv', sep=';')
dataAll = dataAll.drop('duration', axis=1)

# Build the working subset: the first 15,000 'no' rows (in file order, not a
# random sample) plus every 'yes' row.
data_n = dataAll[dataAll["y"] == 'no'].iloc[0:15000, :]
data_y = dataAll[dataAll["y"] == "yes"]
data = pd.concat([data_n, data_y])

# Print size and class balance explicitly: a bare expression that is not the
# last line of a cell produces no output, so these checks were silently lost.
print(data.shape)
print(data.y.value_counts())

# Persist the subset (the DataFrame index is written as the first CSV column).
data.to_csv("banking_data.csv")

In [None]:
# Show the column labels, then the per-column dtype / non-null summary.
print(data.columns)
data.info()

In [None]:
# Count missing values per column.
data.isna().sum()

In [None]:
# Summary statistics of the numeric columns, one row per column.
data.describe().transpose()


In [None]:
# Summary of the object-dtype (text) columns, one row per column.
data.describe(include=['object']).transpose()

In [None]:
# Class balance of the target column.
data['y'].value_counts()

In [None]:
# sns.pairplot(data, hue = 'y' )

In [None]:
# plt.figure(figsize=(20,12)) 
# sns.heatmap(data.corr(), annot=True)

Convert the categorical columns to indicator (dummy) variables using `pandas.get_dummies`.

In [None]:
# Columns to expand into indicator variables.
categorical_data = [
    'job', 'marital', 'education', 'contact', 'month',
    'day_of_week', 'default', 'housing', 'loan', 'poutcome',
]
# drop_first=True omits the first level of each column; the remaining
# columns (including the target 'y') pass through unchanged.
data_dummies = pd.get_dummies(
    data,
    columns=categorical_data,
    drop_first=True,
)

In [None]:
# (rows, columns) of the frame after one-hot encoding.
data_dummies.shape

In [None]:

# Encode the target as 1 ('yes') / 0 ('no').
# The labels are read from the untouched `data` frame (same index as
# `data_dummies`) rather than from `data_dummies['y']` itself, so re-running
# this cell is idempotent; mapping the already-encoded column a second time
# would turn every value into NaN.
data_dummies['y'] = data['y'].map({'yes': 1, 'no': 0})
data_dummies.head()

# Train Test Split

In [None]:

# Split the encoded frame into a target vector and a feature matrix (NumPy arrays).
y = data_dummies['y'].values
X = data_dummies.drop(columns='y').values

# Hold out 20% of the rows for testing, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=8008,
    stratify=y,
)

# Report the shape of each piece of the split.
for label, split in (
    ('X train size: ', X_train),
    ('y train size: ', y_train),
    ('X test size: ', X_test),
    ('y test size: ', y_test),
):
    print(label, split.shape)

Before the training starts, standardize the *numeric* variables in the training set:

In [None]:

# Standardize only the first nine columns (the numeric features; the
# remaining columns are 0/1 indicators and are left untouched).
scaler = StandardScaler()
# Learn mean/std from the training split only ...
X_train[:, 0:9] = scaler.fit_transform(X_train[:, 0:9])
# ... and reuse those statistics for the test split. Calling fit_transform
# here would re-fit on the test data, putting train and test on different
# scales and leaking test-set statistics into the evaluation.
X_test[:, 0:9] = scaler.transform(X_test[:, 0:9])

In [None]:
# Inspect the first training row after the scaling step.
X_train[0]

Support Vector Machine
1. Linear
2. Sigmoid
3. Poly
4. rbf


In [None]:
# Baseline SVC with a linear kernel: fit on the training split, predict the test split.
svc_linear = SVC(kernel='linear')
svc_linear.fit(X_train, y_train)
svcpred_linear = svc_linear.predict(X_test)
print(confusion_matrix(y_test, svcpred_linear))
# Scale to percent *before* rounding; round(x, 2)*100 multiplies after
# rounding and can print float noise such as 56.99999999999999.
print('Accuracy: ', round(accuracy_score(y_test, svcpred_linear) * 100, 2))
print('F1 Score: ', round(f1_score(y_test, svcpred_linear) * 100, 2))
# Mean 2-fold cross-validated accuracy, then F1, on the training split.
# SVCCV_linear is reused, so it holds the F1 mean once the cell finishes.
SVCCV_linear = cross_val_score(svc_linear, X_train, y_train, cv=2, n_jobs=1, scoring='accuracy').mean()
print(SVCCV_linear)
SVCCV_linear = cross_val_score(svc_linear, X_train, y_train, cv=2, n_jobs=1, scoring='f1').mean()
print(SVCCV_linear)

In [None]:
# Baseline SVC with a sigmoid kernel: fit on the training split, predict the test split.
svc_sigmoid = SVC(kernel='sigmoid')
svc_sigmoid.fit(X_train, y_train)
svcpred_sigmoid = svc_sigmoid.predict(X_test)

print(confusion_matrix(y_test, svcpred_sigmoid))
# Scale to percent *before* rounding; round(x, 2)*100 multiplies after
# rounding and can print float noise such as 56.99999999999999.
print('Accuracy :', round(accuracy_score(y_test, svcpred_sigmoid) * 100, 2))
print('F1: ', round(f1_score(y_test, svcpred_sigmoid) * 100, 2))
# Mean 2-fold cross-validated accuracy, then F1, on the training split.
# SVCCV_sigmoid is reused, so it holds the F1 mean once the cell finishes.
SVCCV_sigmoid = cross_val_score(svc_sigmoid, X_train, y_train, cv=2, n_jobs=1, scoring='accuracy').mean()
print(SVCCV_sigmoid)
SVCCV_sigmoid = cross_val_score(svc_sigmoid, X_train, y_train, cv=2, n_jobs=1, scoring='f1').mean()
print(SVCCV_sigmoid)

In [None]:
# Baseline SVC with a polynomial kernel: fit on the training split, predict the test split.
svc_poly = SVC(kernel='poly')
svc_poly.fit(X_train, y_train)
svcpred_poly = svc_poly.predict(X_test)

print(confusion_matrix(y_test, svcpred_poly))
# Scale to percent *before* rounding; round(x, 2)*100 multiplies after
# rounding and can print float noise such as 56.99999999999999.
print('Accuracy: ', round(accuracy_score(y_test, svcpred_poly) * 100, 2))
print('F1 Score: ', round(f1_score(y_test, svcpred_poly) * 100, 2))
# Mean 2-fold cross-validated accuracy, then F1, on the training split.
# SVCCV_poly is reused, so it holds the F1 mean once the cell finishes.
SVCCV_poly = cross_val_score(svc_poly, X_train, y_train, cv=2, n_jobs=1, scoring='accuracy').mean()
print(SVCCV_poly)
SVCCV_poly = cross_val_score(svc_poly, X_train, y_train, cv=2, n_jobs=1, scoring='f1').mean()
print(SVCCV_poly)

In [None]:
# Baseline SVC with an RBF kernel: fit on the training split, predict the test split.
svc_rbf = SVC(kernel='rbf')
svc_rbf.fit(X_train, y_train)
svcpred_rbf = svc_rbf.predict(X_test)

print(confusion_matrix(y_test, svcpred_rbf))
# Scale to percent *before* rounding; round(x, 2)*100 multiplies after
# rounding and can print float noise such as 56.99999999999999.
print('Accuracy :', round(accuracy_score(y_test, svcpred_rbf) * 100, 2))
print('F1 Score: ', round(f1_score(y_test, svcpred_rbf) * 100, 2))
# Mean 2-fold cross-validated accuracy, then F1, on the training split.
# SVCCV_rbf is reused, so it holds the F1 mean once the cell finishes.
SVCCV_rbf = cross_val_score(svc_rbf, X_train, y_train, cv=2, n_jobs=1, scoring='accuracy').mean()
print(SVCCV_rbf)
SVCCV_rbf = cross_val_score(svc_rbf, X_train, y_train, cv=2, n_jobs=1, scoring='f1').mean()
print(SVCCV_rbf)

In [None]:
# Print the repr of the fitted RBF estimator.
print(svc_rbf)

In [None]:
# Side-by-side test-set F1 (in percent) for the four baseline kernels.
# Scale to percent *before* rounding; round(x, 2)*100 multiplies after
# rounding and can print float noise such as 56.99999999999999.
print("linear f1: ", round(f1_score(y_test, svcpred_linear) * 100, 2))
print("poly f1: ", round(f1_score(y_test, svcpred_poly) * 100, 2))
print("sigmoid f1: ", round(f1_score(y_test, svcpred_sigmoid) * 100, 2))
print("RBF f1:", round(f1_score(y_test, svcpred_rbf) * 100, 2))

In [None]:
%%time
svc= SVC(kernel = 'sigmoid' , C=0.01)
svc.fit(X_train, y_train)
svcpred = svc.predict(X_test)
print(confusion_matrix(y_test, svcpred))
print(round(accuracy_score(y_test, svcpred),2)*100)
print('precision on the evaluation set: ', round(f1_score(y_test, svcpred),2)*100 )
# SVCCV_sigmoid = (cross_val_score(svc, X_train, y_train, cv=2, n_jobs=1, scoring = 'accuracy').mean())
# print(SVCCV_sigmoid)
# SVCCV_sigmoid = (cross_val_score(svc, X_train, y_train, cv=2, n_jobs=1, scoring = 'f1').mean())
# print(SVCCV_sigmoid)

In [None]:
# Sigmoid-kernel SVC with C=1, gamma=1: fit on the training split, score on the test split.
svc = SVC(kernel='sigmoid', C=1, gamma=1)
svc.fit(X_train, y_train)
svcpred = svc.predict(X_test)
# print(confusion_matrix(y_test, svcpred))
# Scale to percent *before* rounding; round(x, 2)*100 multiplies after
# rounding and can print float noise such as 56.99999999999999.
print('Accuracy: ', round(accuracy_score(y_test, svcpred) * 100, 2))
print('F1: ', round(f1_score(y_test, svcpred) * 100, 2))

In [None]:
# Sigmoid-kernel SVC with C=1, gamma=20: fit on the training split, score on the test split.
svc = SVC(kernel='sigmoid', C=1, gamma=20)
svc.fit(X_train, y_train)
svcpred = svc.predict(X_test)
# print(confusion_matrix(y_test, svcpred))
# Scale to percent *before* rounding; round(x, 2)*100 multiplies after
# rounding and can print float noise such as 56.99999999999999.
print('Accuracy: ', round(accuracy_score(y_test, svcpred) * 100, 2))
print('F1: ', round(f1_score(y_test, svcpred) * 100, 2))

In [None]:

# RBF-kernel SVC with default C and gamma: fit on the training split, score on the test split.
svc = SVC(kernel='rbf')
svc.fit(X_train, y_train)
svcpred = svc.predict(X_test)
print(confusion_matrix(y_test, svcpred))
# Scale to percent *before* rounding; round(x, 2)*100 multiplies after
# rounding and can print float noise such as 56.99999999999999.
print(round(accuracy_score(y_test, svcpred) * 100, 2))
print('F1 Score: ', round(f1_score(y_test, svcpred) * 100, 2))



In [None]:
# RBF-kernel SVC with C=1, gamma=20: fit on the training split, score on the test split.
svc = SVC(kernel='rbf', C=1, gamma=20)
svc.fit(X_train, y_train)
svcpred = svc.predict(X_test)
# print(confusion_matrix(y_test, svcpred))
# Scale to percent *before* rounding; round(x, 2)*100 multiplies after
# rounding and can print float noise such as 56.99999999999999.
print('Accuracy: ', round(accuracy_score(y_test, svcpred) * 100, 2))
print('F1: ', round(f1_score(y_test, svcpred) * 100, 2))

In [None]:
# RBF-kernel SVC with C=1, gamma=10: fit on the training split, score on the test split.
svc = SVC(kernel='rbf', C=1, gamma=10)
svc.fit(X_train, y_train)
svcpred = svc.predict(X_test)
# print(confusion_matrix(y_test, svcpred))
# Scale to percent *before* rounding; round(x, 2)*100 multiplies after
# rounding and can print float noise such as 56.99999999999999.
print('Accuracy: ', round(accuracy_score(y_test, svcpred) * 100, 2))
print('F1: ', round(f1_score(y_test, svcpred) * 100, 2))

In [None]:
# RBF-kernel SVC with C=1, gamma=1: fit on the training split, score on the test split.
svc = SVC(kernel='rbf', C=1, gamma=1)
svc.fit(X_train, y_train)
svcpred = svc.predict(X_test)
# print(confusion_matrix(y_test, svcpred))
# Scale to percent *before* rounding; round(x, 2)*100 multiplies after
# rounding and can print float noise such as 56.99999999999999.
print('Accuracy: ', round(accuracy_score(y_test, svcpred) * 100, 2))
print('F1: ', round(f1_score(y_test, svcpred) * 100, 2))

In [None]:
# Polynomial-kernel SVC with default C and gamma: fit on the training split, score on the test split.
svc = SVC(kernel='poly')
svc.fit(X_train, y_train)
svcpred = svc.predict(X_test)
# print(confusion_matrix(y_test, svcpred))
# Scale to percent *before* rounding; round(x, 2)*100 multiplies after
# rounding and can print float noise such as 56.99999999999999.
print('Accuracy: ', round(accuracy_score(y_test, svcpred) * 100, 2))
print('F1: ', round(f1_score(y_test, svcpred) * 100, 2))

In [None]:
# Polynomial-kernel SVC with C=1, gamma=1: fit on the training split, score on the test split.
svc = SVC(kernel='poly', C=1, gamma=1)
svc.fit(X_train, y_train)
svcpred = svc.predict(X_test)
# print(confusion_matrix(y_test, svcpred))
# Scale to percent *before* rounding; round(x, 2)*100 multiplies after
# rounding and can print float noise such as 56.99999999999999.
print('Accuracy: ', round(accuracy_score(y_test, svcpred) * 100, 2))
print('F1: ', round(f1_score(y_test, svcpred) * 100, 2))

In [None]:
# Polynomial-kernel SVC with C=1, gamma=10: fit on the training split, score on the test split.
svc = SVC(kernel='poly', C=1, gamma=10)
svc.fit(X_train, y_train)
svcpred = svc.predict(X_test)
# print(confusion_matrix(y_test, svcpred))
# Scale to percent *before* rounding; round(x, 2)*100 multiplies after
# rounding and can print float noise such as 56.99999999999999.
print('Accuracy: ', round(accuracy_score(y_test, svcpred) * 100, 2))
print('F1: ', round(f1_score(y_test, svcpred) * 100, 2))

In [None]:
# Mean 2-fold cross-validated accuracy, then F1, of whatever `svc` currently is.
# NOTE(review): the result is stored as SVCCV_rbf, but when the notebook is run
# top to bottom `svc` at this point is the poly-kernel model (C=1, gamma=10) —
# confirm which model was intended.
for metric in ('accuracy', 'f1'):
    SVCCV_rbf = cross_val_score(
        svc, X_train, y_train, cv=2, n_jobs=1, scoring=metric
    ).mean()
    print(SVCCV_rbf)

In [None]:

# Linear-kernel SVC: fit on the training split, then report accuracy and
# precision on the test split, followed by 2-fold CV scores on the training split.
svc = SVC(kernel='linear')
svc.fit(X_train, y_train)
svcpred = svc.predict(X_test)
print(confusion_matrix(y_test, svcpred))
# Scale to percent *before* rounding; round(x, 2)*100 multiplies after
# rounding and can print float noise such as 56.99999999999999.
print(round(accuracy_score(y_test, svcpred) * 100, 2))
print('precision on the evaluation set: ', round(precision_score(y_test, svcpred) * 100, 2))
# SVCCV_linear is reused, so it holds the F1 mean once the cell finishes.
SVCCV_linear = cross_val_score(svc, X_train, y_train, cv=2, n_jobs=1, scoring='accuracy').mean()
print(SVCCV_linear)

SVCCV_linear = cross_val_score(svc, X_train, y_train, cv=2, n_jobs=1, scoring='f1').mean()
print(SVCCV_linear)

In [None]:
# Mean 2-fold cross-validated accuracy, then F1, of the current `svc` on the
# training split; SVCCV_linear ends up holding the F1 mean.
for metric in ('accuracy', 'f1'):
    SVCCV_linear = cross_val_score(
        svc, X_train, y_train, cv=2, n_jobs=1, scoring=metric
    ).mean()
    print(SVCCV_linear)

The prediction accuracy on the evaluation set is 90.0%, which is pretty good given that we did not select models based on accuracy. The prediction precision is 64.8%, which means that among all clients who are predicted to subscribe to a term deposit, 64.8% will actually do so. This precision score is very close to the model's mean cross-validated precision score on $(X_{train}, y_{train})$, which is a good sign that the model is not overfitting. <br/>
There is definitely room to improve prediction precision. Some ideas include:
* Include more models into GridSearchCV (e.g. different gamma and C values for SVC, or even classifiers other than SVM)
* Change the data processing pipeline (e.g. treat variables as nominal instead of ordinal, standardize within each CV fold)
* Change train-test ratio