# SVM 
Support vector machine algoithm on the Spotify data set 

In [54]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import neighbors
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

import pandas as pd
%matplotlib inline

# visualization library
import seaborn as sns
sns.set(style="white", color_codes=True)
sns.set_context(rc={"font.family":'sans',"font.size":24,"axes.titlesize":24,"axes.labelsize":24})   

# change the theme
!jt -t gruvboxd -T

In [55]:
# download test data
df = pd.read_csv (r'C:\Users\Emelie\Documents\Modern method in ML\Project\project_train.csv')
# download train data that's given from canvas
X_testR = pd.read_csv (r'C:\Users\Emelie\Documents\Modern method in ML\Project\project_test.csv')

# take out outliers
df = df.drop([68,94,84], axis=0)
df.describe()

X = df.iloc[:, :-1].values
y = df.iloc[:, 11].values

# we split the training data into validation and training data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

#### Scale the data

In [56]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train) # scale against train data

X_scaled = scaler.transform(X)
# Given test data from canvas 
X_scaledR = scaler.transform(X_testR)

#### SVM with Sklearn

In [57]:
from sklearn import svm

svm_clf = svm.SVC()

#### Find the best parameters with gridsearch

In [58]:
from sklearn.model_selection import GridSearchCV

param_grid = {'gamma': [0.006,0.055],'C' : [185,186,184], 'kernel': ['rbf','poly','linear']}
search = GridSearchCV(svm_clf, param_grid, cv=10,scoring='accuracy')
X_scaled = scaler.transform(X)
search.fit(X_scaled,y)
print(search.best_score_)
print(search.best_params_)



0.775
{'C': 185, 'gamma': 0.006, 'kernel': 'rbf'}


#### Apply SVM with cross validation 

In [62]:
# classifier with best parameters
svm_clf = svm.SVC(gamma=0.006,C=185,kernel='rbf')

scores = cross_val_score(svm_clf, X_scaled, y, cv=KFold(n_splits=10, random_state=1234, shuffle=True), scoring = 'accuracy')
print(np.mean(scores))
print(np.var(scores))

0.8354166666666666
0.0036848958333333334


#### Test prediction with sklearns fit and predict

In [63]:
svm_clf.fit(X_train, y_train)
ypred = svm_clf.predict(X_test)
results = svm_clf.score(X_test, y_test)
print(results)

0.7291666666666666


### Use unlable data from Spotify

In [19]:
df_AB = pd.read_csv (r'C:\Users\Emelie\Documents\Modern method in ML\Project\Extra_data\extra_data_Andrew Belle.csv')
df_AV = pd.read_csv (r'C:\Users\Emelie\Documents\Modern method in ML\Project\Extra_data\extra_data_Avicii.csv')
df_VJ = pd.read_csv (r'C:\Users\Emelie\Documents\Modern method in ML\Project\Extra_data\extra_data_Vance Joy.csv')
df_IM = pd.read_csv (r'C:\Users\Emelie\Documents\Modern method in ML\Project\Extra_data\extra_data_Iron Maiden.csv')
df_LG = pd.read_csv (r'C:\Users\Emelie\Documents\Modern method in ML\Project\Extra_data\extra_data_Lady Gaga.csv') 

AB = df_AB.to_numpy()
AV = df_AV.to_numpy()
VJ = df_VJ.to_numpy()
IM = df_IM.to_numpy()
LG = df_LG.to_numpy()

X_unlab = np.concatenate((AB, AV, VJ, IM, LG))
X_unlab = X_unlab[:, 7: ]

X_unlab_scaled = scaler.transform(X_unlab)

#### Outliers

In [20]:
df = pd.DataFrame(X_unlab) 

outliers_percentage = []
variables = []
for k, v in df.items():
    Q1 = v.quantile(0.25)
    Q3 = v.quantile(0.75)
    IRQ = Q3 - Q1
    v_col = v[(v <= Q1 - 1.5 * IRQ) | (v >= Q3 + 1.5 * IRQ)]
    perc = np.shape(v_col)[0] * 100.0 / np.shape(df)[0]
    outliers_percentage.append(perc)
    variables.append(k)
#     print("Column %s outliers = %.2f%%" % (k, perc))

outliers = pd.DataFrame({'Variable':variables, '% Outliers':outliers_percentage })
outliers.sort_values(by=["% Outliers"],ascending=False)

Unnamed: 0,Variable,% Outliers
7,7,22.621185
8,8,7.899461
3,3,5.745063
5,5,5.385996
9,9,1.974865
0,0,0.718133
1,1,0.0
2,2,0.0
4,4,0.0
6,6,0.0


#### Running model

In [44]:
svm_clf = svm.SVC()
svm_clf.fit(X_train, y_train)

K = 10
C_max = 200.0
C_labeled = 180.0
C_unlabeled_0 =  C_labeled/(10*K) 

for k in range(K):
    Y_unlab = svm_clf.predict(X_unlab_scaled)

    #Update weight factor
    C_unlabeled = (C_max - C_unlabeled_0)/(K**2)*k**2 + C_unlabeled_0

    X_new_fit = np.concatenate((X_train, X_unlab_scaled))
    Y_new_fit = np.concatenate((y_train, Y_unlab))

    #Retrain the TSVM
    svm_clf.fit(X_new_fit, Y_new_fit, sample_weight=np.concatenate((C_labeled*np.ones(y_train.shape), C_unlabeled*np.ones(Y_unlab.shape))))

In [45]:
ypred = svm_clf.predict(X_test)
results = svm_clf.score(X_test, y_test)
print(results)

0.6666666666666666
