# SVM Implementation

### Data Preprocessing

In [1]:
import numpy as np
import pandas as pd 

path = "data/fix_polarity_wordcount4.csv"

# Column names for the CSV. Because names= is passed explicitly (and header=
# is left at its default), pandas treats the first line of the file as data.
colnames = ['pleasantness', 'attention', 'sensitivity', 'aptitude', 'polarity', 'emotion']

# Load the dataset into a DataFrame.
data = pd.read_csv(path, names=colnames)

# Features: every column except the 'emotion' label, so 'polarity' is kept.
# Alternative kept for reference -- it drops 'polarity' (column 4) as well:
# X = data.drop(data.columns[[4,5]], axis=1)
X = data.drop('emotion', axis=1)
# Target: the 'emotion' column.
y = data['emotion']

# Single-argument print(...) behaves identically on Python 2 and Python 3;
# the bare `print X.head()` statement form is a SyntaxError on Python 3.
print(X.head())
print(y.head())

   pleasantness  attention  sensitivity  aptitude  polarity
0          0.00        0.0       -1.000     -1.00     -1.00
1         -0.97        0.0       -0.990      0.00     -0.98
2          0.00        0.0        0.971      0.00     -0.97
3         -0.94        0.0        0.000      0.00     -0.94
4          0.00        0.0       -0.880     -0.95     -0.91
0   -1
1   -1
2   -1
3   -1
4   -1
Name: emotion, dtype: int64


### Train, Test, Split
Divides the dataset into a training set (80%) and a held-out test set (20%).

In [2]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# shuffle (and therefore every downstream number) repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0
)

# Non-null count per feature column in the training split.
X_train.count()

pleasantness    2420
attention       2420
sensitivity     2420
aptitude        2420
polarity        2420
dtype: int64

### Training the Classifier
We train the Support Vector Classifier with the hyperparameters found by grid search (RBF kernel, `gamma=0.1`, `C=1000`).

In [3]:
from sklearn.svm import SVC

# RBF-kernel support vector classifier with fixed hyperparameters.
classifier = SVC(C=1000, gamma=0.1, kernel='rbf')

# fit() hands back the estimator, so leaving the call as the cell's last
# expression displays the fully configured model.
classifier.fit(X_train, y_train)
# Earlier experiment kept for reference (gamma=0.01, scored on the test split):
# SVC(C=1000, kernel='rbf', gamma=0.01).fit(X_train, y_train).score(X_test, y_test)

SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

### Cross Validation

In [4]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the classifier, using the training split only
# so the held-out test split stays untouched.
all_accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)

# Single-argument print(...) calls behave identically on Python 2 and
# Python 3; the original print statements are a SyntaxError on Python 3.
print(all_accuracies)
# The label and the value are joined with one space, reproducing the original
# `print "Mean Accuracy: ", value` output (two spaces after the colon).
print("{} {}".format("Mean Accuracy: ", all_accuracies.mean()))

[0.66803279 0.7037037  0.67768595 0.68181818 0.64049587 0.63636364
 0.63636364 0.65145228 0.69294606 0.58506224]
Mean Accuracy:  0.6573924344229085


### Making Predictions and Generating Confusion Matrix

In [5]:
from sklearn.metrics import confusion_matrix

# Predict labels for the held-out test split; y_pred is reused by later cells.
y_pred = classifier.predict(X_test)

# Per scikit-learn's convention, rows are the true labels and columns are the
# predicted labels. print(...) with one argument works on Python 2 and 3;
# the original print statement is a SyntaxError on Python 3.
print(confusion_matrix(y_test, y_pred))

[[ 76  77  12]
 [ 14 130  43]
 [  1  33 220]]


### Recall, Precision, F1

In [6]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support, computed on the test split.
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

          -1       0.84      0.46      0.59       165
           0       0.54      0.70      0.61       187
           1       0.80      0.87      0.83       254

   micro avg       0.70      0.70      0.70       606
   macro avg       0.73      0.67      0.68       606
weighted avg       0.73      0.70      0.70       606



### Saving and Reloading the Model

In [8]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; joblib is now a standalone package that scikit-learn depends on.
# Fall back to the vendored copy only for older installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Save the trained classifier to disk.
filename = 'final_model_70.pkl'
joblib.dump(classifier, filename)

# Reload the saved model with joblib.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load model files you created yourself or otherwise trust.
pipe = joblib.load(filename)

# New data to predict.
pr = pd.read_csv('data/sample.csv')
# Use every column except the last one as features (in the displayed output
# the last column of the sample file is the 'emotion' label).
pred_cols = list(pr.columns.values)[:-1]

# pr = [[-0.33,-0.03,-0.26,-0.14,-0.17]]
# pred_cols = list(pr.values)

# Run the reloaded classifier on the feature columns. Despite its name,
# `pipe` is the bare SVC saved above, not a preprocessing pipeline.
pred = pd.Series(pipe.predict(pr[pred_cols]))
# Single-argument print(...) works on both Python 2 and Python 3.
print(pred)  # predicted
print(pr)  # actual

0    1
1    1
2   -1
3    0
dtype: int64
["'pleasantness'", " 'attention'", " 'sensitivity'", " 'aptitude'", " 'polarity'"]
   'pleasantness'   'attention'   'sensitivity'   'aptitude'   'polarity'  \
0           0.792        0.8275          0.0000       0.8085       0.8090   
1           0.855        0.8910          0.0000       0.4885       0.8995   
2          -0.330       -0.0300         -0.2600      -0.1400      -0.1700   
3           0.038        0.0095          0.0135       0.0590       0.0650   

    'emotion'  
0           1  
1           1  
2          -1  
3           0  


In [10]:
# Pair each test row's true label with the model's prediction. The frame keeps
# y_test's row index, so rows can be traced back to the source data.
df_actual_predicted = pd.DataFrame(
    data={'Actual': y_test, 'Predicted': y_pred},
    columns=['Actual', 'Predicted']
)

# Persist the comparison table as CSV (the output name carries no extension).
df_actual_predicted.to_csv('actual_vs_predicted_FINAL')

print(df_actual_predicted)

      Actual  Predicted
2161       0          0
2990       1          1
2189       1          0
10        -1         -1
1850       0          0
2456       0          1
2588       1          1
1283      -1          0
2112       0          0
768        1          1
2034       0          0
1931       0          0
2518       1          1
253       -1          0
530        1          1
1643       0          0
1002      -1         -1
794        1          1
2029      -1          0
482        1          1
1302      -1          0
2020       0          0
453        1          1
2769       1          1
1945       0          0
2927       1          1
1005      -1         -1
562        1          1
953       -1         -1
2014       0          0
...      ...        ...
1061      -1         -1
743        1          1
1412       0          0
457        1          1
2777       1          1
427        1          1
2291       0          1
1914       0          0
346        1          1
1397       0    