# Prediciting Post Reaction using Support Vector Machine

## Import Libraries

### Main Libraries

In [1]:
import pandas as pd
import numpy as np
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

### Scikit Learn

In [2]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

## Load Dataset

In [3]:
df_train = pd.read_csv('postSenti_train.tsv', index_col='Unnamed: 0', sep='\t')
df_test = pd.read_csv('postSenti_test.tsv', index_col='Unnamed: 0', sep='\t') 

In [4]:
# Separate features and target for training dataset
X_train = df_train.drop(['sentiment_class'], axis=1)
y_train = df_train['sentiment_class']

In [5]:
# Separate features and target for testing dataset
X_test = df_test.drop(['sentiment_class'], axis=1)
y_test = df_test['sentiment_class']

## Support Vector Classification

### Feature selection using RandomForestClassifier--feature_importances

In [6]:
# Instantiate RandomForestClassifier
rf = RandomForestClassifier(random_state=35).fit(X_train, y_train)
rf_feat_imp = rf.feature_importances_
# Feature importance
feature_importance = pd.DataFrame(rf_feat_imp, index=X_train.columns, columns=["Feat_Imp"])

In [7]:
#Sort in descending order to get Most Important to Least Important
feature_importance.sort_values("Feat_Imp",ascending=False, inplace=True)
feature_importance.reset_index(inplace=True)
feature_importance

Unnamed: 0,index,Feat_Imp
0,Topic 0,0.167904
1,hour_day,0.139367
2,Topic 1,0.109699
3,Topic 2,0.069233
4,Topic 7,0.067534
5,Topic 9,0.06399
6,Topic 5,0.060787
7,Topic 6,0.057733
8,Topic 8,0.055409
9,Topic 4,0.051431


In [8]:
# Instantiate SVC
clf = SVC()

# Function to get the accuracy of the model
def get_accuracy(xtrain, ytrain, xtest, ytest):
    clf.fit(xtrain, ytrain)
    ypred = clf.predict(xtest)
    return accuracy_score(ypred, ytest)*100

In [9]:
score_dict = {}
# Store accuracies in score_dict
for i in range(1, len(feature_importance)):
    features = feature_importance.iloc[:i]
    predictor = list(features['index'])
    key = 'top' + str(i)
    score_dict[key] = get_accuracy(X_train[predictor], y_train, X_test[predictor], y_test)

In [10]:
# Check scores to determine how many features will be used
score_dict

{'top1': 60.204081632653065,
 'top2': 60.204081632653065,
 'top3': 60.204081632653065,
 'top4': 60.204081632653065,
 'top5': 60.204081632653065,
 'top6': 60.71428571428571,
 'top7': 60.71428571428571,
 'top8': 60.71428571428571,
 'top9': 60.71428571428571,
 'top10': 60.71428571428571,
 'top11': 60.204081632653065,
 'top12': 61.224489795918366,
 'top13': 61.224489795918366,
 'top14': 61.224489795918366,
 'top15': 61.224489795918366,
 'top16': 61.224489795918366,
 'top17': 61.224489795918366}

In [11]:
# Top 6 Feature Importance
top_feature = feature_importance.iloc[:6]
predictor = list(top_feature['index'])
predictor

['Topic 0', 'hour_day', 'Topic 1', 'Topic 2', 'Topic 7', 'Topic 9']

### Hyperparameter Optimization using GridSearch

In [12]:
param_grid = {'kernel': ['linear', 'rbf', 'poly'],
              'degree': [3],
              'gamma': ['auto'], 
              'C': [0.1, 1, 5]}

In [13]:
# Instantiate GridSearchCV
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=5)

In [14]:
# Fit X_train
grid_search.fit(X_train[predictor], y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'kernel': ['linear', 'rbf', 'poly'], 'degree': [3], 'gamma': ['auto'], 'C': [0.1, 1, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [15]:
print("Best estimator:")
print(grid_search.best_estimator_)
print("Best parameters:")
print(grid_search.best_params_)
print("Best score:")
print(grid_search.best_score_)

Best estimator:
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
Best parameters:
{'C': 1, 'degree': 3, 'gamma': 'auto', 'kernel': 'poly'}
Best score:
0.5659411011523687


### Build SVC Model

In [16]:
# Instantiate SVC
svm = SVC(C=0.1, degree=3, gamma='auto', kernel='poly')
# Fit the training dataset on the SVC
svm.fit(X_train[predictor], y_train)
# Predict the labels on validation dataset
y_pred = svm.predict(X_test[predictor])

### Evaluate SVC Model

In [17]:
# Get Metrics
accuracy = accuracy_score(y_pred, y_test)*100
# Store metrics in dictionary
score_dict = {'SVC': pd.Series([accuracy], index=['AccuracyScore'])}
# Transform dictionary to df
df_metrics = pd.DataFrame(score_dict)
df_metrics

Unnamed: 0,SVC
AccuracyScore,60.714286


## Cross Validation

In [18]:
print('Mean Cross validated score -> ', np.mean(cross_val_score(svm, X_train, y_train, cv=5)*100))

Mean Cross validated score ->  56.33778308257688
