## SVM Classification

In [1]:
# Load dependencies for this Jupyter Notebook
import pandas as pd
import time
import numpy as np
from functools import reduce
from lib.util import fetch_extracted
import matplotlib.pyplot as plt

import seaborn as sns

#Train and Test preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn import preprocessing

#Classifiers:
from sklearn import svm

In [3]:
# Read CSV

events=[
            "germanwings-crash",
            "sydneysiege",
            "ottawashooting",
            "ferguson",
            "charliehebdo",
        ]

events_threads={}
for event in events:
    X,y=fetch_extracted(event)
    X=X.drop(X.columns.values[np.where(np.isnan(X.values))[1]],axis=1)
    events_threads[event]={'X':X.values,'y':y.values,'columns':X.columns}

In [4]:
def test_models(models,X_train, X_test, y_train, y_test):
    for model_name in models:
        model=models[model_name]
        model.fit(X_train,y_train)
        y_pred=model.predict(X_test)
        print('%s train accuracy:' % model_name, accuracy_score(model.predict(X_train),y_train))
        print('%s test accuracy:' % model_name, accuracy_score(y_pred, y_test))
        # print('%s classification report:' % model_name, classification_report(y_pred, y_test))
        print('%s y_test:' % model_name, y_test)
        print('%s y_pred:' % model_name, y_pred)
        print('%s precision:' % model_name, precision_score(y_pred, y_test))
        print('%s recall:' % model_name, recall_score(y_pred, y_test))
        print('%s f1-score:' % model_name, f1_score(y_pred, y_test))
        print()
        
def split_train_and_test_data(train_events,test_events):
    d=events_threads[train_events[0]]['X'].shape[1]
    X_train=np.zeros((0,d))
    X_test=np.zeros((0,d))
    y_train=np.zeros((0))
    y_test=np.zeros((0))
    for event in train_events:
        if event in test_events:
            X_train1, X_test1, y_train1, y_test1 = train_test_split(events_threads[event]['X'], events_threads[event]['y'], test_size=0.25, random_state=1)
            X_train=np.concatenate((X_train,X_train1),axis=0)
            y_train=np.concatenate((y_train,y_train1),axis=None)  
            X_test=np.concatenate((X_test,X_test1),axis=0)
            y_test=np.concatenate((y_test,y_test1),axis=None)
        else:
            X_train=np.concatenate((X_train,events_threads[event]['X']),axis=0)
            y_train=np.concatenate((y_train,events_threads[event]['y']),axis=0)


    for event in test_events:
        if event not in train_events:
            X_test=np.concatenate((X_test,events_threads[event]['X']),axis=0)
            y_test=np.concatenate((y_test,events_threads[event]['y']),axis=0)

    le = preprocessing.LabelEncoder()
    le.fit(y_train)
    y_train=le.transform(y_train)
    y_test=le.transform(y_test)
    return X_train, X_test, y_train, y_test


## Testing

In [5]:
models={
    'LinearSVC_with_L1_Regularization' : svm.LinearSVC(penalty='l1',dual=False,max_iter=3000),
    'linear_SVM':svm.SVC(gamma='scale', kernel='linear'),
    'SVM_with_RBF_kernel': svm.SVC(gamma='scale', kernel='rbf'),
    'SVM_with_sigmoid_kernel' : svm.SVC(gamma='scale', kernel='sigmoid'),
}

## 1. Train and test pada *charliehebdo* event:

In [8]:
X_train, X_test, y_train, y_test=split_train_and_test_data(['charliehebdo'],['charliehebdo'])
test_models(models,X_train, X_test, y_train, y_test)



LinearSVC_with_L1_Regularization train accuracy: 0.9899679829242263
LinearSVC_with_L1_Regularization test accuracy: 0.9916755602988261
LinearSVC_with_L1_Regularization y_test: [0 0 0 ... 0 0 1]
LinearSVC_with_L1_Regularization y_pred: [0 0 0 ... 0 0 1]
LinearSVC_with_L1_Regularization precision: 0.9721518987341772
LinearSVC_with_L1_Regularization recall: 0.9948186528497409
LinearSVC_with_L1_Regularization f1-score: 0.9833546734955185

linear_SVM train accuracy: 0.9943792244752757
linear_SVM test accuracy: 0.9957310565635006
linear_SVM y_test: [0 0 0 ... 0 0 1]
linear_SVM y_pred: [0 0 0 ... 0 0 1]
linear_SVM precision: 0.9831223628691983
linear_SVM recall: 1.0
linear_SVM f1-score: 0.9914893617021276

SVM_with_RBF_kernel train accuracy: 0.9811454998221274
SVM_with_RBF_kernel test accuracy: 0.9784418356456777
SVM_with_RBF_kernel y_test: [0 0 0 ... 0 0 1]
SVM_with_RBF_kernel y_pred: [0 0 0 ... 0 0 1]
SVM_with_RBF_kernel precision: 0.9172995780590717
SVM_with_RBF_kernel recall: 0.9972477064

## 2. Train and test pada *germanwings-crash* event:

In [9]:
X_train, X_test, y_train, y_test=split_train_and_test_data(['germanwings-crash'],['germanwings-crash'])
test_models(models,X_train, X_test, y_train, y_test)

LinearSVC_with_L1_Regularization train accuracy: 0.9977262391996362
LinearSVC_with_L1_Regularization test accuracy: 0.9959128065395095
LinearSVC_with_L1_Regularization y_test: [0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 1 0 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0
 0 0 0 0 1 0 0 1 0 0 1 0 1 1 0 0 0 1 0 0 1 0 1 0 1 0 1 0 0 1 1 0 1 0 1 1 1
 0 1 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0
 0 1 0 0 1 1 0 1 0 1 0 0 1 0 0 0 0 1 1 0 0 0 1 0 0 1 0 1 1 1 0 0 1 0 0 0 0
 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 0
 1 0 0 1 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0
 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 1 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 1
 1 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 1 0 1 0 0 1 1 0 0 0 0 0 1 1 1 0 0 1 0
 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 1 1
 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 1 0 1 0 0 1 1 1 0 0 0 0 1 1 0 1

## 3. Train pada *germanwings-crash* event dan test pada *charliehebdo* event:

In [11]:
X_train, X_test, y_train, y_test=split_train_and_test_data(['germanwings-crash'],['charliehebdo'])
test_models(models,X_train, X_test, y_train, y_test)

LinearSVC_with_L1_Regularization train accuracy: 0.9979543129901125
LinearSVC_with_L1_Regularization test accuracy: 0.5724119530416222
LinearSVC_with_L1_Regularization y_test: [0 1 0 ... 1 0 1]
LinearSVC_with_L1_Regularization y_pred: [1 1 0 ... 1 0 1]
LinearSVC_with_L1_Regularization precision: 0.9995828118481435
LinearSVC_with_L1_Regularization recall: 0.37428727641958914
LinearSVC_with_L1_Regularization f1-score: 0.544638290617719

linear_SVM train accuracy: 0.9952267303102625
linear_SVM test accuracy: 0.5795090715048026
linear_SVM y_test: [0 1 0 ... 1 0 1]
linear_SVM y_pred: [1 1 0 ... 1 0 1]
linear_SVM precision: 0.9995828118481435
linear_SVM recall: 0.3782162588792423
linear_SVM f1-score: 0.5487860742098031

SVM_with_RBF_kernel train accuracy: 0.9740879645414252
SVM_with_RBF_kernel test accuracy: 0.971291355389541
SVM_with_RBF_kernel y_test: [0 1 0 ... 1 0 1]
SVM_with_RBF_kernel y_pred: [0 1 0 ... 1 0 0]
SVM_with_RBF_kernel precision: 0.9261576971214017
SVM_with_RBF_kernel recall

## 4. Train pada *germanwings-crash* event dan test pada *ottawashooting* event:

In [13]:
X_train, X_test, y_train, y_test=split_train_and_test_data(['germanwings-crash'],['ottawashooting'])
test_models(models,X_train, X_test, y_train, y_test)

LinearSVC_with_L1_Regularization train accuracy: 0.9979543129901125
LinearSVC_with_L1_Regularization test accuracy: 0.6024868606588899
LinearSVC_with_L1_Regularization y_test: [0 1 0 ... 0 0 1]
LinearSVC_with_L1_Regularization y_pred: [1 1 1 ... 1 0 1]
LinearSVC_with_L1_Regularization precision: 0.9990632318501171
LinearSVC_with_L1_Regularization recall: 0.4076834862385321
LinearSVC_with_L1_Regularization f1-score: 0.5790688204153659

linear_SVM train accuracy: 0.9952267303102625
linear_SVM test accuracy: 0.6049224458402769
linear_SVM y_test: [0 1 0 ... 0 0 1]
linear_SVM y_pred: [1 1 1 ... 1 0 1]
linear_SVM precision: 0.9990632318501171
linear_SVM recall: 0.4091693842317284
linear_SVM f1-score: 0.58056614044638

SVM_with_RBF_kernel train accuracy: 0.9740879645414252
SVM_with_RBF_kernel test accuracy: 0.9766696577361876
SVM_with_RBF_kernel y_test: [0 1 0 ... 0 0 1]
SVM_with_RBF_kernel y_pred: [0 1 0 ... 0 0 0]
SVM_with_RBF_kernel precision: 0.9236533957845433
SVM_with_RBF_kernel recall:

## 5. Train pada *ottawashooting* event dan test pada *fergusn* event:

In [14]:
X_train, X_test, y_train, y_test=split_train_and_test_data(['ottawashooting'],['ferguson'])
test_models(models,X_train, X_test, y_train, y_test)

LinearSVC_with_L1_Regularization train accuracy: 0.9860274323804641
LinearSVC_with_L1_Regularization test accuracy: 0.9893852685988554
LinearSVC_with_L1_Regularization y_test: [0 1 0 ... 0 0 0]
LinearSVC_with_L1_Regularization y_pred: [0 1 0 ... 0 0 0]
LinearSVC_with_L1_Regularization precision: 0.9784729229734275
LinearSVC_with_L1_Regularization recall: 0.9827702702702703
LinearSVC_with_L1_Regularization f1-score: 0.9806168885892467

linear_SVM train accuracy: 0.993334187924625
linear_SVM test accuracy: 0.9921543289643714
linear_SVM y_test: [0 1 0 ... 0 0 0]
linear_SVM y_pred: [0 1 0 ... 0 0 0]
linear_SVM precision: 0.9952909519004373
linear_SVM recall: 0.9765676567656766
linear_SVM f1-score: 0.98584041312677

SVM_with_RBF_kernel train accuracy: 0.9812844507114472
SVM_with_RBF_kernel test accuracy: 0.9795089532951818
SVM_with_RBF_kernel y_test: [0 1 0 ... 0 0 0]
SVM_with_RBF_kernel y_pred: [0 1 0 ... 0 0 0]
SVM_with_RBF_kernel precision: 0.9404641775983855
SVM_with_RBF_kernel recall: 