# Sentiment Analysis - SVM with TFRF Term Weighting

`SVM Kernel = Linear; Term Weighting = TFRF`

## Import Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import numpy as np 
import re #RegEx
import itertools
import matplotlib.pyplot as plt
 
from sklearn import svm #Import SVM Classification
from sklearn.feature_extraction.text import CountVectorizer #Count Vector Space Model
from sklearn import metrics #Matrix Builder
from sklearn.metrics import accuracy_score  
from sklearn.model_selection import KFold #Import KFold
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC #Support Vector Classifier
from sklearn.metrics import classification_report
from textvec import vectorizers

## Load Dataset

*The dataset has already been preprocessed.*

In [2]:
# Read the cleaned tweets from disk into a DataFrame.
DATASET_PATH = 'data/clean_tweets.csv'
df = pd.read_csv(DATASET_PATH)
df  # Display the dataset

Unnamed: 0,tweet,stemming,label
0,#AyoTolakUUIKN \n\nProyek IKN membuka peluang ...,"['proyek', 'ikn', 'buka', 'peluang', 'oligarki...",-1.0
1,Tolak UU IKN karena berpotensi merusak lingkun...,"['tolak', 'uu', 'ikn', 'potensi', 'rusak', 'li...",-1.0
2,UU IKN hanya akan merugikan rakyat dengan huta...,"['uu', 'ikn', 'rugi', 'rakyat', 'hutang', 'rib...",-1.0
3,Jika UU ini diterapkan yang terjadi adalah mas...,"['uu', 'terap', 'masyarakat', 'rasa', 'rugi', ...",-1.0
4,UU IKN Syarat kepentingan oligarki. Tolak n ba...,"['uu', 'ikn', 'syarat', 'penting', 'oligarki',...",-1.0
...,...,...,...
5887,Horee!! Kabar gembira gaes!\nProses pemindahan...,"['horee', 'kabar', 'gembira', 'proses', 'pinda...",1.0
5888,Juru Bicara Presiden RI Fadjroel Rachman menga...,"['juru', 'bicara', 'presiden', 'ri', 'fadjroel...",1.0
5889,Pemerintah Indonesia mengajak Korea Selatan un...,"['perintah', 'indonesia', 'ajak', 'korea', 'se...",1.0
5890,@pikiran_rakyat masalah Jakarta tepatnya harus...,"['jakarta', 'tepat', 'asai', 'hadap', 'selesai...",-1.0


*Count the number of tweets for each label*

In [3]:
# Class balance: how many rows carry each sentiment label.
label_counts = df['label'].value_counts()
label_counts

 1.0    3750
-1.0    2142
Name: label, dtype: int64

### Implementing TFRF Feature Weighting

In [4]:
# Bag-of-words term counts over the stemmed text: unigrams only, where a token
# is any run of one or more word characters.
vectorizer = CountVectorizer(strip_accents='unicode',token_pattern=r'\w{1,}',ngram_range=(1, 1)).fit(df['stemming'])

# Vectorise the corpus once and reuse the matrix for both fitting and
# transforming (previously the same transform was computed twice).
term_counts = vectorizer.transform(df['stemming'])

# TF-RF weighting is fitted with the class labels.
# NOTE(review): the weights are fitted on every row together with its label.
# If TFRF is later split for cross-validation, the held-out labels have already
# influenced the features - consider fitting inside each training fold.
tfrf_vec = vectorizers.TfrfVectorizer(sublinear_tf=True)
tfrf_vec.fit(term_counts, df['label'])
TFRF = tfrf_vec.transform(term_counts)

In [5]:
# Show the weighted matrix; the printed form lists each stored entry as
# (row, column) followed by its weight.
print(TFRF)

  (0, 6111)	0.23436911255176732
  (0, 5846)	0.22360323542954716
  (0, 4599)	0.22476229022161281
  (0, 4232)	0.20697071832324376
  (0, 4014)	0.21616383976678716
  (0, 3267)	0.2065916872246241
  (0, 2076)	0.740126567306348
  (0, 1075)	0.3487316733153249
  (0, 838)	0.20714656738456683
  (1, 6111)	0.2178650688089469
  (1, 5846)	0.20785731422693862
  (1, 4904)	0.19220502003944884
  (1, 4599)	0.20893474951385574
  (1, 4487)	0.19366804060206536
  (1, 4014)	0.2009417935326057
  (1, 3379)	0.20242409822881666
  (1, 3312)	0.19377934446250483
  (1, 3219)	0.20286466279250187
  (1, 3175)	0.19529483667654718
  (1, 2076)	0.6880075781227821
  (1, 1811)	0.3459396089100405
  (2, 6111)	0.2060207729892293
  (2, 5947)	0.18108416364901792
  (2, 4888)	0.18299477055267177
  (2, 4815)	0.18117572603076199
  :	:
  (5890, 3379)	0.18466777841264792
  (5890, 2940)	0.32537808546115826
  (5890, 2616)	0.17460897193688868
  (5890, 2481)	0.17460897193688868
  (5890, 2325)	0.3176474903402862
  (5890, 2322)	0.1750494662727

*Extract the labels into their own list*

In [6]:
# Copy the sentiment labels out of the DataFrame into a plain Python list.
label = df['label'].tolist()
# NOTE(review): this binds the list's `pop` method without calling it, so no
# element is removed here. It looks unintended - confirm nothing uses `kolom`
# before deleting it.
kolom = label.pop

### Average SVM Function

In [7]:
def avgSVM(k, scores=None):
    """Print the mean of the first ``k`` accuracy values.

    Parameters
    ----------
    k : int
        Number of leading entries to average.
    scores : sequence of float, optional
        Accuracy values to average. When omitted, the notebook-level
        ``accuracy`` list is used, which keeps existing ``avgSVM(k)`` calls
        working unchanged.

    Raises
    ------
    IndexError
        If fewer than ``k`` values are available.
    ZeroDivisionError
        If ``k`` is 0.
    """
    if scores is None:
        # Backward-compatible fallback to the global list filled by the K-Fold loop.
        scores = accuracy
    total = 0
    for i in range(k): #Iterate for k times
        total = total + scores[i]
    print("SVM Average Accuracy :", total / k)

In [8]:
def Average(lst):
    """Return the arithmetic mean of the values in ``lst``."""
    total = sum(lst)
    count = len(lst)
    return total / count

*Finding the best k for K-Fold Cross Validation*

In [9]:
# Convert the labels to an ndarray once so they can be indexed with the fold
# index arrays (this conversion used to be repeated on every fold).
label = np.array(label)

# Try every fold count from 2 to 10 and report the mean, best and worst
# per-fold accuracy of a linear SVM for each.
folds = range(2,11)
for k in folds:
    accuracy=[]  # per-fold accuracies for the current k only
    # Shuffled split with a fixed seed so the folds are reproducible.
    kFoldCrossValidation = KFold(n_splits=k, random_state=50, shuffle = True)
    for train, test in kFoldCrossValidation.split(TFRF, label):
        # train/test are row-index arrays; slice features and labels with them.
        trainData, testData = TFRF[train], TFRF[test]
        trainData2, testData2 = label[train], label[test]
        
        SVM = SVC(kernel = 'linear', C = 1)
        model = SVM.fit(trainData, trainData2)
        prediksi = model.predict(testData)
        
        accuracy.append(accuracy_score(testData2, prediksi))
        
    print('Folds : %d | Avg Accuracy : %.3f | Max, Min : %.3f, %.3f' 
          % (k, Average(accuracy), max(accuracy), min(accuracy)))
    print("\n")
# NOTE: after this cell `accuracy` still holds the fold scores of the last k tried.

Folds : 2 | Avg Accuracy : 0.862 | Max, Min : 0.865, 0.858


Folds : 3 | Avg Accuracy : 0.864 | Max, Min : 0.866, 0.860


Folds : 4 | Avg Accuracy : 0.865 | Max, Min : 0.874, 0.855


Folds : 5 | Avg Accuracy : 0.870 | Max, Min : 0.880, 0.858


Folds : 6 | Avg Accuracy : 0.870 | Max, Min : 0.883, 0.860


Folds : 7 | Avg Accuracy : 0.868 | Max, Min : 0.878, 0.850


Folds : 8 | Avg Accuracy : 0.871 | Max, Min : 0.881, 0.849


Folds : 9 | Avg Accuracy : 0.871 | Max, Min : 0.890, 0.840


Folds : 10 | Avg Accuracy : 0.871 | Max, Min : 0.890, 0.832




### Implementing KFold with chosen K value

In [10]:
# Number of folds for the final evaluation. The previous version of this cell
# said "k=5" in its comment and averaged 5 scores, yet passed n_splits=10 to
# KFold; one constant now drives both the split and the average.
# NOTE(review): 5 was chosen because the comment and the averaging call agreed
# on it - change it here if 10 folds were intended.
N_SPLITS = 5

# Labels as an ndarray so they can be indexed with the fold index arrays.
label = np.array(label)
label_target = ['positif','negatif']

# Start from an empty list so scores left over from earlier cells are not
# mixed into this run's average.
accuracy = []

# Shuffled K-Fold with a fixed seed; iterates N_SPLITS times.
kFoldCrossValidation = KFold(n_splits=N_SPLITS, random_state=50, shuffle = True)
for train, test in kFoldCrossValidation.split(TFRF, label):
    
    print("==========================================================================================")
    print("Amount of Train Data: ", len(train))
    print("Amount of Test Data: ", len(test))
    print("\nTrain Data: \n", train)
    print("\nTest Data: \n", test)
    # Slice the already-weighted TFRF matrix and the labels with the fold's
    # row indices (trainData2/testData2 hold the labels).
    trainData, testData = TFRF[train], TFRF[test]
    trainData2, testData2 = label[train], label[test]
    
    SVM = SVC(kernel = 'linear', C = 1)
    model = SVM.fit(trainData, trainData2)
    prediksi = model.predict(testData)
    
    print("\nSVM Prediction  : \n", prediksi)
    
    print("\nConfusion Matrix: \n", metrics.confusion_matrix(testData2, prediksi))
   
    fold_accuracy = accuracy_score(testData2, prediksi)
    accuracy.append(fold_accuracy)
    
    print("\nSVM Accuracy : ", fold_accuracy)
    print()
    
    # Per-class precision / recall / F1, positive class (1) listed first.
    print(metrics.classification_report(testData2, prediksi, labels=[1,-1]))
    
# Average over exactly the folds evaluated above.
avgSVM(N_SPLITS)

Amount of Train Data:  4713
Amount of Test Data:  1179

Train Data: 
 [   1    2    3 ... 5889 5890 5891]

Test Data: 
 [   0    4    8 ... 5883 5885 5888]

SVM Prediction  : 
 [-1. -1. -1. ...  1. -1.  1.]

Confusion Matrix: 
 [[306 136]
 [ 31 706]]

SVM Accuracy :  0.8583545377438507

              precision    recall  f1-score   support

           1       0.84      0.96      0.89       737
          -1       0.91      0.69      0.79       442

    accuracy                           0.86      1179
   macro avg       0.87      0.83      0.84      1179
weighted avg       0.86      0.86      0.85      1179

Amount of Train Data:  4713
Amount of Test Data:  1179

Train Data: 
 [   0    1    2 ... 5888 5889 5890]

Test Data: 
 [   3    5    6 ... 5876 5887 5891]

SVM Prediction  : 
 [-1. -1. -1. ... -1.  1.  1.]

Confusion Matrix: 
 [[302 114]
 [ 34 729]]

SVM Accuracy :  0.8744698897370653

              precision    recall  f1-score   support

           1       0.86      0.96      0.9