# Prediksi SVM

# Import Dataset

In [64]:
import pandas as pd
from sklearn.model_selection import train_test_split

data_clean = pd.read_csv('Dataset/jobstreet_3000_hasiltextpreprocessing.csv')
data_clean = pd.DataFrame(data_clean)

A = data_clean['text_tokens_stemmed']
B = data_clean['Label']

In [65]:
data_clean = data_clean.drop(columns=['score', 'text_clean', 'text_Stopword', 'text_tokens', 'content'])
# data_clean.drop(columns='content')

In [66]:
data_clean

Unnamed: 0,Label,text_tokens_stemmed
0,negatif,worth it
1,negatif,susah re login by seek gagal
2,negatif,tolong baik aplikasi login eror gausah suruh s...
3,positif,excellent
4,positif,good job
...,...,...
2749,positif,update
2750,positif,bantu
2751,negatif,selesai update ga login sih parah
2752,positif,good


In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Handle missing values by filling them with an empty string

A = A.fillna('')

tfid_vectorizer = TfidfVectorizer()

A_fit_tfid = tfid_vectorizer.fit_transform(A)
A_tfid = tfid_vectorizer.transform(A)

In [68]:
print(A.shape)
print(A_tfid.shape)

(2754,)
(2754, 3933)


In [69]:
import sklearn.metrics as metrics
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

for c in [0.01, 0.05, 0.25, 0.5, 0.75, 1]:
    svm = LinearSVC(C=c)
    svm.fit(A_tfid, B)
    print ("Accuracy for C=%s: %s" % (c, accuracy_score(B, svm.predict(A_tfid))))

svm = LinearSVC(C = 1)
svm.fit(A_tfid, B)

Accuracy for C=0.01: 0.8874364560639071
Accuracy for C=0.05: 0.9168482207697894
Accuracy for C=0.25: 0.9509803921568627
Accuracy for C=0.5: 0.9665940450254176
Accuracy for C=0.75: 0.9745824255628177
Accuracy for C=1: 0.9785766158315178


In [70]:
A_tfid.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Perform SVM

In [71]:
B_pred = svm.predict(A_tfid)

In [72]:
data_clean['Label SVM'] = B_pred

In [73]:
data_clean = pd.DataFrame(data_clean)
data_clean.head()

Unnamed: 0,Label,text_tokens_stemmed,Label SVM
0,negatif,worth it,negatif
1,negatif,susah re login by seek gagal,negatif
2,negatif,tolong baik aplikasi login eror gausah suruh s...,negatif
3,positif,excellent,positif
4,positif,good job,positif


In [74]:
data_clean.to_csv('Dataset\jobstreet_3000_hasil_sentimen_SVM.csv', index=False)

data_clean2 = pd.read_csv('Dataset/jobstreet_3000_hasil_sentimen_SVM.csv')
data_clean2 = pd.DataFrame(data_clean2)
data_clean2.head()

Unnamed: 0,Label,text_tokens_stemmed,Label SVM
0,negatif,worth it,negatif
1,negatif,susah re login by seek gagal,negatif
2,negatif,tolong baik aplikasi login eror gausah suruh s...,negatif
3,positif,excellent,positif
4,positif,good job,positif


In [75]:
# buatlah k-fold cross validation dengan k = 5
from sklearn.model_selection import cross_val_score

cross_val_score(svm, A_tfid, B, cv=5, scoring='accuracy')
# print setiap k
print(f"Akurasi setiap fold: {cross_val_score(svm, A_tfid, B, cv=5, scoring='accuracy')}")
# print rata-rata akurasi
print(f"Rata-rata akurasi: {cross_val_score(svm, A_tfid, B, cv=5, scoring='accuracy').mean()}")
# print standar deviasi
print(f"Standar deviasi: {cross_val_score(svm, A_tfid, B, cv=5, scoring='accuracy').std()}")
# print akurasi tertinggi
print(f"Akurasi tertinggi: {cross_val_score(svm, A_tfid, B, cv=5, scoring='accuracy').max()}")
# print akurasi terendah
print(f"Akurasi terendah: {cross_val_score(svm, A_tfid, B, cv=5, scoring='accuracy').min()}")
# print waktu eksekusi
print(f"Waktu eksekusi: {cross_val_score(svm, A_tfid, B, cv=5, scoring='accuracy').mean()}")


Akurasi setiap fold: [0.86751361 0.8584392  0.86388385 0.83303085 0.86909091]
Rata-rata akurasi: 0.858391684540505
Standar deviasi: 0.013198642722777336
Akurasi tertinggi: 0.8690909090909091
Akurasi terendah: 0.8330308529945554
Waktu eksekusi: 0.858391684540505


In [76]:
from sklearn.metrics import accuracy_score

print("Accuracy score: ", accuracy_score(data_clean['Label'], data_clean['Label SVM']))

Accuracy score:  0.9785766158315178


# Evaluasi Model

In [77]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

print('Accuracy score:', accuracy_score(data_clean2['Label'], data_clean2['Label SVM']))
print('Precision score:', precision_score(data_clean2['Label'], data_clean2['Label SVM'], average='weighted'))
print('Recall score:', recall_score(data_clean2['Label'], data_clean2['Label SVM'], average='weighted'))
print('F1 score:', f1_score(data_clean2['Label'], data_clean2['Label SVM'], average='weighted'))

# confusion matrix
print(f'confusion matrix:\n{confusion_matrix(data_clean2["Label"], data_clean2["Label SVM"])}')
print('=========================================================\n')

print(classification_report(data_clean2['Label'], data_clean2['Label SVM']))

Accuracy score: 0.9785766158315178
Precision score: 0.9786236640331221
Recall score: 0.9785766158315178
F1 score: 0.978584416234606
confusion matrix:
[[1223   24]
 [  35 1472]]

              precision    recall  f1-score   support

     negatif       0.97      0.98      0.98      1247
     positif       0.98      0.98      0.98      1507

    accuracy                           0.98      2754
   macro avg       0.98      0.98      0.98      2754
weighted avg       0.98      0.98      0.98      2754



In [78]:
data_clean

Unnamed: 0,Label,text_tokens_stemmed,Label SVM
0,negatif,worth it,negatif
1,negatif,susah re login by seek gagal,negatif
2,negatif,tolong baik aplikasi login eror gausah suruh s...,negatif
3,positif,excellent,positif
4,positif,good job,positif
...,...,...,...
2749,positif,update,negatif
2750,positif,bantu,positif
2751,negatif,selesai update ga login sih parah,negatif
2752,positif,good,positif


In [79]:
data_clean.to_csv('Dataset/jobstreet_3000_hasil_sentimen_SVM.csv', index=False)