# Prediksi SVM

# Import Dataset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

data_clean = pd.read_csv('Dataset/jobstreet_3000_hasiltextpreprocessing.csv')
data_clean = pd.DataFrame(data_clean)

A = data_clean['text_tokens_stemmed']

X_train, X_test, y_train, y_test = train_test_split(data_clean['text_tokens_stemmed'], data_clean['Label'], test_size=0.2, random_state=0)

In [2]:
data_clean = data_clean.drop(columns=['score', 'text_clean', 'text_Stopword', 'text_tokens', 'content'])
# data_clean.drop(columns='content')

In [3]:
data_clean

Unnamed: 0,Label,text_tokens_stemmed
0,positif,nyaman the best resposif
1,negatif,ni gimanasii daritdi bikin akun register aja g...
2,negatif,gak masuk
3,negatif,maaf buka jobstreet register masuk ga buka
4,positif,good
...,...,...
2766,positif,layan baik
2767,positif,mantap
2768,positif,ok
2769,positif,bantu cari loker


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Handle missing values by filling them with an empty string
X_train = X_train.fillna('')
X_test = X_test.fillna('')
A = A.fillna('')

tfid_vectorizer = TfidfVectorizer()
X_train_tfid = tfid_vectorizer.fit_transform(X_train)
X_test_tfid = tfid_vectorizer.transform(X_test)
A_tfid = tfid_vectorizer.transform(A)

In [5]:
print(X_train_tfid.shape)
print(y_train.shape)
print(X_test_tfid.shape)
print(y_test.shape)
print(A.shape)
print(A_tfid.shape)

(2216, 3403)
(2216,)
(555, 3403)
(555,)
(2771,)
(2771, 3403)


In [7]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(X_train)

In [8]:
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)
A = vectorizer.transform(A)

In [9]:
import sklearn.metrics as metrics
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

for c in [0.01, 0.05, 0.25, 0.5, 0.75, 1]:
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" % (c, accuracy_score(y_test, svm.predict(X_test))))

Accuracy for C=0.01: 0.818018018018018
Accuracy for C=0.05: 0.836036036036036
Accuracy for C=0.25: 0.8414414414414414
Accuracy for C=0.5: 0.8432432432432433
Accuracy for C=0.75: 0.8378378378378378
Accuracy for C=1: 0.8378378378378378


In [10]:
X_train.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# Perform SVM

In [14]:
svm = LinearSVC(C = 0.25)
svm.fit(X_train, y_train)

In [15]:
print("Accuracy score model final: %s" % accuracy_score(y_test, svm.predict(X_test)))

Accuracy score model final: 0.8414414414414414


# Evaluasi Model

In [20]:
y_pred = svm.predict(X_test)
b_pred = svm.predict(A)
print('Accuracy of SVM classifier on test set: {:.2f}'.format(svm.score(X_test, y_test)))

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

Accuracy of SVM classifier on test set: 0.84
              precision    recall  f1-score   support

     negatif       0.83      0.77      0.79       222
     positif       0.85      0.89      0.87       333

    accuracy                           0.84       555
   macro avg       0.84      0.83      0.83       555
weighted avg       0.84      0.84      0.84       555

[[170  52]
 [ 36 297]]


In [21]:
# masukkan y_pred ke dalam data_clean
data_clean['Label_pred'] = b_pred

In [22]:
data_clean

Unnamed: 0,Label,text_tokens_stemmed,Label_pred
0,positif,nyaman the best resposif,positif
1,negatif,ni gimanasii daritdi bikin akun register aja g...,negatif
2,negatif,gak masuk,negatif
3,negatif,maaf buka jobstreet register masuk ga buka,negatif
4,positif,good,positif
...,...,...,...
2766,positif,layan baik,positif
2767,positif,mantap,positif
2768,positif,ok,positif
2769,positif,bantu cari loker,positif


In [23]:
data_clean.to_csv('Dataset/jobstreet_3000_hasil_sentimen_SVM.csv', index=False)