In [None]:
import re
import numpy as np
import pandas as pd
import string
import nltk
# Fetch the NLTK resources at notebook start-up. The 'stopwords' corpus is
# loaded later by nltk.corpus.stopwords.words('english').
# NOTE(review): no cell visible in this notebook calls a punkt-based tokenizer
# (tokenizing is done with re.split) — confirm whether 'punkt' is still needed.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Mount Google Drive at /content/drive (Colab only).
# NOTE(review): the dataset is later read through the relative path
# 'drive/My Drive/...', which presumably relies on the working directory
# being /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the movie-review CSV from the mounted Drive and keep only the first
# 10,000 rows for the rest of the notebook.
reviews_csv = 'drive/My Drive/TUGAS AKHIR/DATASET/dataset movie review.csv'
data = pd.read_csv(reviews_csv).iloc[:10000]
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Class balance: number of reviews per sentiment label.
sentiment_labels = data['sentiment']
sentiment_labels.value_counts()

positive    5028
negative    4972
Name: sentiment, dtype: int64

# Preprocessing

## Drop NaN

In [None]:
# Per-column count of missing values before any rows are dropped.
missing_mask = data.isna()
missing_mask.sum()

review       0
sentiment    0
dtype: int64

In [None]:
# Remove rows containing missing values and renumber the index from zero,
# then re-check that no nulls remain.
data = data.dropna().reset_index(drop=True)
data.isnull().sum()

review       0
sentiment    0
dtype: int64

## Drop Duplicates

In [None]:
# Number of rows that exactly repeat an earlier row.
duplicate_mask = data.duplicated()
duplicate_mask.sum()


17

In [None]:
# Keep the first occurrence of every row, renumber the index from zero,
# then confirm that no duplicates are left.
data = data.drop_duplicates().reset_index(drop=True)
data.duplicated().sum()


0

## Cleaning

In [None]:
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

# Frozen copy of the stop-word list so each membership test is O(1) instead of
# a linear scan of the list for every token.
_STOPWORD_SET = frozenset(stopwords)

# The reviews contain HTML line breaks such as "<br />"; without removing them
# first, stripping punctuation leaves a spurious "br" token behind.
_BR_TAG = re.compile(r'<br\s*/?>', flags=re.IGNORECASE)


def clean_text(text):
    """Normalise one review into a space-separated string of stemmed tokens.

    Steps, in order:
      1. replace HTML ``<br>`` / ``<br />`` tags with a space;
      2. lower-case the text so capitalised stop words ("The", "A", "I")
         match the stop-word list;
      3. delete every character in ``string.punctuation``;
      4. split on runs of non-word characters;
      5. drop empty tokens and stop words, then Porter-stem what is left.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ("" if nothing remains).
    """
    text = _BR_TAG.sub(' ', text)
    text = text.lower()
    text = "".join([c for c in text if c not in string.punctuation])
    # Raw string: '\W' in a normal string literal is an invalid escape sequence.
    tokens = re.split(r'\W+', text)  # split into words
    # NOTE(review): punctuation is removed before the stop-word test, so a
    # contraction like "you'll" becomes "youll" and is kept unless that exact
    # form is in the stop-word list — confirm whether this is intended.
    return " ".join(
        ps.stem(word)
        for word in tokens
        if word and word not in _STOPWORD_SET
    )


In [None]:
# Add a column holding the cleaned version of every review.
data['review_clean'] = data['review'].apply(clean_text)
data.head()

Unnamed: 0,review,sentiment,review_clean
0,One of the other reviewers has mentioned that ...,positive,one review mention watch 1 Oz episod youll hoo...
1,A wonderful little production. <br /><br />The...,positive,A wonder littl product br br the film techniqu...
2,I thought this was a wonderful way to spend ti...,positive,I thought wonder way spend time hot summer wee...
3,Basically there's a family where a little boy ...,negative,basic there famili littl boy jake think there ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visual stun film...


# N-gram and TF-IDF Features

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Count unigram and bigram occurrences in the cleaned reviews.
ngram_bounds = (1, 2)
cv = CountVectorizer(ngram_range=ngram_bounds)
X = cv.fit_transform(data['review_clean'])
print(X.shape)

(9983, 831861)


In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
# Learn IDF weights from the n-gram counts, then re-weight those same counts.
ti = TfidfTransformer().fit(X)
X_tf = ti.transform(X)
print(X_tf.shape)

(9983, 831861)


# Split Data

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for evaluation. random_state pins the shuffle so
# the split — and every metric computed from it — is identical on each run.
# NOTE(review): X_tf comes from a vectorizer and TF-IDF transformer that were
# fitted on the full dataset before this split, so the held-out rows already
# influenced the features; consider splitting first and fitting on the
# training portion only.
X_Train, X_Test, Y_Train, Y_Test = train_test_split(
    X_tf, data['sentiment'], test_size=0.25, random_state=42
)

# Model

In [None]:
from datetime import datetime
from sklearn.svm import SVC
# Train an SVC with default parameters and predict the held-out set; the
# reported wall-clock time covers both fitting and prediction.
start = datetime.now()
model = SVC().fit(X_Train, Y_Train)
Y_pred = model.predict(X_Test)
end = datetime.now()
elapsed = end - start
print('Waktu Running: {}'.format(elapsed))

Waktu Running: 0:02:24.266165


In [None]:
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# Evaluate the predictions on the held-out set: accuracy as a percentage,
# the confusion matrix, and the per-class report.
accuracy_pct = accuracy_score(Y_Test, Y_pred) * 100
conf_matrix = confusion_matrix(Y_Test, Y_pred)
class_report = classification_report(Y_Test, Y_pred)

print("Hasil Akurasi SVM:", accuracy_pct, "%")
print("Confusion Matrix")
print(conf_matrix)
print("Classification report")
print(class_report)

Hasil Akurasi SVM: 87.4198717948718 %
Confusion Matrix
[[1053  196]
 [ 118 1129]]
Classification report
              precision    recall  f1-score   support

    negative       0.90      0.84      0.87      1249
    positive       0.85      0.91      0.88      1247

    accuracy                           0.87      2496
   macro avg       0.88      0.87      0.87      2496
weighted avg       0.88      0.87      0.87      2496

