<a href="https://colab.research.google.com/github/raihankr/ml-sentiment-analysis/blob/main/sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dicoding - Projek Analisis Sentimen
Dibuat oleh: Raihan Khairul Rochman

**Objektif:**  
Menganalisis sentimen pada ulasan pengguna terhadap aplikasi **Pinterest** di Play Store

# Import Library

In [1]:
!pip install google_play_scraper
!pip install bahasa

import re
import csv
import json
import nltk
import string
import requests
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
from bahasa.stemmer import Stemmer
from google_play_scraper import Sort, reviews
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, Dropout, GRU
from keras.callbacks import Callback, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

Collecting google_play_scraper
  Downloading google_play_scraper-1.2.7-py3-none-any.whl.metadata (50 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/50.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.2/50.2 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading google_play_scraper-1.2.7-py3-none-any.whl (28 kB)
Installing collected packages: google_play_scraper
Successfully installed google_play_scraper-1.2.7
Collecting bahasa
  Downloading bahasa-1.0.1.tar.gz (100 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.4/100.4 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting six==1.10.0 (from bahasa)
  Downloading six-1.10.0-py2.py3-none-any.whl.metadata (1.3 kB)
Downloading six-1.10.0-py2.py3-none-any.whl (10 kB)
Building wheels for collected packages: bahasa
  Building wheel for bahasa (setup.py) ... [?25l

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Data Scraping

In [2]:
# Request up to 18,000 reviews of the 'com.pinterest' app with lang='id' and
# country='id', sorted by relevance. reviews() returns the review records plus a
# second value (`token`) that is not used in the cells visible here.
scraped_data, token = reviews(
    'com.pinterest',
    lang='id',
    country='id',
    sort=Sort.MOST_RELEVANT,
    count=18000,
)

In [3]:
# Persist only the review text ('content' field) to a one-column CSV with a 'Review' header.
with open('reviews.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Review'])
    # One single-cell row per scraped review, written in scrape order.
    writer.writerows([entry['content']] for entry in scraped_data)

# Load & Clean Dataset

In [4]:
df = pd.read_csv('reviews.csv')

In [5]:
# Drop rows with missing values, then exact duplicate rows. The index is not reset,
# so the surviving rows keep their original labels (the index has gaps afterwards).
df = df.dropna().drop_duplicates()

In [6]:
df.head()

Unnamed: 0,Review
0,Dev tolong diperbaiki saya sudah menemukan ban...
1,Kendala di pencarian. Saat saya mencari dengan...
2,Awalnya bagus cuma sekarang buruk banget karna...
3,"bagus, tapi belakangan ini suka loading lama b..."
4,Untuk aplikasinya sudah baik dan cukup menarik...


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17980 entries, 0 to 17999
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  17980 non-null  object
dtypes: object(1)
memory usage: 280.9+ KB


**Deskripsi Data**:

---


Data diambil dari sekitar 18.000 ulasan pengguna paling relevan untuk aplikasi *Pinterest* di *platform* Google Play Store. Setelah baris kosong dan duplikat dihapus, tersisa 17.980 ulasan.

# Text Preprocessing

In [8]:
# Additional stopwords for Indonesian, read as a headerless single-column file
# and flattened into a plain Python list.
stopwords1 = pd.read_csv(
    'https://raw.githubusercontent.com/ramaprakoso/analisis-sentimen/master/kamus/stopword.txt',
    header=None,
    names=['word'],
)['word'].to_list()

In [9]:
# Download the slang-word mapping (JSON object) consulted by fixSlangWords.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error (404, 5xx, ...) into an explicit exception
# instead of a confusing JSON decode failure on the error page.
response = requests.get(
    'https://raw.githubusercontent.com/louisowen6/NLP_bahasa_resources/master/combined_slang_words.txt',
    timeout=30,
)
response.raise_for_status()
slangwords = json.loads(response.text)

In [10]:
def cleanText(text):
    """Strip noise from a raw review string.

    Steps, in order:
      1. Remove @mentions, #hashtags, http(s) URLs, digits and every character
         that is neither a word character nor whitespace.
      2. Replace newlines with spaces.
      3. Remove remaining ASCII punctuation (this catches ``_``, which the regex
         keeps because it counts as a word character).
      4. Strip leading/trailing spaces.

    Args:
        text: the raw review text.

    Returns:
        The cleaned string (case is left untouched).
    """
    result = re.sub(r"(([@#]|https?:\/\/)\S+|\d|[^\w\s])", "", text)
    # str.replace returns a new string; the original code discarded it, so
    # newlines were never actually replaced.
    result = result.replace("\n", " ")
    result = result.translate(str.maketrans("", "", string.punctuation))
    result = result.strip(" ")
    return result

def casefoldingText(text):
    """Return ``text`` converted to lowercase."""
    return text.lower()

def fixSlangWords(words):
    """Swap every token that has an entry in ``slangwords`` for its mapped value.

    Args:
        words: iterable of tokens.

    Returns:
        A new list of the same length; tokens without an entry are kept as-is.
    """
    return [slangwords[token] if token in slangwords else token for token in words]

# Lazily-built combined stopword set. Re-running this cell resets it to None, so a
# changed `stopwords1` is picked up on the next call.
_stopwords_cache = None


def _getStopwords():
    """Return the combined stopword set, building it on first use only."""
    global _stopwords_cache
    if _stopwords_cache is None:
        combined = set(stopwords.words('indonesian'))
        combined.update(stopwords1)
        combined.update(stopwords.words('english'))
        _stopwords_cache = frozenset(combined)
    return _stopwords_cache


def filterWords(words):
    """Remove stopwords from a token list.

    The stopword set is the union of NLTK's Indonesian list, the extra list in
    ``stopwords1`` and NLTK's English list. It used to be rebuilt (including both
    NLTK corpus lookups) on every call, i.e. once per review; it is now built once
    and reused.

    Args:
        words: iterable of tokens.

    Returns:
        A new list with the tokens that are not stopwords, in their original order.
    """
    stopwords_list = _getStopwords()
    return [word for word in words if word not in stopwords_list]

# Shared Stemmer instance (imported from bahasa.stemmer); its .stem method is applied
# to every review sentence in the preprocessing pipeline.
stemmer = Stemmer()

def toSentence(words):
    """Join an iterable of tokens into one string separated by single spaces."""
    return ' '.join(words)

In [11]:
# Preprocessing pipeline. Every stage is stored in its own column so intermediate
# results can be inspected: clean + lowercase -> tokenize -> replace slang ->
# stem (on the re-joined sentence) -> remove stopwords -> re-join into text.
df['Clean'] = df['Review'].apply(cleanText).apply(casefoldingText)
df['Tokenized'] = df['Clean'].apply(word_tokenize)
df['Formalized'] = df['Tokenized'].apply(fixSlangWords)
df['Stemmed'] = (
    df['Formalized']
    .apply(toSentence)
    .apply(stemmer.stem)
    .apply(word_tokenize)
)
df['Filtered'] = df['Stemmed'].apply(filterWords)
df['Final'] = df['Filtered'].apply(toSentence)

# Data Labeling

In [12]:
def _loadLexicon(url):
    """Read a tab-separated lexicon and return its 'weight' column as a dict keyed by the first column."""
    return pd.read_csv(url, sep='\t', index_col=0)['weight'].to_dict()


lexicon_positive = _loadLexicon('https://raw.githubusercontent.com/fajri91/InSet/master/positive.tsv')
lexicon_negative = _loadLexicon('https://raw.githubusercontent.com/fajri91/InSet/master/negative.tsv')

In [13]:
def sentiment_analysis(words):
    """Score a token list against both lexicons and bucket the score three ways.

    Args:
        words: iterable of tokens.

    Returns:
        A tuple ``(score, polarity, polarity4, polarity5)`` where ``score`` is the
        sum of the positive- and negative-lexicon weights of every token, and the
        three labels bucket that score into 3, 4 and 5 classes respectively.
    """
    score = 0
    for token in words:
        # A token may appear in both lexicons; both weights are added.
        for lexicon in (lexicon_positive, lexicon_negative):
            if token in lexicon:
                score += lexicon[token]

    # 3 classes: scores strictly between -3 and 3 are neutral.
    polarity = 'positive' if score >= 3 else ('negative' if score <= -3 else 'neutral')

    # 4 classes: there is no neutral bucket, so a score of exactly 0 is 'negative'.
    if score > 0:
        polarity4 = 'very_positive' if score > 4 else 'positive'
    else:
        polarity4 = 'negative' if score >= -4 else 'very_negative'

    # 5 classes: neutral covers -1..1, the outer buckets start beyond +/-5.
    if score > 1:
        polarity5 = 'very_positive' if score > 5 else 'positive'
    elif score >= -1:
        polarity5 = 'neutral'
    else:
        polarity5 = 'negative' if score >= -5 else 'very_negative'

    return score, polarity, polarity4, polarity5

In [14]:
# Label every review, then transpose the per-review result tuples into four
# parallel sequences: scores, 3-class, 4-class and 5-class labels.
labeled = list(zip(*df['Filtered'].apply(sentiment_analysis)))
df['Score'], df['Polarity'], df['Polarity4'], df['Polarity5'] = labeled

In [15]:
df[['Score']].describe()

Unnamed: 0,Score
count,17980.0
mean,-2.639989
std,7.009154
min,-60.0
25%,-6.0
50%,-2.0
75%,2.0
max,49.0


In [16]:
df['Polarity'].value_counts()

Unnamed: 0_level_0,count
Polarity,Unnamed: 1_level_1
negative,8684
neutral,5659
positive,3637


In [17]:
df['Polarity4'].value_counts()

Unnamed: 0_level_0,count
Polarity4,Unnamed: 1_level_1
very_negative,6420
negative,6003
positive,3299
very_positive,2258


In [18]:
df['Polarity5'].value_counts()

Unnamed: 0_level_0,count
Polarity5,Unnamed: 1_level_1
very_negative,5405
negative,4495
neutral,3572
positive,2785
very_positive,1723


# Feature Selection

In [19]:
# Features are the fully preprocessed texts; y / y4 / y5 are the 3-, 4- and 5-class labels.
X, y, y4, y5 = df['Final'], df['Polarity'], df['Polarity4'], df['Polarity5']

# Tokenizing

In [20]:
# Fit the word index on the preprocessed texts (vocabulary capped with num_words=5000),
# then turn every text into an integer sequence padded/truncated to length 100.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X)
X_token = pad_sequences(tokenizer.texts_to_sequences(X), maxlen=100)

# Oversampling

In [21]:
# Balance each labeling scheme separately: after resampling every class has as many
# rows as the largest class of that scheme.
# NOTE(review): resampling is done on the full dataset, before train_test_split (which
# runs in a later cell). Rows duplicated by the oversampler can therefore end up in
# both the training and the test partition, which presumably inflates the reported
# test accuracy. Consider splitting first and oversampling the training part only.
ros = RandomOverSampler(random_state=0)
X_over, y_over = ros.fit_resample(X_token, y)
X4_over, y4_over = ros.fit_resample(X_token, y4)
X5_over, y5_over = ros.fit_resample(X_token, y5)

In [22]:
y_over.value_counts()

Unnamed: 0_level_0,count
Polarity,Unnamed: 1_level_1
negative,8684
neutral,8684
positive,8684


In [23]:
y4_over.value_counts()

Unnamed: 0_level_0,count
Polarity4,Unnamed: 1_level_1
very_negative,6420
negative,6420
positive,6420
very_positive,6420


In [24]:
y5_over.value_counts()

Unnamed: 0_level_0,count
Polarity5,Unnamed: 1_level_1
very_negative,5405
negative,5405
neutral,5405
very_positive,5405
positive,5405


In [25]:
# One-hot encode the resampled labels as integer arrays for categorical_crossentropy.
# The column order of get_dummies defines the class index of each label; presumably it
# is the sorted label order (e.g. negative, neutral, positive) — predict() relies on
# that order, verify before changing labels.
# NOTE(review): `columns=` is documented for DataFrame input; the y*_over objects look
# like Series here, so the argument presumably has no effect — confirm.
y_over_encoded = pd.get_dummies(y_over, columns=['Polarity']).values.astype(int)
y4_over_encoded = pd.get_dummies(y4_over, columns=['Polarity4']).values.astype(int)
y5_over_encoded = pd.get_dummies(y5_over, columns=['Polarity5']).values.astype(int)

# Data Splitting

In [37]:
# Shuffle and split each oversampled dataset with a fixed seed. The training share
# differs per scheme: 90% (3 classes), 85% (4 classes) and 80% (5 classes).
X_train_over, X_test_over, y_train_over, y_test_over = train_test_split(X_over, y_over_encoded, random_state=0, train_size=.9, shuffle=True)
X4_train_over, X4_test_over, y4_train_over, y4_test_over = train_test_split(X4_over, y4_over_encoded, random_state=0, train_size=.85, shuffle=True)
X5_train_over, X5_test_over, y5_train_over, y5_test_over = train_test_split(X5_over, y5_over_encoded, random_state=0, train_size=.8, shuffle=True)

# Modelling

In [38]:
class ReachAccuracy(Callback):
    """Stop training once validation accuracy has exceeded a target often enough.

    An epoch "qualifies" when its ``val_accuracy`` is strictly greater than
    ``target``. Training is stopped on the ``patience + 1``-th qualifying epoch of a
    run; qualifying epochs do not have to be consecutive because the counter is not
    reset when the accuracy drops below the target again.

    Args:
        target: validation accuracy that has to be exceeded.
        patience: number of qualifying epochs to let pass before stopping.
        restore_best_weights: when True, the weights of the best qualifying epoch
            are loaded back into the model at the moment training is stopped.
    """

    def __init__(self, target: float, patience: int = 0, restore_best_weights: bool = False):
        super().__init__()
        self.target = target
        self.patience = patience
        self.restore_best_weights = restore_best_weights
        self.wait = 0
        self.best = 0
        self.best_weights = None

    def on_train_begin(self, logs=None):
        # Reset per-run state so an instance reused for another fit() call does not
        # inherit the counter or the weights of a previous run.
        self.wait = 0
        self.best = 0
        self.best_weights = None

    def on_epoch_end(self, epoch, logs=None):
        current = (logs or {}).get('val_accuracy')
        if current is None:
            # Without validation data (or without the accuracy metric) there is
            # nothing to compare; warn instead of crashing inside np.greater.
            import warnings
            warnings.warn(
                "ReachAccuracy requires 'val_accuracy' in the epoch logs; skipping this epoch.",
                RuntimeWarning,
            )
            return

        if not np.greater(current, self.target):
            return

        if np.greater(current, self.best):
            self.best = current
            self.best_weights = self.model.get_weights()

        if self.wait >= self.patience:
            self.model.stop_training = True
            if self.restore_best_weights and self.best_weights is not None:
                self.model.set_weights(self.best_weights)
            print(f'Stopped training: Reached target accuracy ({self.target}): {current}')

        self.wait += 1

## Model 1
* Algoritma: LSTM
* Klasifikasi: 3 Kelas
* Data: 26052 row (hasil oversampling)
* Data Splitting: 90% Training : 10% Testing

In [39]:
# Model 1: embedding (input dim 5000, matching the tokenizer's num_words) -> LSTM(64)
# -> 3-way softmax.
lstm = Sequential([
    Embedding(5000, 15),
    LSTM(64, activation='tanh', recurrent_activation='sigmoid',
                recurrent_initializer='orthogonal', use_bias=True),
    Dense(3, activation='softmax'),
])

lstm.compile(optimizer=Adam(learning_rate=.0001), loss='categorical_crossentropy', metrics=['accuracy'])
# Training stops early when val_accuracy stalls for 5 epochs or once it has exceeded
# 0.92 often enough (see ReachAccuracy).
# NOTE(review): the test split doubles as validation_data, and both callbacks select
# weights based on it, so the later evaluate() on the same split is not an
# independent estimate — consider a separate validation split.
lstm_hist = lstm.fit(X_train_over, y_train_over, epochs=50, batch_size=32, verbose=2, validation_data=(X_test_over, y_test_over),
                     callbacks=[
                        EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True),
                        ReachAccuracy(0.92, patience=3, restore_best_weights=True)
                    ])

Epoch 1/50
733/733 - 9s - 12ms/step - accuracy: 0.5190 - loss: 0.9937 - val_accuracy: 0.6450 - val_loss: 0.8058
Epoch 2/50
733/733 - 4s - 6ms/step - accuracy: 0.7037 - loss: 0.6883 - val_accuracy: 0.7425 - val_loss: 0.6179
Epoch 3/50
733/733 - 5s - 7ms/step - accuracy: 0.7940 - loss: 0.5197 - val_accuracy: 0.8054 - val_loss: 0.4779
Epoch 4/50
733/733 - 5s - 7ms/step - accuracy: 0.8485 - loss: 0.4053 - val_accuracy: 0.8507 - val_loss: 0.3945
Epoch 5/50
733/733 - 10s - 13ms/step - accuracy: 0.8822 - loss: 0.3346 - val_accuracy: 0.8676 - val_loss: 0.3498
Epoch 6/50
733/733 - 5s - 6ms/step - accuracy: 0.9044 - loss: 0.2805 - val_accuracy: 0.8872 - val_loss: 0.3024
Epoch 7/50
733/733 - 5s - 6ms/step - accuracy: 0.9203 - loss: 0.2423 - val_accuracy: 0.8906 - val_loss: 0.2938
Epoch 8/50
733/733 - 5s - 7ms/step - accuracy: 0.9289 - loss: 0.2134 - val_accuracy: 0.9045 - val_loss: 0.2612
Epoch 9/50
733/733 - 10s - 13ms/step - accuracy: 0.9412 - loss: 0.1876 - val_accuracy: 0.9144 - val_loss: 0.2

In [40]:
# Loss and accuracy of Model 1 (LSTM) on its training split
lstm.evaluate(X_train_over, y_train_over)

[1m733/733[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9675 - loss: 0.1210


[0.1207486167550087, 0.9685234427452087]

In [41]:
# Loss and accuracy of Model 1 (LSTM) on its test split
lstm.evaluate(X_test_over, y_test_over)

[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9334 - loss: 0.1956


[0.19580575823783875, 0.9355334043502808]

## Model 2
* Algoritma: LSTM dengan Dropout Layer
* Klasifikasi: 4 kelas
* Data: 25680 row (hasil oversampling)
* Data Splitting: 85% training : 15% Testing

In [42]:
# Model 2: embedding -> LSTM(64) -> dropout (rate 0.3) -> 4-way softmax.
lstm_2 = Sequential([
    Embedding(input_dim=5000, output_dim=15),
    LSTM(64),
    Dropout(0.3),
    Dense(4, activation='softmax')
])

lstm_2.compile(optimizer=Adam(learning_rate=.0001), loss='categorical_crossentropy', metrics=['accuracy'])
# NOTE(review): the test split doubles as validation_data, and both callbacks select
# weights based on it, so the later evaluate() on the same split is not an
# independent estimate — consider a separate validation split.
lstm_2_hist = lstm_2.fit(X4_train_over, y4_train_over, epochs=50, batch_size=32, verbose=2, validation_data=(X4_test_over, y4_test_over),
                           callbacks=[
                                EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True),
                                ReachAccuracy(0.92, patience=3, restore_best_weights=True)
                            ])

Epoch 1/50
683/683 - 6s - 9ms/step - accuracy: 0.3187 - loss: 1.3631 - val_accuracy: 0.3692 - val_loss: 1.3228
Epoch 2/50
683/683 - 5s - 8ms/step - accuracy: 0.4914 - loss: 1.1500 - val_accuracy: 0.5636 - val_loss: 1.0009
Epoch 3/50
683/683 - 10s - 15ms/step - accuracy: 0.6038 - loss: 0.9174 - val_accuracy: 0.6168 - val_loss: 0.8557
Epoch 4/50
683/683 - 9s - 14ms/step - accuracy: 0.6621 - loss: 0.7926 - val_accuracy: 0.6846 - val_loss: 0.7571
Epoch 5/50
683/683 - 5s - 8ms/step - accuracy: 0.7077 - loss: 0.6970 - val_accuracy: 0.7222 - val_loss: 0.6734
Epoch 6/50
683/683 - 4s - 6ms/step - accuracy: 0.7546 - loss: 0.6142 - val_accuracy: 0.7505 - val_loss: 0.6164
Epoch 7/50
683/683 - 5s - 8ms/step - accuracy: 0.7901 - loss: 0.5458 - val_accuracy: 0.7734 - val_loss: 0.5487
Epoch 8/50
683/683 - 5s - 8ms/step - accuracy: 0.8154 - loss: 0.4873 - val_accuracy: 0.8030 - val_loss: 0.4936
Epoch 9/50
683/683 - 5s - 7ms/step - accuracy: 0.8395 - loss: 0.4335 - val_accuracy: 0.8183 - val_loss: 0.460

In [43]:
# Loss and accuracy of Model 2 (LSTM with dropout, not a simple RNN) on its training split
lstm_2.evaluate(X4_train_over, y4_train_over)

[1m683/683[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.9859 - loss: 0.0760


[0.07652278989553452, 0.9860729575157166]

In [44]:
# Loss and accuracy of Model 2 (LSTM with dropout, not a simple RNN) on its test split
lstm_2.evaluate(X4_test_over, y4_test_over)

[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9279 - loss: 0.2699


[0.2646080553531647, 0.9273104667663574]

## Model 3
* Algoritma: GRU
* Klasifikasi: 5 kelas
* Data: 27025 row (hasil oversampling)
* Data Splitting: 80% Training : 20% Testing

In [62]:
# Model 3: embedding -> GRU(32) -> dropout -> Dense(16, relu) -> dropout -> 5-way softmax.
gru = Sequential([
    Embedding(5000, 15),
    GRU(32),
    Dropout(0.3),
    Dense(16, activation='relu'),
    Dropout(0.3),
    Dense(5, activation='softmax')
])

gru.compile(optimizer=Adam(learning_rate=.0001), loss='categorical_crossentropy', metrics=['accuracy'])
# NOTE(review): the test split doubles as validation_data, and both callbacks select
# weights based on it, so the later evaluate() on the same split is not an
# independent estimate — consider a separate validation split.
gru_hist = gru.fit(X5_train_over, y5_train_over, epochs=50, batch_size=32, verbose=2, validation_data=(X5_test_over, y5_test_over),
                callbacks = [
                    EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True),
                    ReachAccuracy(0.92, patience=3, restore_best_weights=True),
                    # Disabled experiment: ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)
                ])

Epoch 1/50
676/676 - 6s - 10ms/step - accuracy: 0.2439 - loss: 1.6064 - val_accuracy: 0.3066 - val_loss: 1.5977
Epoch 2/50
676/676 - 5s - 8ms/step - accuracy: 0.3439 - loss: 1.4862 - val_accuracy: 0.3928 - val_loss: 1.2782
Epoch 3/50
676/676 - 11s - 16ms/step - accuracy: 0.4357 - loss: 1.2036 - val_accuracy: 0.4801 - val_loss: 1.0594
Epoch 4/50
676/676 - 8s - 11ms/step - accuracy: 0.5148 - loss: 1.0432 - val_accuracy: 0.5623 - val_loss: 0.9222
Epoch 5/50
676/676 - 8s - 13ms/step - accuracy: 0.5933 - loss: 0.9218 - val_accuracy: 0.7106 - val_loss: 0.7799
Epoch 6/50
676/676 - 5s - 7ms/step - accuracy: 0.6512 - loss: 0.8102 - val_accuracy: 0.8004 - val_loss: 0.6580
Epoch 7/50
676/676 - 5s - 7ms/step - accuracy: 0.7008 - loss: 0.7140 - val_accuracy: 0.8250 - val_loss: 0.5643
Epoch 8/50
676/676 - 6s - 9ms/step - accuracy: 0.7319 - loss: 0.6465 - val_accuracy: 0.8581 - val_loss: 0.5068
Epoch 9/50
676/676 - 10s - 15ms/step - accuracy: 0.7605 - loss: 0.5916 - val_accuracy: 0.8699 - val_loss: 0

In [63]:
# Loss and accuracy of Model 3 (GRU) on its training split
gru.evaluate(X5_train_over, y5_train_over)

[1m676/676[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.9859 - loss: 0.0916


[0.09277333319187164, 0.9848751425743103]

In [64]:
# Loss and accuracy of Model 3 (GRU) on its test split
gru.evaluate(X5_test_over, y5_test_over)

[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9208 - loss: 0.2817


[0.31516969203948975, 0.9197039604187012]

# Inference

In [65]:
def predict(text):
    """Classify one raw review with the 3-class LSTM model.

    The text goes through the same preprocessing as the training data (clean,
    lowercase, tokenize, replace slang, stem, remove stopwords) before it is
    converted to a padded integer sequence; feeding raw text would leave most words
    outside the fitted vocabulary.

    Args:
        text: raw review text.

    Returns:
        'Negative', 'Neutral' or 'Positive' — the class with the highest predicted
        probability.
    """
    # Index order must match the one-hot column order used for training
    # (presumably sorted labels: negative, neutral, positive — verify if labels change).
    sentiment = ['Negative', 'Neutral', 'Positive']

    tokens = word_tokenize(casefoldingText(cleanText(text)))
    tokens = fixSlangWords(tokens)
    tokens = word_tokenize(stemmer.stem(toSentence(tokens)))
    prepared = toSentence(filterWords(tokens))

    sequence = tokenizer.texts_to_sequences([prepared])
    test = pad_sequences(sequence, maxlen=100)

    # argmax on the raw probabilities: rounding first maps every probability below
    # 0.5 to 0, so an uncertain prediction always collapsed to index 0 ('Negative').
    probabilities = lstm.predict(test)
    return sentiment[int(np.argmax(probabilities, axis=1)[0])]

In [66]:
predict('Aplikasi ini tampilannya buruk sekali. Jangan download!')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step


'Negative'

In [67]:
predict('Aplikasinya biasa saja')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step


'Neutral'

In [68]:
predict('Aplikasinya sangat mantap. Membantu pekerjaan')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step


'Positive'