---
title: NLP - Sentiment Analysis
jupyter:
  jupytext:
    text_representation:
      extension: .qmd
      format_name: quarto
      format_version: '1.0'
      jupytext_version: 1.16.3
  kernelspec:
    display_name: Python 3 (ipykernel)
    language: python
    name: python3
---

# Install Packages

In [1]:
import os

packages = ['Sastrawi', 'wordcloud', 'gitpython', 'xgboost', 'nltk']
for i in packages:
  try:
    __import__(i)
  except ImportError:
    print(i+' not found')
    os.system('pip install '+i)

gitpython not found



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


# Import base packages

In [2]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt  
import seaborn as sns  
import datetime as dt  
import re  
import string  
from nltk.tokenize import word_tokenize  
from nltk.corpus import stopwords  
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory  
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory  
from wordcloud import WordCloud  

# Import Segmented Packages

In [3]:
import re
import string
import json
import csv
import nltk  

from git import Repo
from os.path import exists
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Settings

In [4]:
pd.options.mode.chained_assignment = None  
seed = 0
np.random.seed(seed)  
resources = {
  'punkt': 'tokenizers/punkt',
  'stopwords': 'corpora/stopwords'
}

for k,v in resources.items():
  try:
    nltk.data.find(v)
  except LookupError:
    print('punkt not found')
    nltk.download(k)  

## Prepare Dataset (the data scrapped with external script)

In [5]:
comments = "datasets/comments_food.json"
post_comments_df = pd.read_json(comments)
post_comments_df.shape
post_comments_df.head()
post_comments_df.to_csv('comments.csv', index=False)
 
# Menghitung jumlah baris dan kolom dalam DataFrame
jumlah_ulasan, jumlah_kolom = post_comments_df.shape

In [6]:
post_comments_df.head()

Unnamed: 0,username,date,text
0,delimarachma06,2023-12-13 11:51:07,KAN KAN
1,cntikaputriiiiii,2023-11-06 14:56:28,sesuai ekspektasi 😁
2,x.y.k.a,2023-09-17 01:22:16,kenapa nggk langsung ditermos aja
3,hann_arashi,2023-11-07 07:27:25,"mantap, sesuai ekspektasi"
4,ekky3026,2023-09-17 10:27:30,sesuai ekspektasi 😭


In [7]:
post_comments_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19669 entries, 0 to 19668
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   username  19669 non-null  object        
 1   date      19669 non-null  datetime64[ns]
 2   text      19669 non-null  object        
dtypes: datetime64[ns](1), object(2)
memory usage: 461.1+ KB


### Remove data with Null

In [8]:
clean_df = post_comments_df.dropna()

### Remove data with duplicates

In [9]:
clean_df.duplicated()

0        False
1        False
2        False
3        False
4        False
         ...  
19664     True
19665     True
19666     True
19667     True
19668     True
Length: 19669, dtype: bool

In [10]:
clean_df = clean_df.drop_duplicates()

clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19229 entries, 0 to 19626
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   username  19229 non-null  object        
 1   date      19229 non-null  datetime64[ns]
 2   text      19229 non-null  object        
dtypes: datetime64[ns](1), object(2)
memory usage: 600.9+ KB


## Text Preprocessing

* cleaningText(text)
* casefoldingText(text)
* tokenizingText(text)
* filteringText(text)
* stemmingText(text)
* lemmatizingText(text)
* toSentence(list_words)

In [11]:
def cleaningText(text):
  text = re.sub(r'@[A-Za-z0-9]+', '', text) 
  text = re.sub(r'#[A-Za-z0-9]+', '', text) 
  text = re.sub(r'RT[\s]', '', text) 
  text = re.sub(r"http\S+", '', text) 
  text = re.sub(r'[0-9]+', '', text) 
  text = re.sub(r'[^\w\s]', '', text) 
  
  text = text.replace('\n', ' ') 
  text = text.translate(str.maketrans('', '', string.punctuation)) 
  text = text.strip(' ') 
  return text
 
def casefoldingText(text): 
  text = text.lower()
  return text
 
def tokenizingText(text): 
  text = word_tokenize(text)
  return text
 
def filteringText(text): 
  listStopwords = set(stopwords.words('indonesian'))
  listStopwords1 = set(stopwords.words('english'))
  listStopwords.update(listStopwords1)
  listStopwords.update(['iya','yaa','gak','nya','na','sih','ku',"di","ga","ya","gaa","loh","kah","woi","woii","woy"])
  filtered = []
  for txt in text:
      if txt not in listStopwords:
          filtered.append(txt)
  text = filtered
  return text
 
def lemmatizingText(text):
  nlp = spacy.blank("id")
  #nlp.add_pipe("lemmatizer", config = {"mode": "lookup"})
  nlp.initialize()
  words = text.split()
  lemmatized_words = [nlp(word) for word in words]
  lemmatized_text = ' '.join(lemmatized_words)
  return lemmatized_text


def stemmingText(text): 
  factory = StemmerFactory()
  stemmer = factory.create_stemmer()
  words = text.split()
  stemmed_words = [stemmer.stem(word) for word in words]
  stemmed_text = ' '.join(stemmed_words)
  return stemmed_text
 
def toSentence(list_words): 
  sentence = ' '.join(word for word in list_words)
  return sentence

In [12]:
source = "/tmp/slang/"
if not exists(source+".git"):
  Repo.clone_from("https://github.com/louisowen6/NLP_bahasa_resources", source)

slangwords = json.load(open("/tmp/slang/combined_slang_words.txt"))
def fix_slangwords(text):
    words = text.split()
    fixed_words = []
 
    for word in words:
        if word.lower() in slangwords:
            fixed_words.append(slangwords[word.lower()])
        else:
            fixed_words.append(word)
 
    fixed_text = ' '.join(fixed_words)
    return fixed_text

### Apply Everything

In [13]:
clean_df['text_clean'] = clean_df['text'].apply(cleaningText)
 
clean_df['text_casefoldingText'] = clean_df['text_clean'].apply(casefoldingText)
 
clean_df['text_slangwords'] = clean_df['text_casefoldingText'].apply(fix_slangwords)
 
clean_df['text_tokenizingText'] = clean_df['text_slangwords'].apply(tokenizingText)
 
clean_df['text_stopword'] = clean_df['text_tokenizingText'].apply(filteringText)
 
clean_df['text_akhir'] = clean_df['text_stopword'].apply(toSentence)

#clean_df['text_lemmatized'] = clean_df['text_akhir'].apply(lemmatizingText)
#clean_df['text_akhir'] = clean_df['text_joined'].apply(stemmingText)

## Labeling

In [14]:
lexicon_positive = dict()
lexicon_negative = dict()

source = "/tmp/lexicon/"
if not exists(source+".git"):
  Repo.clone_from('https://github.com/angelmetanosaa/dataset', source)
 
else:
  print("Data Exist")

lexicons = ["lexicon_positive.csv", "lexicon_negative.csv"]

for i in lexicons:
  reader = csv.reader(open(source+i), delimiter=',')
  
  for row in reader:
    if i == "lexicon_positive.csv":
      lexicon_positive[row[0]] = int(row[1])
    else:
      lexicon_negative[row[0]] = int(row[1])

Data Exist


In [15]:
def sentiment_analysis_lexicon_indonesia(text):
    score = 0
    for word in text:
        if (word in lexicon_positive):
            score = score + lexicon_positive[word]
        else:
            score = score + lexicon_negative[word]
 
    polarity=''
 
    if (score > 0):
        polarity = 'positive'
    elif (score < 0):
        polarity = 'negative'
    else:
        polarity = 'neutral'
 
    return score, polarity

In [16]:
results = clean_df['text_stopword'].apply(sentiment_analysis_lexicon_indonesia)
results = list(zip(*results))
clean_df['polarity_score'] = results[0]
clean_df['polarity'] = results[1]
print(clean_df['polarity'].value_counts())

polarity
neutral     11727
negative     4057
positive     3445
Name: count, dtype: int64


## Data Splitting and Feature Extraction with TF-IDF

In [17]:
X = clean_df['text_akhir']
y = clean_df['polarity']
 
tfidf = TfidfVectorizer(max_features=200, min_df=17, max_df=0.8 )
X_tfidf = tfidf.fit_transform(X)
 
features_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf.get_feature_names_out())
 
features_df
 
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

## Modeling

### Naive Bayes

In [18]:
naive_bayes = BernoulliNB()
 
naive_bayes.fit(X_train.toarray(), y_train)
 
y_pred_train_nb = naive_bayes.predict(X_train.toarray())
y_pred_test_nb = naive_bayes.predict(X_test.toarray())
 
accuracy_train_nb = accuracy_score(y_pred_train_nb, y_train)
accuracy_test_nb = accuracy_score(y_pred_test_nb, y_test)
 
print('Naive Bayes - accuracy_train:', accuracy_train_nb)
print('Naive Bayes - accuracy_test:', accuracy_test_nb)

Naive Bayes - accuracy_train: 0.8333224988623805
Naive Bayes - accuracy_test: 0.8341133645345814


### Random Forest (highest accuracy)

In [19]:
random_forest = RandomForestClassifier()
 
random_forest.fit(X_train.toarray(), y_train)
 
y_pred_train_rf = random_forest.predict(X_train.toarray())
y_pred_test_rf = random_forest.predict(X_test.toarray())
 
accuracy_train_rf = accuracy_score(y_pred_train_rf, y_train)
accuracy_test_rf = accuracy_score(y_pred_test_rf, y_test)
 
print('Random Forest - accuracy_train:', accuracy_train_rf)
print('Random Forest - accuracy_test:', accuracy_test_rf)

Random Forest - accuracy_train: 0.8937788467789118
Random Forest - accuracy_test: 0.8504940197607904


### Logistic Regression

In [20]:
logistic_regression = LogisticRegression()
 
logistic_regression.fit(X_train.toarray(), y_train)
 
y_pred_train_lr = logistic_regression.predict(X_train.toarray())
y_pred_test_lr = logistic_regression.predict(X_test.toarray())
 
accuracy_train_lr = accuracy_score(y_pred_train_lr, y_train)
 
accuracy_test_lr = accuracy_score(y_pred_test_lr, y_test)
 
print('Logistic Regression - accuracy_train:', accuracy_train_lr)
print('Logistic Regression - accuracy_test:', accuracy_test_lr)

Logistic Regression - accuracy_train: 0.8560748878632256
Logistic Regression - accuracy_test: 0.8569942797711908


### Decision Tree

In [21]:
decision_tree = DecisionTreeClassifier()
 
decision_tree.fit(X_train.toarray(), y_train)
 
y_pred_train_dt = decision_tree.predict(X_train.toarray())
y_pred_test_dt = decision_tree.predict(X_test.toarray())
 
accuracy_train_dt = accuracy_score(y_pred_train_dt, y_train)
accuracy_test_dt = accuracy_score(y_pred_test_dt, y_test)
 
print('Decision Tree - accuracy_train:', accuracy_train_dt)
print('Decision Tree - accuracy_test:', accuracy_test_dt)

Decision Tree - accuracy_train: 0.8937788467789118
Decision Tree - accuracy_test: 0.8380135205408217


### XGBoost

In [22]:
xgb = XGBClassifier()
 
y_train_label, y_train_code = np.unique(y_train, return_inverse=True)
y_test_label, y_test_code = np.unique(y_test, return_inverse=True)
xgb.fit(X_train.toarray(), y_train_code)
 
y_pred_train_dt = xgb.predict(X_train.toarray())
y_pred_test_dt = xgb.predict(X_test.toarray())
 
accuracy_train_dt = accuracy_score(y_pred_train_dt, y_train_code)
accuracy_test_dt = accuracy_score(y_pred_test_dt, y_test_code)
 
print('XGBoost - accuracy_train:', accuracy_train_dt)
print('XGBoost - accuracy_test:', accuracy_test_dt)

XGBoost - accuracy_train: 0.8658909185464474
XGBoost - accuracy_test: 0.8551742069682787


### predict
