## Library Preparation

In [1]:
import re
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.classify import SklearnClassifier

from wordcloud import WordCloud,STOPWORDS
import matplotlib.pyplot as plt

from subprocess import check_output

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
!pip install sastrawi



In [3]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [11]:
#augment the stopwords with nonstandard twitter words
stopwords_set = set(stopwords.words("indonesian"))
stopwords_aug = {"ya","yak","iya","yg","ga","gak","gk","udh","sdh","udah","dah","nih","ini","deh","sih","dong","donk",
                 "sm","knp","utk","yaa","tdk","gini","gitu","bgt","gt","nya","kalo","cb","jg","jgn","gw","ge",
                 "sy","min","mas","mba","mbak","pak","kak","trus","trs","bs","bisa","aja","saja","no",
                 "w","g","gua","gue","emang","emg","wkwk","dr","kau","dg","gimana","apapun","apa",
                 "klo","yah","banget","pake","terus","krn","jadi","jd","mu","ku","si","hehe",
                 "tp","pa","lu","lo","lw","tw","tau","karna","kayak","ky","lg","untuk","tuk","dg","dgn"}
stopwords_all = stopwords_set.union(stopwords_aug)

In [12]:
def clean_text(text):
    filtered_tokens = ""
    for token in text:
      if re.search('[a-zA-Z\s]', token):
        filtered_tokens = filtered_tokens + token.lower()
        
    return filtered_tokens

def tokenize_clean(text):
    
    #tokenisasi
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word
        in nltk.word_tokenize(sent)]
    
    #clean token from numeric and other character like puntuation
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
            
    return filtered_tokens

def remove_stopwords(tokenized_text):
    
    cleaned_token = []
    for token in tokenized_text:
        if token not in stopwords_all:
            cleaned_token.append(token)
            
    return cleaned_token

def stemming_text(tokenized_text):
    
    #stem using Sastrawi StemmerFactory 
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()

    stems = []
    for token in tokenized_text:
        stems.append(stemmer.stem(token))

    return stems

def text_preprocessing(text):
    #tokenize, remove non alpha numeric and make lower
    text_tmp = re.sub('[^a-zA-Z]', ' ', text)
    text_tmp = text_tmp.lower()
    text_tmp = text_tmp.split()
    
    #remove stopwords
    prep02 = remove_stopwords(text_tmp)
    
    #stemmingnya lambat banget
    #prep03 = stemming_text(prep01)

    prep03 = ' '.join(prep02)
            
    return prep03

## Download Dataset

In [7]:
!mkdir -p dataset
!wget https://raw.githubusercontent.com/project303/dataset/master/Twitter.csv -P dataset

--2021-07-27 22:34:44--  https://raw.githubusercontent.com/project303/dataset/master/Twitter.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 413468 (404K) [text/plain]
Saving to: ‘dataset/Twitter.csv’


2021-07-27 22:34:44 (10.7 MB/s) - ‘dataset/Twitter.csv’ saved [413468/413468]



In [5]:
dataset = pd.read_csv('dataset/Twitter.csv', sep='|')

len(dataset)

4294

In [6]:
dataset.head()

Unnamed: 0,id,text,sentiment
0,715600523756314626,@IndosatCare iya nomernya masih itu. Mksh.,Positif
1,715599778948599808,@IndosatCare udah di DM yah,Positif
2,704874550631145472,@Telkomsel aku pakai loop kak :),Positif
3,704875356910563328,@rikawidjaya04 Terima kasih juga Kak Rika atas...,Positif
4,713428238458953732,@ndusell saya simpati tan,Positif


## Data Preprocessing

In [7]:
dataset = dataset[dataset.sentiment != "Netral"]
tweets = np.array(dataset['text'])
sentiments = np.array(dataset['sentiment'])

train_data, test_data, train_label, test_label = train_test_split(tweets, sentiments, test_size=0.2, random_state=4)

In [8]:
train_label[:10]

array(['Positif', 'Positif', 'Negatif', 'Positif', 'Negatif', 'Negatif',
       'Positif', 'Negatif', 'Positif', 'Positif'], dtype=object)

In [9]:
len(train_data)

2092

In [13]:
train_data_clean = []

for tweet_text in train_data:
  train_data_clean.append(text_preprocessing(tweet_text))

test_data_clean = []
for tweet_text in test_data:
  test_data_clean.append(text_preprocessing(tweet_text))

In [14]:
train_data_clean[:10]

['indosatcare oke terima kasih bantuannya',
 'ryamizard kemenhan bentuk tim',
 'telkomsel paket telkomsel mahal bnget kartu telkomsel murah mw pakai intrnet kartu pakai',
 'ijin',
 'indosatcare tolong sms diblok pulsa kepotong',
 'indosatcare sya cek kuota ok dpt sms saldo kuota super internet cek status paket',
 'operasi militer menteri pertahanan ryamizard ryacudu pertimbangan matang',
 'telkomsel grapari pondok gede buka jam sim card hang',
 'emenhan kebutuhan bandan intelijen pertahanan metro tv news',
 'telkomsel info tunggu thx']

## Feature Extraction

In [36]:
# build TFIDF features on train reviews
tv = TfidfVectorizer( ngram_range=(1,1),
                     analyzer='word',
                     lowercase=True,
                     sublinear_tf=True,
                     use_idf=True
                     )
tv_train_features = tv.fit_transform(train_data_clean)

In [16]:
len(tv.get_feature_names())

4045

In [17]:
tv_train_features.shape

(2092, 4045)

In [18]:
tv_test_features = tv.transform(test_data_clean)

In [19]:
tv_test_features.shape

(524, 4045)

## Model Building

In [20]:
from sklearn.linear_model import SGDClassifier, LogisticRegression

model_lr = LogisticRegression(penalty='l2', max_iter=1000, C=1)

In [21]:
# build model    
model_lr.fit(tv_train_features, train_label)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [25]:
predictions_lr = model_lr.predict(tv_test_features) 

In [27]:
from sklearn import metrics

neg_cnt = 0
pos_cnt = 0
neg_cnt_x = 0
pos_cnt_x = 0

for i in range(0, len(test_label)):
  if test_label[i] == 'Positif':
    pos_cnt = pos_cnt + 1
    if test_label[i] == predictions_lr[i]:
      pos_cnt_x = pos_cnt_x + 1
  else:
    neg_cnt = neg_cnt + 1
    if test_label[i] == predictions_lr[i]:
      neg_cnt_x = neg_cnt_x + 1

print(neg_cnt)
print(pos_cnt)

print('[Positif]: %s/%s '  % (pos_cnt,pos_cnt_x))
print('[Negatif]: %s/%s '  % (neg_cnt,neg_cnt_x))

print("Accuracy(in %):", metrics.accuracy_score(test_label, predictions_lr)*100)

264
260
[Positif]: 260/242 
[Negatif]: 264/216 
Accuracy(in %): 87.40458015267176


In [28]:
print('Accuracy \t: ', np.round( metrics.accuracy_score(test_label, predictions), 4))
print('Precision \t: ', np.round(metrics.precision_score(test_label, 
                                                     predictions,
                                                     average='weighted'), 4))
print('Recall  \t: ', np.round( metrics.recall_score(test_label,
                                                    predictions,
                                                    average='weighted'), 4))
print('F1 Score  \t: ', np.round( metrics.f1_score(test_label, 
                                                  predictions,
                                                  average='weighted'), 4))

Accuracy 	:  0.874
Precision 	:  0.8791
Recall  	:  0.874
F1 Score  	:  0.8737


In [None]:
#from sklearn.naive_bayes import GaussianNB
#gnb = GaussianNB()
#gnb.fit(tv_train_features, train_label)

In [None]:
#model_nbg = gnb.predict(tv_test_features) 

In [30]:
from sklearn.svm import SVC
model_svm = SVC(kernel = 'linear', random_state = 0)
model_svm.fit(tv_train_features, train_label)
predict_svm = model_svm.predict(tv_test_features)
print("Accuracy(in %):", metrics.accuracy_score(test_label, predict_svm)*100)

Accuracy(in %): 91.41221374045801


In [31]:
from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier(n_estimators = 500, criterion = 'entropy', random_state = 0)
model_rf.fit(tv_train_features, train_label)
predict_rf = model_rf.predict(tv_test_features)
print("Accuracy(in %):", metrics.accuracy_score(test_label, predict_rf)*100)

Accuracy(in %): 89.12213740458014


## Prediction Wrapper

In [47]:
def Predict_Sentiment(text, model):
  data_txt =[]
  data_txt.append(text_preprocessing(text))
  feature_p = tv.transform(data_txt) 
  predict_p = model.predict(feature_p)

  return predict_p[0]

In [57]:
Predict_Sentiment('Telkomsel sinyal jelek', model_svm)

'Negatif'