# Hands-on Sentiment Analysis with TF-IDF

## Library Preparation

In [None]:
import re
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.classify import SklearnClassifier

from wordcloud import WordCloud,STOPWORDS
import matplotlib.pyplot as plt

from subprocess import check_output

# Fetch the NLTK resources used below: the stopword corpus (source of the
# Indonesian stopword list) and the Punkt tokenizer models used by the NLTK
# sentence/word tokenizers.
nltk.download('stopwords')
nltk.download('punkt')

In [None]:
# Install Sastrawi, the package that provides the StemmerFactory imported below.
!pip install sastrawi

In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [None]:
# Extend NLTK's Indonesian stopword list with informal and abbreviated words
# that are common in tweets; stopwords_all is the union of both sets.
stopwords_set = set(stopwords.words("indonesian"))
stopwords_aug = {"ya","yak","iya","yg","ga","gak","gk","udh","sdh","udah","dah","nih","ini","deh","sih","dong","donk",
                 "sm","knp","utk","yaa","tdk","gini","gitu","bgt","gt","nya","kalo","cb","jg","jgn","gw","ge",
                 "sy","min","mas","mba","mbak","pak","kak","trus","trs","bs","bisa","aja","saja","no",
                 "w","g","gua","gue","emang","emg","wkwk","dr","kau","dg","gimana","apapun","apa",
                 "klo","yah","banget","pake","terus","krn","jadi","jd","mu","ku","si","hehe",
                 "tp","pa","lu","lo","lw","tw","tau","karna","kayak","ky","lg","untuk","tuk","dg","dgn"}
stopwords_all = stopwords_set.union(stopwords_aug)

In [None]:
def clean_text(text):
    """Keep only ASCII letters and whitespace from *text*, lower-cased.

    Args:
        text: An iterable of strings. Normally a single string, in which
            case it is filtered character by character.

    Returns:
        A string made of the lower-cased items of ``text`` that contain at
        least one ASCII letter or whitespace character, in original order.
    """
    # Raw string: "\s" inside a plain literal is an invalid escape sequence,
    # which Python reports as a warning and will eventually reject.
    keep = re.compile(r'[a-zA-Z\s]')
    # Join once instead of growing the result with repeated concatenation.
    return "".join(item.lower() for item in text if keep.search(item))

def tokenize_clean(text):
    """Split *text* into lower-cased tokens that contain at least one letter.

    The text is first split into sentences and then into words with NLTK;
    tokens without any ASCII letter (numbers, punctuation, ...) are dropped.

    Args:
        text: The raw text to tokenize.

    Returns:
        A list of lower-cased tokens, in order of appearance.
    """
    has_letter = re.compile('[a-zA-Z]')

    # Tokenize sentence by sentence, lower-casing each word as it is produced.
    words = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            words.append(word.lower())

    # Discard tokens made only of digits, punctuation or other symbols.
    return [word for word in words if has_letter.search(word)]

def remove_stopwords(tokenized_text):
    """Return the tokens of *tokenized_text* that are not stopwords.

    Membership is checked against the module-level ``stopwords_all`` set and
    the order of the remaining tokens is preserved.

    Args:
        tokenized_text: An iterable of tokens.

    Returns:
        A new list containing only the non-stopword tokens.
    """
    return [word for word in tokenized_text if word not in stopwords_all]

def _get_stemmer():
    """Return a shared Sastrawi stemmer, creating it on first use."""
    stemmer = getattr(_get_stemmer, "_instance", None)
    if stemmer is None:
        stemmer = StemmerFactory().create_stemmer()
        _get_stemmer._instance = stemmer
    return stemmer


def stemming_text(tokenized_text):
    """Stem every token of *tokenized_text* with the Sastrawi stemmer.

    Args:
        tokenized_text: An iterable of tokens.

    Returns:
        A list with the stemmed form of each token, in the same order.
    """
    # Reuse one stemmer for all calls: building a factory and a stemmer for
    # every single text is repeated work that does not depend on the input.
    stemmer = _get_stemmer()
    return [stemmer.stem(token) for token in tokenized_text]

def text_preprocessing(text):
    """Turn a raw text into a cleaned, space-separated string of words.

    Every character that is not an ASCII letter (digits and punctuation
    included) is replaced by a space; the result is lower-cased and split on
    whitespace, stopwords are dropped with ``remove_stopwords`` and the
    remaining words are joined with single spaces.

    Stemming (``stemming_text``) is intentionally not applied here because
    it is very slow.

    Args:
        text: The raw text.

    Returns:
        The cleaned text as a single string.
    """
    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(remove_stopwords(words))

## Download Dataset

In [None]:
# Create the local dataset folder (if missing) and download Twitter.csv into it.
!mkdir -p dataset
!wget https://raw.githubusercontent.com/project303/dataset/master/Twitter.csv -P dataset

In [None]:
# Load the downloaded file; sep='|' because the fields are pipe-delimited.
dataset = pd.read_csv('dataset/Twitter.csv', sep='|')

# Number of rows loaded.
len(dataset)

In [None]:
# Preview the first rows of the dataset.
dataset.head()

## Data Preprocessing

In [None]:
# Drop the rows labelled "Netral" so only the remaining sentiment classes
# are used for training and evaluation.
dataset = dataset[dataset['sentiment'] != "Netral"]

# Features (tweet text) and targets (sentiment label) as NumPy arrays.
sentiments = np.array(dataset['sentiment'])
tweets = np.array(dataset['text'])

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible.
train_data, test_data, train_label, test_label = train_test_split(
    tweets,
    sentiments,
    test_size=0.2,
    random_state=4,
)

In [None]:
# Peek at the first ten training labels.
train_label[:10]

In [None]:
# Number of training samples.
len(train_data)

In [None]:
# Apply the same text_preprocessing step to every tweet of both splits.
train_data_clean = [text_preprocessing(raw_tweet) for raw_tweet in train_data]
test_data_clean = [text_preprocessing(raw_tweet) for raw_tweet in test_data]

In [None]:
# Peek at the first ten cleaned training tweets.
train_data_clean[:10]

## Feature Extraction

In [None]:
# TF-IDF vectorizer over the cleaned tweets.
tfidf = TfidfVectorizer(max_features=200000,    # upper bound on the vocabulary size
                        use_idf=True,           # apply inverse-document-frequency weighting
                        ngram_range=(1,1))      # unigrams only


# Learn the vocabulary and IDF weights from the training tweets and
# transform them into the training feature matrix.
tfidf_train = tfidf.fit_transform(train_data_clean)

In [None]:
# Number of terms in the fitted vocabulary.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
len(tfidf.get_feature_names_out())

In [None]:
# Terms of the fitted vocabulary, in feature-index order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (it returns a NumPy array).
tfidf.get_feature_names_out()

In [None]:
# Shape of the training feature matrix: (number of tweets, number of terms).
tfidf_train.shape

In [None]:
# Show the TF-IDF entries of the first training tweet.
print(tfidf_train[0])

In [None]:
# Vectorize the test tweets with the vocabulary and IDF weights already
# fitted on the training data (transform only, no re-fitting).
tfidf_test = tfidf.transform(test_data_clean)

In [None]:
# Shape of the test feature matrix: (number of tweets, number of terms).
tfidf_test.shape

## Model Building

In [None]:
from sklearn.linear_model import SGDClassifier, LogisticRegression

# Logistic-regression classifier; max_iter is raised to 1000 (scikit-learn's
# default is 100) to give the solver more iterations to converge.
model_lr = LogisticRegression(max_iter=1000)

In [None]:
# Train the logistic-regression model on the TF-IDF training features.
model_lr.fit(tfidf_train, train_label)

In [None]:
# Predict the sentiment label of every test tweet.
predictions_lr = model_lr.predict(tfidf_test) 

In [None]:
from sklearn import metrics

# Per-class tally of the logistic-regression predictions on the test set:
# *_cnt is the number of test samples in the class, *_cnt_x the number of
# those that were predicted correctly. Any label other than 'Positif' is
# counted as negative.
neg_cnt = 0
pos_cnt = 0
neg_cnt_x = 0
pos_cnt_x = 0

# Walk labels and predictions in lockstep instead of indexing by position.
for actual, predicted in zip(test_label, predictions_lr):
    is_hit = actual == predicted
    if actual == 'Positif':
        pos_cnt += 1
        if is_hit:
            pos_cnt_x += 1
    else:
        neg_cnt += 1
        if is_hit:
            neg_cnt_x += 1

# Class sizes in the test set (negative first, then positive).
print(neg_cnt)
print(pos_cnt)

# Reported as correct/total (the counts were previously printed in the
# opposite order, i.e. total/correct).
print('[Positif]: %s/%s '  % (pos_cnt_x, pos_cnt))
print('[Negatif]: %s/%s '  % (neg_cnt_x, neg_cnt))

print("Accuracy(in %):", metrics.accuracy_score(test_label, predictions_lr)*100)

In [None]:
# Report accuracy plus precision, recall and F1 for the logistic-regression
# predictions, each rounded to four decimals. Precision/recall/F1 use
# average='weighted', i.e. per-class scores averaged by class support.
for caption, scorer, options in (
    ('Accuracy \t: ', metrics.accuracy_score, {}),
    ('Precision \t: ', metrics.precision_score, {'average': 'weighted'}),
    ('Recall  \t: ', metrics.recall_score, {'average': 'weighted'}),
    ('F1 Score  \t: ', metrics.f1_score, {'average': 'weighted'}),
):
    print(caption, np.round(scorer(test_label, predictions_lr, **options), 4))

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes on the same TF-IDF features; fit() returns the
# fitted estimator, so construction and training are chained.
model_nb = MultinomialNB().fit(tfidf_train, train_label)
predict_nb = model_nb.predict(tfidf_test)

# Test-set accuracy as a percentage.
print("Accuracy(in %):", metrics.accuracy_score(test_label, predict_nb)*100)

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with a linear kernel on the same TF-IDF
# features; fit() returns the fitted estimator, so the calls are chained.
model_svm = SVC(kernel='linear', random_state=0).fit(tfidf_train, train_label)
predict_svm = model_svm.predict(tfidf_test)

# Test-set accuracy as a percentage.
print("Accuracy(in %):", metrics.accuracy_score(test_label, predict_svm)*100)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 500 trees split on entropy, trained on the same TF-IDF
# features; fit() returns the fitted estimator, so the calls are chained.
model_rf = RandomForestClassifier(
    n_estimators=500,
    criterion='entropy',
    random_state=1,
).fit(tfidf_train, train_label)
predict_rf = model_rf.predict(tfidf_test)

# Test-set accuracy as a percentage.
print("Accuracy(in %):", metrics.accuracy_score(test_label, predict_rf)*100)

## Prediction Wrapper

In [None]:
def Predict_Sentiment(text, model):
    """Predict the sentiment label of a single raw text.

    The text goes through ``text_preprocessing`` and is vectorized with the
    module-level fitted ``tfidf`` vectorizer before being passed to *model*.

    Args:
        text: The raw text to classify.
        model: A fitted classifier exposing ``predict``.

    Returns:
        The first (and only) element of the model's prediction.
    """
    # The vectorizer expects a collection of documents, hence the one-item list.
    features = tfidf.transform([text_preprocessing(text)])
    return model.predict(features)[0]

In [None]:
# Example: classify one sample sentence with the SVM model.
Predict_Sentiment('Telkomsel sinyal jelek', model_svm)