# SENTIMENT ANALYSIS



In [None]:
#Release: 1.2102.0601

# Library

For this lab, we will need the ``wordcloud`` library.
Use pip to install it from the Anaconda prompt: ``pip install wordcloud``.

In [None]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split 

import nltk
from nltk.corpus import stopwords
from nltk.classify import SklearnClassifier

from wordcloud import WordCloud,STOPWORDS
import matplotlib.pyplot as plt

from subprocess import check_output

# Fetch the NLTK data packages requested by this notebook.
# 'stopwords' is read later through stopwords.words(...).
# NOTE(review): no tokenizer call is visible in this notebook, so 'punkt' may be unnecessary — confirm.
nltk.download('stopwords')
nltk.download('punkt')

# Read Data

In [None]:
!mkdir -p dataset
!wget https://raw.githubusercontent.com/project303/dataset/master/Twitter.csv -P dataset

In [None]:
# List the dataset folder to confirm the download produced a file.
!ls dataset

Load the dataset and get the number of rows

In [None]:
# Read the pipe-delimited tweet file and keep only the two columns used below.
data = pd.read_csv('dataset/Twitter.csv', sep='|').loc[:, ['text', 'sentiment']]
# Number of rows in the dataset (shown as the cell output).
data.shape[0]

Sample of dataset

In [None]:
# Preview the first five rows.
data.head(5)

In [None]:
# Count rows per sentiment label to see the class balance.
data.groupby("sentiment").count()

In [None]:
# Remove the column-width limit so tweet text is not truncated in displayed frames.
# This is a global pandas display option and stays in effect for the following cells.
pd.set_option('display.max_colwidth', None)
data.head(5)

In [None]:
# Show the first ten tweets labelled 'Negatif'.
data.loc[data['sentiment'] == 'Negatif'].head(10)

Split Data into Training and Test Data

In [None]:
# Split the dataset into training (80%) and test (20%) partitions.
# random_state pins the shuffle so that Restart & Run All reproduces the same split,
# and therefore the same vocabulary and accuracy figures.
train, test = train_test_split(data, test_size=0.2, random_state=42)

# Drop neutral tweets: the classifier is trained on 'Positif' / 'Negatif' only.
train = train[train.sentiment != "Netral"]

# Separate the positive and negative training tweets, keeping only the text column.
train_pos = train.loc[train['sentiment'] == 'Positif', 'text']
train_neg = train.loc[train['sentiment'] == 'Negatif', 'text']

In [None]:
# Class balance of the training partition after neutral tweets were removed.
train.groupby("sentiment").count()

In [None]:
# Inspect the positive training tweets (text only).
train_pos

In [None]:
# Inspect the negative training tweets (text only).
train_neg

Draw WordCloud

In [None]:
def wordcloud_draw(data, color = 'black', stopword_set = None):
    """Draw a word cloud for a collection of tweet texts.

    Parameters
    ----------
    data : iterable of str
        Texts (or single words) to visualise; they are joined with spaces.
    color : str, optional
        Background colour of the rendered cloud (default 'black').
    stopword_set : set of str, optional
        Words to leave out of the cloud. When omitted, the notebook-level
        ``stopwords_all`` is used if it has already been defined; otherwise
        the built-in ``STOPWORDS`` list from wordcloud is used. This keeps the
        function callable before the stopword cell has run, instead of
        failing with a NameError.
    """
    if stopword_set is None:
        # stopwords_all is created in a later cell; look it up lazily and
        # fall back to the library default on a fresh kernel.
        stopword_set = globals().get('stopwords_all', STOPWORDS)

    words = ' '.join(data)
    # Drop links, @mentions, #hashtags and the retweet marker before drawing.
    cleaned_word = " ".join([word for word in words.split()
                            if 'http' not in word
                                and not word.startswith('@')
                                and not word.startswith('#')
                                and word != 'RT'
                            ])
    wordcloud = WordCloud(stopwords=stopword_set,
                      background_color=color,
                      width=2500,
                      height=2000
                     ).generate(cleaned_word)
    plt.figure(1,figsize=(13, 13))
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()

# Word cloud of the positive training tweets on a white background.
print("Positive words")
wordcloud_draw(train_pos,'white')
# Word cloud of the negative training tweets on the default black background.
print("Negative words")
wordcloud_draw(train_neg)

Add Stopwords

In [None]:
# Build the stopword list used for cleaning and for the word clouds:
# NLTK's Indonesian stopwords plus informal / abbreviated words common on Twitter.
stopwords_set = set(stopwords.words("indonesian"))
# Hand-picked additions. "dg" is listed twice, which is harmless in a set literal.
stopwords_aug = {"ya","yak","iya","yg","ga","gak","gk","udh","sdh","udah","dah","nih","ini","deh","sih","dong","donk",
                 "sm","knp","utk","yaa","tdk","gini","gitu","bgt","gt","nya","kalo","cb","jg","jgn","gw","ge",
                 "sy","min","mas","mba","mbak","pak","kak","trus","trs","bs","bisa","aja","saja","no",
                 "w","g","gua","gue","emang","emg","wkwk","dr","kau","dg","gimana","apapun","apa",
                 "klo","yah","banget","pake","terus","krn","jadi","jd","mu","ku","si","hehe",
                 "tp","pa","lu","lo","lw","tw","tau","karna","kayak","ky","lg","untuk","tuk","dg","dgn"}
# Union of both sets; this is the name the rest of the notebook reads.
stopwords_all = stopwords_set.union(stopwords_aug)

In [None]:
#stopwords_set

Data Preparation for Training Dataset

In [None]:
# tweets holds the cleaned training data as (tokens, sentiment) pairs.
tweets = []

for index, row in train.iterrows():
    # Lowercase every token and drop tokens shorter than 3 characters.
    # The length filter also removes the retweet marker "RT", so no separate check is needed.
    words_filtered = [e.lower() for e in row.text.split() if len(e) >= 3]
    # Drop links, @mentions and #hashtags.
    words_cleaned = [word for word in words_filtered
        if 'http' not in word
        and not word.startswith('@')
        and not word.startswith('#')]
    # Remove stopwords. The filtered list was previously built but never stored,
    # so stopwords ended up in the feature vocabulary.
    words_without_stopwords = [word for word in words_cleaned if word not in stopwords_all]
    tweets.append((words_without_stopwords, row.sentiment))

In [None]:
# Peek at the first five (tokens, sentiment) pairs.
tweets[:5]

Feature Extraction

In [None]:
# Extracting word features
def get_words_in_tweets(tweets):
    """Flatten (words, sentiment) pairs into one list of words.

    The sentiment labels are ignored; words keep their original order and
    duplicates are preserved.
    """
    return [word for words, _sentiment in tweets for word in words]

def get_word_features(wordlist):
    """Return the distinct words of ``wordlist`` as the keys view of an nltk.FreqDist."""
    return nltk.FreqDist(wordlist).keys()
# Vocabulary of the training tweets; extract_features reads this global.
w_features = get_word_features(get_words_in_tweets(tweets))


def extract_features(document):
    """Map a token list to a boolean feature dict over the training vocabulary.

    For every word in the global ``w_features`` the result holds a key
    ``'containts(<word>)'`` whose value says whether the word occurs in
    ``document``. The key spelling is kept as-is so feature names stay stable.
    """
    present = set(document)
    return {'containts(%s)' % term: (term in present) for term in w_features}


# Word cloud over the feature vocabulary. Each word appears once in w_features,
# so this shows which words are in the vocabulary rather than how often they occur.
wordcloud_draw(w_features)

Sentiment Classification Using Naive Bayes

In [None]:
# Training the Naive Bayes classifier
# apply_features maps extract_features over the (tokens, sentiment) pairs in tweets.
training_set = nltk.classify.apply_features(extract_features,tweets)
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [None]:
# NOTE: this is measured on training_set, the same data the classifier was fitted on,
# so it is training accuracy and not an estimate of performance on unseen tweets.
print("Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, training_set))*100)

Test Classifier Using Test Dataset

In [None]:
# Text of the positive and negative test tweets; neutral rows are left out of both.
test_pos = test.loc[test['sentiment'] == 'Positif', 'text']
test_neg = test.loc[test['sentiment'] == 'Negatif', 'text']

In [None]:
# Count how many held-out tweets of each class the classifier labels correctly.
neg_cnt = 0
pos_cnt = 0
for obj in test_neg:
    # Lowercase the tokens the same way the training vocabulary was built;
    # without this, any capitalised word never matches a feature.
    res = classifier.classify(extract_features([e.lower() for e in obj.split()]))
    if res == 'Negatif':
        neg_cnt += 1
for obj in test_pos:
    res = classifier.classify(extract_features([e.lower() for e in obj.split()]))
    if res == 'Positif':
        pos_cnt += 1

# Reported as correct/total (the arguments were previously in total/correct order).
print('[Negatif]: %s/%s '  % (neg_cnt, len(test_neg)))
print('[Positif]: %s/%s '  % (pos_cnt, len(test_pos)))

In [None]:
# First few positive test tweets.
test_pos.head()

In [None]:
# First few negative test tweets.
test_neg.head()

In [None]:
# tweets_test holds the cleaned test data as (tokens, sentiment) pairs.
tweets_test = []

# Neutral rows were removed from the training data, so the classifier can only answer
# 'Positif' or 'Negatif'. Leaving 'Netral' rows in the test set would count every one
# of them as an error, so they are skipped here as well.
for index, row in test[test.sentiment != "Netral"].iterrows():
    # Same token cleaning as for the training tweets: lowercase, drop tokens shorter
    # than 3 characters (this also removes "RT"), links, @mentions and #hashtags.
    words_filtered = [e.lower() for e in row.text.split() if len(e) >= 3]
    words_cleaned = [word for word in words_filtered
        if 'http' not in word
        and not word.startswith('@')
        and not word.startswith('#')]
    # No stopword pass is needed here: extract_features only looks up words that are
    # in the training vocabulary (w_features), so other tokens are ignored anyway.
    tweets_test.append((words_cleaned, row.sentiment))

test_set = nltk.classify.apply_features(extract_features, tweets_test)

In [None]:
# Accuracy of the trained classifier on test_set, as a percentage.
print("Naive Bayes Algo accuracy on test percent:", (nltk.classify.accuracy(classifier, test_set))*100)

Feature

In [None]:
# Show the feature vocabulary (the keys view returned by get_word_features).
w_features

In [None]:
# Frequency distribution of all words in the prepared training tweets.
nltk.FreqDist(get_words_in_tweets(tweets))

Test Classifier

In [None]:
# Classify a hand-written, already lowercased token list.
pred =  classifier.classify(extract_features(['oke', 'nasional', 'banget', 'paket', 'pulsa']))
pred

In [None]:
# Print the 20 features the classifier ranks as most informative.
classifier.show_most_informative_features(20)

Save Classifier

In [None]:
# To save the trained classifier, do the following.
import pickle

# The with-block closes the file even if pickle.dump raises.
with open('my_classifier.pickle', 'wb') as f:
    pickle.dump(classifier, f)

In [None]:
# To reload it:
# SECURITY: pickle.load can execute arbitrary code. Only load pickle files that you
# created yourself or that come from a trusted source.
# The with-block closes the file even if loading fails.
with open('my_classifier.pickle', 'rb') as f:
    classifier = pickle.load(f)

In [None]:
# Check that the reloaded classifier still predicts on a hand-written token list.
pred = classifier.classify(extract_features(['makasih', 'sinyal', 'banget']))
pred

In [None]:
# Show the raw feature dict for a token list: one boolean entry per vocabulary word.
extract_features(['kementerian', 'sinyal', 'banget'])