In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import Sastrawi
import nltk

import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from nltk.tag import CRFTagger
from collections import Counter
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_colwidth', 300)

In [2]:
train_data = pd.read_csv('dataset/train_set.csv', delimiter=',', encoding='Latin')
tester_data = pd.read_csv('dataset/test_set.csv', delimiter=',', encoding='Latin')

In [3]:
emoticons = pd.read_csv('emoticon.txt', delimiter='\t', names=['emoticon', 'emoticon_score'], encoding='Latin')

In [4]:
emoticon_texts = emoticons['emoticon'].tolist()
emoticon_scores = emoticons['emoticon_score'].tolist()

In [5]:
emoticonset = set()
for i in range(len(emoticon_texts)):
    emoticonset.add((emoticon_texts[i], emoticon_scores[i]))

In [6]:
def extract_emoticon(tweet):
    score = 0
    for emoticon_text, emoticon_score in emoticonset:
        if emoticon_text in tweet:
            score += emoticon_score
    return score

In [7]:
train_data['emoticon_score'] = train_data['tweet'].apply(extract_emoticon)
tester_data['emoticon_score'] = tester_data['tweet'].apply(extract_emoticon)

In [8]:
kbba_ = pd.read_csv('kbba.txt', delimiter='\t', names=['from', 'to'])
kbba_from = kbba_['from'].tolist()
kbba_to = kbba_['to'].tolist()

kbba_repo = list()
for i in range(len(kbba_from)):
    kbba_repo.append((kbba_from[i], kbba_to[i]))
    
abbr_ = pd.read_csv('singkatan-lib.csv', delimiter=',', names=['from', 'to'])
abbr_from = abbr_['from'].tolist()
abbr_to = abbr_['to'].tolist()

abbr_repo = list()
for i in range(len(abbr_from)):
    abbr_repo.append((abbr_from[i], abbr_to[i]))

noises_ = pd.read_csv('noise.txt', names=['noise'])
noises_repo = noises_['noise'].tolist()

def normalisasi(tweet):
    normal_tw = tweet.lower() #lowercase
    normal_tw_words = nltk.word_tokenize(normal_tw)
    
    normal_tw_words_normalized = []
    for word in normal_tw_words:
        match = False
        for kbba_f, kbba_t in kbba_repo:
            if word == kbba_f:
                normal_tw_words_normalized.append(kbba_t)
                match = True
                break
        if not match:
            normal_tw_words_normalized.append(word)

        
    normal_tw = " ".join(normal_tw_words_normalized)

    normal_tw_words = nltk.word_tokenize(normal_tw)
    normal_tw_words_normalized = []
    for word in normal_tw_words:
        match = False
        for abbr_f, abbr_t in abbr_repo:
            if word == abbr_f:
                normal_tw_words_normalized.append(abbr_t)
                match = True
                break
        if not match:
            normal_tw_words_normalized.append(word)
    normal_tw = " ".join(normal_tw_words_normalized)

#     normal_tw_words_normalized = []
#     for word in normal_tw_words:
#         match = False
#         for noise in noises_repo:
#             if word == noise:
#                 match = True
#                 break
#         if not match:
#             normal_tw_words_normalized.append(word)
#     normal_tw = " ".join(normal_tw_words_normalized)

    normal_tw = re.sub('(\.){1,}', ' ', normal_tw)
    normal_tw = re.sub('\s+', ' ', normal_tw) # remove extra space
    normal_tw = normal_tw.strip() #trim depan belakang
    normal_tw = re.sub(r'(wk){2,}|(ck){2,}|(ha){2,}|(he){2,}', ' emotxtawa ', normal_tw)
    normal_tw = re.sub(r'(hiks)|(hu){2,}', ' emotxtangis ', normal_tw)
    normal_tw   =   re.sub(r'[^\w\s\.]',' ',normal_tw)   #buang punctuation
    normal_tw = re.sub(r'([A-Za-z])\1{1,}\s', r'\1', normal_tw)
    normal_tw = re.sub(r'([A-Za-z])\1{1,}$', r'\1', normal_tw)
    normal_tw = normal_tw.strip()
    return normal_tw

In [9]:
def extract_emoticon_2(tweet):
    score = 0
    words = nltk.word_tokenize(tweet)
    for word in words:
        if word == 'emotxtawa':
            score += 1
        if word == 'emotxtangis':
            score -= 1
    return score

In [10]:
train_data['tweet'] = train_data['tweet'].apply(normalisasi)
tester_data['tweet'] = tester_data['tweet'].apply(normalisasi)

In [11]:
train_data['emoticon_score'] = train_data['emoticon_score'] + train_data['tweet'].apply(extract_emoticon_2)
tester_data['emoticon_score'] = tester_data['emoticon_score'] + tester_data['tweet'].apply(extract_emoticon_2)

In [12]:
train_data.to_csv('train_data_normalized.csv', index=False, header=False)
tester_data.to_csv('tester_data_normalized.csv', index=False, header=False)

In [13]:
train_data_formalized = pd.read_csv('dataset/train_data_formalized.csv', encoding='Latin')
tester_data_formalized = pd.read_csv('dataset/tester_data_formalized.csv', encoding='Latin')

In [14]:
stopwords = pd.read_csv('stopwords.txt', header=None)[0].values
def remove_stopwords(tweet, stopwords):
    special_list = ['username', 'url', 'sensitive-no']
    token = nltk.word_tokenize(tweet)
    token_afterremoval = []
    for k in token:
        if k not in stopwords and k not in special_list:
            token_afterremoval.append(k)
    str_clean = ' '.join(token_afterremoval)
    return str_clean

In [15]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory


def stemming(tweet):
    token = nltk.word_tokenize(tweet)
    stem_kalimat = []
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    for k in token:
        stem_kata = stemmer.stem(k)
        stem_kalimat.append(stem_kata)
    stem_kalimat_str = ' '.join(stem_kalimat)
    return stem_kalimat_str

In [16]:
def pre_processing(tweets):
    temp_tweets = tweets.copy()
    temp_tweets['tweet'] = temp_tweets['tweet'].apply(lambda tweet: remove_stopwords(tweet, stopwords))
    temp_tweets['tweet'] = temp_tweets['tweet'].apply(stemming)
    return temp_tweets

In [17]:
train_data_preprocess = pre_processing(train_data_formalized)
tester_data_preprocess = pre_processing(tester_data_formalized)

train_data_preprocess.to_csv('train_data_preprocessed.csv', index=False)
tester_data_preprocess.to_csv('tester_data_preprocessed.csv', index=False)

# train_data_preprocess = pd.read_csv('train_data_preprocessed.csv', delimiter=',', encoding='Latin-1', names=['id', 'sentimen', 'tweet', 'capital_count', 'exclamation_count', 'word_count', 'char_count', 'word_capital_count'])

In [18]:
positives = set(pd.read_csv('positif_vania.txt', names=['word'])['word'].tolist())
negatives = set(pd.read_csv('negatif_vania.txt', names=['word'])['word'].tolist())

In [19]:
def extract_negative_lexicon(tweet):
    score = 0
    words = nltk.word_tokenize(tweet)
    for negative in negatives:
        if negative in words:
            score += 1
    for positive in positives:
        if ('tidak ' + positive) in tweet:
            score += 1
        if ('tdk ' + positive) in tweet:
            score += 1
        if ('ga ' + positive) in tweet:
            score += 1
        if ('bukan ' + positive) in tweet:
            score += 1
    return score

In [20]:
def extract_positive_lexicon(tweet):
    score = 0
    words = nltk.word_tokenize(tweet)
    for positive in positives:
        if positive in words:
            score += 1
    for negative in negatives:
        if ('tidak ' + negative) in tweet:
            score += 1
        if ('tdk ' + negative) in tweet:
            score += 1
        if ('ga ' + negative) in tweet:
            score += 1
        if ('bukan ' + negative) in tweet:
            score += 1
    return score

In [21]:
def extract_feature(data):
    temp_data = data.copy()
    temp_data['lexicon_pos_score'] = temp_data['tweet'].apply(extract_positive_lexicon)
    temp_data['lexicon_neg_score'] = temp_data['tweet'].apply(extract_negative_lexicon)
    temp_dat_2 = data.copy()
    temp_dat_2['lexicon_score'] = temp_data['lexicon_pos_score'] - temp_data['lexicon_neg_score']
    return temp_dat_2

In [22]:
train_data_extracted = extract_feature(train_data_preprocess)
tester_data_extracted = extract_feature(tester_data_preprocess)

In [23]:
# zero_cond = (train_data_extracted['sentimen'] == 0) & (train_data_extracted['lexicon_score'] == 0) & (train_data_extracted['emoticon_score'] == 0)
# train_data_extracted['lexicon_score'].iloc[zero_cond.values] = -2 

# zero_cond = (tester_data_extracted['lexicon_score'] == 0) & (tester_data_extracted['emoticon_score'] == 0)
# tester_data_extracted['lexicon_score'].iloc[zero_cond.values] = -2 

In [24]:
features = ['emoticon_score', 'lexicon_score']
target = 'sentimen'

X, y = train_data_extracted[features].values, train_data_extracted[target].values
Xx = tester_data_extracted[features].values

In [25]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

classifiers = [
    ('Decission Tree', DecisionTreeClassifier()),
    ('Logistic Regression', LogisticRegression()),
    ('SVM', LinearSVC()),
    ('Multinomial Naive Bayes', MultinomialNB()),
    ('KNN', KNeighborsClassifier()),
    ('Ensemble', GradientBoostingClassifier())
]

from sklearn.model_selection import KFold

train_scores = 0
test_scores = 0

kfold = KFold(n_splits=10, random_state=46)
dt = RandomForestClassifier()

for train_index, test_index in kfold.split(X):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    dt.fit(X_train, y_train)
    train_scores += accuracy_score(dt.predict(X_train), y_train)
    test_scores += accuracy_score(dt.predict(X_test), y_test)
    
print(train_scores / 10)
print(test_scores / 10)

0.8157454406089217
0.8136812646799154


In [26]:
predicted = dt.predict(X)
train_data_extracted['predicted'] = predicted

In [27]:
train_data_extracted[train_data_extracted['predicted'] != train_data_extracted['sentimen']].head(30).tail(10)

Unnamed: 0,id,sentimen,tweet,emoticon_score,lexicon_score,predicted
150,151,1,sukadeh teman ngetweet konten wawas dunia nyata share ilmu nyinyir kritik manfaat nyambung ngomongnya tidak bahas lamtur lipen,0,0,0
158,159,1,percaya tuhan rencana orang buat salah tidak ulang,0,0,0
159,160,1,go seung jae song hyeri jadi nungguin urus orang ending hubung emotxtangis seung jae muncul bentar hasil rebut atensi mas choi myeong ho ganteng,-1,1,0
165,166,0,helloowpak asi games tinggal hitung minggu jakarta kelar benah tidak malu palembang 95 kerja kelar anies urusin trotoar selesai gubernur,0,3,1
171,172,1,buset indosat sangat deh nge berentiin pakai bukan id indosat super wifi,0,-1,0
175,176,0,tuju bang bikin hidup tidak tenang iya tidak mikirin cari order enak gedeg bikin opik begitu,0,2,1
184,185,1,cinta si hitam idung lebar,0,0,0
186,187,1,ih canda bin dipaketin langsung sana,0,0,0
192,193,0,sehat hidupin data seluler muncul notif kanan data seluler otomatis mati ulang padal kuota msih 1 gb tidak pakai respon respon iya,0,1,1
194,195,0,malas iya dinas malam orang tidur kerja sh libur keluh rumah seni triplek,0,1,1


In [28]:
tester_predicted = dt.predict(Xx)

In [29]:
tester_predicted

array([1, 0, 0, ..., 0, 1, 0])

In [30]:
tester_data_extracted['predicted'] = tester_predicted

In [31]:
tester_data_extracted

Unnamed: 0,tweetID,tweet,emoticon_score,lexicon_score,predicted
0,0,wanita suka hancur hubung orang bangga hasil rusak kebahagian orang silah tidak berkah bahagia he,0,1,1
1,1,sombong apa sms dibls,0,-1,0
2,2,apadah p cie cie cie bebe cie kiwkiw,0,0,0
3,3,tdrlah besok medical check up moga lancar wml,0,0,0
4,4,crew serbu bsm seru bang syariah mandiri bekas pic,0,1,1
5,5,sian ditelantarin indah nge lho,1,1,1
6,6,diri sembahyang tunai zakat tatlah rasul rahmat 24 56,0,1,1
7,7,pikir bandar bayarin makan evil dead pokok star trek keren,0,2,1
8,8,tidak kreatif ambil kutip orang tertawa suka iya ungkapin tikung,0,2,1
9,9,iya bahas twiter kali ven teman tertawa iya bukti sayang manta han,0,2,1


In [32]:
tester_data_extracted.to_csv('results3.csv', header=False, index=False, columns=['tweetID', 'predicted'])