In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import Sastrawi
import nltk

import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from nltk.tag import CRFTagger
from collections import Counter
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_colwidth', 300)

In [2]:
train_data = pd.read_csv('dataset/train_set.csv', delimiter=',', encoding='Latin')
tester_data = pd.read_csv('dataset/test_set.csv', delimiter=',', encoding='Latin')

In [3]:
emoticons = pd.read_csv('emoticon.txt', delimiter='\t', names=['emoticon', 'emoticon_score'])

In [4]:
emoticon_texts = emoticons['emoticon'].tolist()
emoticon_scores = emoticons['emoticon_score'].tolist()

In [None]:
emoticonset = set()
for i in range(len(emoticon_texts)):
    emoticonset.add((emoticon_texts[i], emoticon_scores[i]))

In [None]:
def ortografi_exception(tweet):
    special_list = ['[USERNAME]', '[URL]', '[SENSITIVE-NO]']
    for sp in special_list:
        tweet = tweet.replace(sp, '')
    return tweet

def extract_ortografi_word_capital_count(tweet):
    words = nltk.word_tokenize(ortografi_exception(tweet))
    count = 0
    for word in words:
        if all([c.isupper() for c in word]):
            count = count + 1
    return count / len(words)

In [None]:
def extract_exclamation_count(tweet):
    tweet = re.sub(r'!{1,}', '!', tweet)
    return sum((1 for c in tweet if c == "!"))

In [None]:
train_data['ortografi'] = train_data['tweet'].apply(extract_ortografi_word_capital_count)
tester_data['ortografi'] = tester_data['tweet'].apply(extract_ortografi_word_capital_count)

train_data['exclamation'] = train_data['tweet'].apply(extract_exclamation_count)
tester_data['exclamation'] = tester_data['tweet'].apply(extract_exclamation_count)

In [None]:
import re 
def extract_emoticon(tweet):
    score = 0
    for emoticon_text, emoticon_score in emoticonset:
        occurence = 0
        if emoticon_text in tweet:
            score += emoticon_score
        for i in range(len(tweet) - len(emoticon_text) - 1):
            if len(emoticon_text) <= len(tweet) and tweet[i:(i+len(emoticon_text))] == emoticon_text:
                occurence += 1
        score += (occurence * emoticon_score)
    return score

In [None]:
extract_emoticon("cie andien yang lagi bep marah marah mulu :p :) :) :) :) :-) :(")

6

In [None]:
train_data['emoticon_score'] = train_data['tweet'].apply(extract_emoticon)
tester_data['emoticon_score'] = tester_data['tweet'].apply(extract_emoticon)

In [None]:
kbba_ = pd.read_csv('kbba.txt', delimiter='\t', names=['from', 'to'])
kbba_from = kbba_['from'].tolist()
kbba_to = kbba_['to'].tolist()

kbba_repo = list()
for i in range(len(kbba_from)):
    kbba_repo.append((kbba_from[i], kbba_to[i]))
    
abbr_ = pd.read_csv('singkatan-lib.csv', delimiter=',', names=['from', 'to'])
abbr_from = abbr_['from'].tolist()
abbr_to = abbr_['to'].tolist()

abbr_repo = list()
for i in range(len(abbr_from)):
    abbr_repo.append((abbr_from[i], abbr_to[i]))

noises_ = pd.read_csv('noise.txt', names=['noise'])
noises_repo = noises_['noise'].tolist()

def normalisasi(tweet):
    normal_tw = tweet.lower() #lowercase
    normal_tw_words = nltk.word_tokenize(normal_tw)
    
    normal_tw_words_normalized = [""]
    for word in normal_tw_words:
        match = False
        for kbba_f, kbba_t in kbba_repo:
            if word == kbba_f:
                normal_tw_words_normalized.append(kbba_t)
                match = True
                break
        if not match:
            normal_tw_words_normalized.append(word)
    
    normal_tw = " ".join(normal_tw_words_normalized)

    normal_tw_words = nltk.word_tokenize(normal_tw)
    normal_tw_words_normalized = [""]
    for word in normal_tw_words:
        match = False
        for abbr_f, abbr_t in abbr_repo:
            if word == abbr_f:
                normal_tw_words_normalized.append(abbr_t)
                match = True
                break
        if not match:
            normal_tw_words_normalized.append(word)
    normal_tw = " ".join(normal_tw_words_normalized)

#     normal_tw_words_normalized = []
#     for word in normal_tw_words:
#         match = False
#         for noise in noises_repo:
#             if word == noise:
#                 match = True
#                 break
#         if not match:
#             normal_tw_words_normalized.append(word)
#     normal_tw = " ".join(normal_tw_words_normalized)

    normal_tw = re.sub('(\.){1,}', ' ', normal_tw)
    normal_tw = re.sub('\s+', ' ', normal_tw) # remove extra space
    normal_tw = normal_tw.strip() #trim depan belakang
    normal_tw = re.sub(r'(wk){2,}|(wka){2,}|(ck){2,}|(ha){2,}|(he){2,}', ' emotxtawa ', normal_tw)
    normal_tw = re.sub(r'(hiks)|(kiw){2,}|(hu){2,}', ' emotxtangis ', normal_tw)
    normal_tw   =   re.sub(r'[^\w\s\.]',' ',normal_tw)   #buang punctuation
    normal_tw = re.sub(r'([A-Za-z])\1{1,}\s', r'\1', normal_tw)
    normal_tw = re.sub(r'([A-Za-z])\1{1,}$', r'\1', normal_tw)
    normal_tw = normal_tw.strip()
    return normal_tw

In [None]:
def extract_emoticon_2(tweet):
    score = 0
    words = nltk.word_tokenize(tweet)
    for word in words:
        if word == 'emotxtawa':
            score += 1
        if word == 'emotxtangis':
            score -= 1
    return score

In [None]:
train_data['tweet'] = train_data['tweet'].apply(normalisasi)
tester_data['tweet'] = tester_data['tweet'].apply(normalisasi)

In [None]:
train_data['emoticon_score'] = train_data['emoticon_score'] + train_data['tweet'].apply(extract_emoticon_2)
tester_data['emoticon_score'] = tester_data['emoticon_score'] + tester_data['tweet'].apply(extract_emoticon_2)

In [None]:
train_data.to_csv('train_data_normalized.csv', index=False, header=False)
tester_data.to_csv('tester_data_normalized.csv', index=False, header=False)

In [None]:
train_data_formalized = pd.read_csv('dataset/train_data_formalized.csv', encoding='Latin')
tester_data_formalized = pd.read_csv('dataset/tester_data_formalized.csv', encoding='Latin')

# train_data_formalized = train_data
# tester_data_formalized = tester_data

In [None]:
stopwords = pd.read_csv('stopwords.txt', header=None)[0].values
def remove_stopwords(tweet, stopwords):
    special_list = ['username', 'url', 'sensitive-no']
    token = nltk.word_tokenize(tweet)
    token_afterremoval = []
    for k in token:
        if k not in stopwords and k not in special_list:
            token_afterremoval.append(k)
    str_clean = ' '.join(token_afterremoval)
    return str_clean

In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory


def stemming(tweet):
    token = nltk.word_tokenize(tweet)
    stem_kalimat = []
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    for k in token:
        stem_kata = stemmer.stem(k)
        stem_kalimat.append(stem_kata)
    stem_kalimat_str = ' '.join(stem_kalimat)
    return stem_kalimat_str

In [None]:
def pre_processing(tweets):
    temp_tweets = tweets.copy()
    temp_tweets['tweet'] = temp_tweets['tweet'].apply(lambda tweet: remove_stopwords(tweet, stopwords))
#     temp_tweets['tweet'] = temp_tweets['tweet'].apply(stemming)
    return temp_tweets

In [None]:
train_data_preprocess = pre_processing(train_data_formalized)
tester_data_preprocess = pre_processing(tester_data_formalized)

# train_data_preprocess.to_csv('train_data_preprocessed.csv', index=False)
# tester_data_preprocess.to_csv('tester_data_preprocessed.csv', index=False)

# train_data_preprocess = pd.read_csv('train_data_preprocessed.csv', delimiter=',', encoding='Latin-1', names=['id', 'sentimen', 'tweet', 'capital_count', 'exclamation_count', 'word_count', 'char_count', 'word_capital_count'])

In [None]:
positives = set(pd.read_csv('positif_vania.txt', names=['word'])['word'].tolist())
negatives = set(pd.read_csv('negatif_vania.txt', names=['word'])['word'].tolist())

In [None]:
ct = CRFTagger()
ct.set_model_file("all_indo_man_tag_corpus_model.crf.tagger")

def extract_jj(tweet):
    words = nltk.word_tokenize(tweet)
    tag = ct.tag_sents([words])
    flat_tag = [item for sublist in tag for item in sublist]
    pos_count = Counter([j for i,j in flat_tag])
    return pos_count['JJ']

def extract_neg(tweet):
    words = nltk.word_tokenize(tweet)
    tag = ct.tag_sents([words])
    flat_tag = [item for sublist in tag for item in sublist]
    pos_count = Counter([j for i,j in flat_tag])
    return pos_count['NEG']

In [None]:
jj = set()

def extract_negative_lexicon(tweet):
    score = 0
    words = nltk.word_tokenize(tweet)
    tag = ct.tag_sents([words])
    flat_tag = [item for sublist in tag for item in sublist]
    words_tag = dict()
    for w, tg in flat_tag:
        if tg == 'JJ':
            jj.add(w)
            words_tag[w] = tg
    for i in range(len(words)):
        if i > 1 and words[i-2] == 'tidak' and words[i] in positives:
            score += 1
    for negative in negatives:
        for i in range(len(words)):
            if words[i] == negative:
                if (i > 0):
                    if (words[i-1] != 'tidak'):
                        score += 1
#                         if words[i] in words_tag.keys() and words_tag[words[i]] == 'JJ':
#                             score += 0.5
                        
                else:
                    score += 1
    for positive in positives:
        if ('tidak ' + positive) in tweet:
            score += 1
        if ('jangan ' + positive) in tweet:
            score += 1
    return score

In [None]:
def extract_positive_lexicon(tweet):
    score = 0
    words = nltk.word_tokenize(tweet)
    tag = ct.tag_sents([words])
    flat_tag = [item for sublist in tag for item in sublist]
    words_tag = dict()
    for w, tg in flat_tag:
        if tg == 'JJ':
            jj.add(w)
            words_tag[w] = tg
    for i in range(len(words)):
        if i > 1 and words[i-2] == 'tidak' and words[i] in negatives:
            score += 1
    for positive in positives:
        for i in range(len(words)):
            if words[i] == positive:
                if (i > 0):
                    if (words[i-1] != 'tidak'):
                        score += 1
#                         if words[i] in words_tag.keys() and words_tag[words[i]] == 'JJ':
#                             score += 0.5
                else:
                    score += 1
            
    for negative in negatives:
        if ('tidak ' + negative) in tweet:
            score += 1
        if ('jangan ' + negative) in tweet:
            score += 1
    return score

In [None]:
train_data_preprocess

In [None]:
extract_positive_lexicon('tidak kreatif')

In [None]:
def extract_feature(data):
    temp_data = data.copy()
    temp_data['lexicon_pos_score'] = temp_data['tweet'].apply(extract_positive_lexicon)
    temp_data['lexicon_neg_score'] = temp_data['tweet'].apply(extract_negative_lexicon)
    temp_dat_2 = data.copy()
    temp_dat_2['lexicon_score'] = (temp_data['lexicon_pos_score'] - temp_data['lexicon_neg_score'])
    temp_dat_2['jj'] = temp_dat_2['tweet'].apply(extract_jj)
    temp_dat_2['neg'] = temp_dat_2['tweet'].apply(extract_neg)
    return temp_dat_2

In [None]:
train_data_extracted = extract_feature(train_data_preprocess)
tester_data_extracted = extract_feature(tester_data_preprocess)

In [None]:
# zero_cond = (train_data_extracted['sentimen'] == 0) & (train_data_extracted['lexicon_score'] == 0) & (train_data_extracted['emoticon_score'] == 0)
# train_data_extracted['lexicon_score'].iloc[zero_cond.values] = -2 

# zero_cond = (tester_data_extracted['lexicon_score'] == 0) & (tester_data_extracted['emoticon_score'] == 0)
# tester_data_extracted['lexicon_score'].iloc[zero_cond.values] = -2 

In [None]:
features = ['lexicon_score', 'emoticon_score', 'jj', 'neg']
target = 'sentimen'

X, y = train_data_extracted[features].values, train_data_extracted[target].values
Xx = tester_data_extracted[features].values

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

classifiers = [
    ('Decission Tree', DecisionTreeClassifier()),
    ('Logistic Regression', LogisticRegression()),
    ('SVM', LinearSVC()),
    ('Multinomial Naive Bayes', MultinomialNB()),
    ('KNN', KNeighborsClassifier()),
    ('Ensemble', GradientBoostingClassifier())
]

from sklearn.model_selection import KFold

train_scores = 0
test_scores = 0

kfold = KFold(n_splits=10, random_state=46)
dt = DecisionTreeClassifier()

for train_index, test_index in kfold.split(X):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    dt.fit(X_train, y_train)
    train_scores += accuracy_score(dt.predict(X_train), y_train)
    test_scores += accuracy_score(dt.predict(X_test), y_test)
    
print(train_scores / 10)
print(test_scores / 10)

In [None]:
predicted = dt.predict(X)
train_data_extracted['predicted'] = predicted

In [None]:
train_data_extracted[train_data_extracted['predicted'] != train_data_extracted['sentimen']].head(40).tail(20)

In [None]:
tester_predicted = dt.predict(Xx)

In [None]:
tester_predicted

In [None]:
tester_data_extracted['predicted'] = tester_predicted

In [None]:
tester_data_extracted

In [None]:
tester_data_extracted.to_csv('results8.csv', header=False, index=False, columns=['test_ID', 'predicted'])

In [None]:
extract_emoticon("cie andien yang lagi bep marah marah mulu:p :) :) :) :)")

In [None]:
extract_positive_lexicon('tidak kreatif ambil kutipan orang tertawa suka iya ungkapin ditikung')

In [None]:
extract_positive_lexicon('tidak kreatif')