In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import Sastrawi
import nltk

import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from nltk.tag import CRFTagger
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

In [3]:
raw_data = pd.read_csv('train_set.csv', delimiter=',', encoding='Latin-1')

In [4]:
raw_data.head(10)

Unnamed: 0,id,sentimen,tweet
0,1,1,oks kak semangat ya kalian kalian
1,2,0,sekarang harus kaya orang bodoh lagi bodoh sangat
2,3,1,"Begitu diumumkan lulus 100%, mereka semua suju..."
3,4,0,[USERNAME] [USERNAME] Katanya Bapak Reformasi ...
4,5,0,macet macetan perut kosong akhirnya mampir dah...
5,6,0,Pernyataan paling mengganggu telinga malam ini...
6,7,1,Masi belum move on dari poto poto ini. Ceritan...
7,8,1,"Dibalik kecemburuan,terselip rasa kasih sayang..."
8,9,1,Kalo udah sayang beneran itu mau dihadapkan sa...
9,10,1,"[USERNAME] Pagi juga mas nyaa, duhh jadi gaena..."


In [32]:
def ortografi_exception(tweet):
    special_list = ['[USERNAME]', '[URL]', '[SENSITIVE-NO]']
    for sp in special_list:
        tweet = tweet.replace(sp, '')
    return tweet

In [33]:
def ortografi_capital_count(tweet):
    return sum(1 for c in ortografi_exception(tweet) if c.isupper())

In [34]:
def ortografi_char_count(tweet):
    return len(tweet)

In [35]:
def ortografi_word_count(tweet):
    return  len(nltk.word_tokenize(tweet))

In [36]:
def ortografi_exclamation_count(tweet):
    return  sum((1 for c in tweet if c == "!"))

In [37]:
def ortografi_word_capital_count(tweet):
    words = nltk.word_tokenize(ortografi_exception(tweet))
    count = 0
    for word in words:
        if all([c.isupper() for c in word]):
            count = count + 1
    return count

In [31]:
def generate_ortografi_feature(tweets):
    tweets['capital_count'] = tweets['tweet'].apply(ortografi_capital_count)
    tweets['exclamation_count'] = tweets['tweet'].apply(ortografi_exclamation_count)
    tweets['word_count'] = tweets['tweet'].apply(ortografi_word_count)
    tweets['char_count'] = tweets['tweet'].apply(ortografi_char_count)
    tweets['word_capital_count'] = tweets['tweet'].apply(ortografi_word_capital_count)
    return tweets

In [12]:
data_ort = generate_ortografi_feature(raw_data)

In [13]:
data_ort.head(10)

Unnamed: 0,id,sentimen,tweet,capital_count,exclamation_count,word_count,char_count,word_capital_count
0,1,1,oks kak semangat ya kalian kalian,0,0,6,33,0
1,2,0,sekarang harus kaya orang bodoh lagi bodoh sangat,0,0,8,49,0
2,3,1,"Begitu diumumkan lulus 100%, mereka semua suju...",1,0,46,219,0
3,4,0,[USERNAME] [USERNAME] Katanya Bapak Reformasi ...,7,0,44,252,0
4,5,0,macet macetan perut kosong akhirnya mampir dah...,0,0,18,89,0
5,6,0,Pernyataan paling mengganggu telinga malam ini...,12,0,46,277,0
6,7,1,Masi belum move on dari poto poto ini. Ceritan...,5,0,28,154,0
7,8,1,"Dibalik kecemburuan,terselip rasa kasih sayang...",2,0,29,150,0
8,9,1,Kalo udah sayang beneran itu mau dihadapkan sa...,1,0,31,157,0
9,10,1,"[USERNAME] Pagi juga mas nyaa, duhh jadi gaena...",4,0,34,169,0


In [43]:
def normalisasi(tweet):
    emoticons = [
        (' emotxsenyum ', [':-)', ':)', '(:', '(-:', '^^', '^-^', '^_^', ':"")', ':")']),
        (' emotxtawa ', [':-D', ':D', 'X-D', 'XD', 'xD']),
        (' emotxcinta ', ['<3', ':\*']),
        (' emotxkedip ', [';-)', ';)', ';-D', ';D', '(;', '(-;']),
        (' emotxsedih ', [':(', ':-(', '):', ')-:']),
        (' emotxtangis ', [':,(', ':\'(', ':"(', ':((', 'T_T']),
        (' emotxejek ', [':p', ':P', ':-p', ':-P', '=p', '=P']),
        (' emotxdatar ', ['-_-']),
        (' emotxjempol ', ['(y)', 'thumbsup'])
    ]
    normal_tw = tweet.lower() #lowercase
    normal_tw = re.sub('(\.){1,}', ' ', normal_tw)
    normal_tw = re.sub('\s+', ' ', normal_tw) # remove extra space
    normal_tw = normal_tw.strip() #trim depan belakang
    normal_tw = re.sub(r'(wk){2,}|(ck){2,}|(ha){2,}|(he){2,}', ' emotxtawa ', normal_tw)
    normal_tw = re.sub(r'(hiks)|(hu){2,}', ' emotxtangis ', normal_tw)
    for emot_text, emot_symbols in emoticons:
        for symbol in emot_symbols:
            normal_tw = normal_tw.replace(symbol, emot_text)
    normal_tw   =   re.sub(r'[^\w\s\.]',' ',normal_tw)   #buang punctuation
    normal_tw = re.sub(r'([A-Za-z])\1{1,}\s', r'\1', normal_tw)
    normal_tw = re.sub(r'([A-Za-z])\1{1,}$', r'\1', normal_tw)
    return normal_tw

In [15]:
re.sub(r'[^\w\s\.]', ' ', 'kemana-mana ')

'kemana mana '

In [16]:
tweet_normalized = data_ort.copy()
tweet_normalized['tweet'] = tweet_normalized['tweet'].apply(normalisasi)
tweet_normalized.head(10)

Unnamed: 0,id,sentimen,tweet,capital_count,exclamation_count,word_count,char_count,word_capital_count
0,1,1,oks kak semangat ya kalian kalian,0,0,6,33,0
1,2,0,sekarang harus kaya orang bodoh lagi bodoh sangat,0,0,8,49,0
2,3,1,begitu diumumkan lulus 100 mereka semua suju...,1,0,46,219,0
3,4,0,username username katanya bapak reformasi ...,7,0,44,252,0
4,5,0,macet macetan perut kosong akhirnya mampir dah...,0,0,18,89,0
5,6,0,pernyataan paling mengganggu telinga malam ini...,12,0,46,277,0
6,7,1,masi belum move on dari poto poto ini ceritany...,5,0,28,154,0
7,8,1,dibalik kecemburuan terselip rasa kasih sayang...,2,0,29,150,0
8,9,1,kalo udah sayang beneran itu mau dihadapkan sa...,1,0,31,157,0
9,10,1,username pagi juga mas nya duhjadi gaenak da...,4,0,34,169,0


In [17]:
tweet_normalized.to_csv('data_normalized.csv', columns=['id', 'sentimen', 'tweet', 'capital_count', 'exclamation_count', 'word_count', 'char_count', 'word_capital_count'], index=False, header=False)

Dilakukan formalizer menggunakan script di Java

In [4]:
data_formalized = pd.read_csv('data_formalized.csv', encoding='Latin-1')

In [5]:
data_formalized.head(10)

Unnamed: 0,id,sentimen,tweet,capital_count,exclamation_count,word_count,char_count,word_capital_count
0,1,1,oks kak semangat iya kalian kalian,0,0,6,33,0
1,2,0,sekarang harus kaya orang bodoh lagi bodoh san...,0,0,8,49,0
2,3,1,begitu diumumkan lulus 100 mereka semua sujud ...,1,0,46,219,0
3,4,0,username username katanya bapak reformasi dan...,7,0,44,252,0
4,5,0,macet macetan perut kosong akhirnya mampir dah...,0,0,18,89,0
5,6,0,pernyataan paling mengganggu telinga malam ini...,12,0,46,277,0
6,7,1,masi belum move on dari poto poto ini ceritany...,5,0,28,154,0
7,8,1,dibalik kecemburuan terselip rasa kasih sayang...,2,0,29,150,0
8,9,1,kalau sudah sayang beneran itu mau dihadapkan ...,1,0,31,157,0
9,10,1,username pagi juga mas nya duhjadi gaenak dap...,4,0,34,169,0


In [2]:
stopwords = pd.read_csv('stopwords.txt', header=None)[0].values

In [7]:
def remove_stopwords(tweet, stopwords):
    special_list = ['username', 'url', 'sensitive-no']
    token = nltk.word_tokenize(tweet)
    token_afterremoval = []
    for k in token:
        if k not in stopwords and k not in special_list:
            token_afterremoval.append(k)
    str_clean = ' '.join(token_afterremoval)
    return str_clean

In [8]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory


def stemming(tweet):
    token = nltk.word_tokenize(tweet)
    stem_kalimat = []
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    for k in token:
        stem_kata = stemmer.stem(k)
        stem_kalimat.append(stem_kata)
    stem_kalimat_str = ' '.join(stem_kalimat)
    return stem_kalimat_str

In [9]:
def pre_processing(tweets):
    temp_tweets = tweets.copy()
    temp_tweets['tweet'] = temp_tweets['tweet'].apply(lambda tweet: remove_stopwords(tweet, stopwords))
    temp_tweets['tweet'] = temp_tweets['tweet'].apply(stemming)
    return temp_tweets

In [5]:
# data_preprocess = pre_processing(data_formalized)
data_preprocess = pd.read_csv('train_data_preprocessed.csv', delimiter=',', encoding='Latin-1', names=['id', 'sentimen', 'tweet', 'capital_count', 'exclamation_count', 'word_count', 'char_count', 'word_capital_count'])

In [11]:
data_preprocess.to_csv('train_data_preprocessed.csv', columns=['id', 'sentimen', 'tweet', 'capital_count', 'exclamation_count', 'word_count', 'char_count', 'word_capital_count'], index=False, header=False)

In [132]:
data_preprocess

Unnamed: 0,id,sentimen,tweet,capital_count,exclamation_count,word_count,char_count,word_capital_count
0,1,1,oks kak semangat iya,0,0,6,33,0
1,2,0,kaya orang bodoh bodoh sangat,0,0,8,49,0
2,3,1,umum lulus 100 sujud syukur langsung ambil bun...,1,0,46,219,0
3,4,0,reformasi demokrasi negeri nyata tidak aku leb...,7,0,44,252,0
4,5,0,macet macet perut kosong mampir soto betawi at...,0,0,18,89,0
5,6,0,nyata ganggu telinga malam mulut juru bicara p...,12,0,46,277,0
6,7,1,mas move on poto poto cerita nyobain face filt...,5,0,28,154,0
7,8,1,balik cemburu selip kasih sayang cemburu hubun...,2,0,29,150,0
8,9,1,sayang beneran hadap cowok kaya ganteng pinter...,1,0,31,157,0
9,10,1,pagi mas nya duhjadi gaenak ucap met pagi neti...,4,0,34,169,0


In [6]:
data_preprocess.head(10)

Unnamed: 0,id,sentimen,tweet,capital_count,exclamation_count,word_count,char_count,word_capital_count
0,1,1,oks kak semangat iya,0,0,6,33,0
1,2,0,kaya orang bodoh bodoh sangat,0,0,8,49,0
2,3,1,umum lulus 100 sujud syukur langsung ambil bun...,1,0,46,219,0
3,4,0,reformasi demokrasi negeri nyata tidak aku leb...,7,0,44,252,0
4,5,0,macet macet perut kosong mampir soto betawi at...,0,0,18,89,0
5,6,0,nyata ganggu telinga malam mulut juru bicara p...,12,0,46,277,0
6,7,1,mas move on poto poto cerita nyobain face filt...,5,0,28,154,0
7,8,1,balik cemburu selip kasih sayang cemburu hubun...,2,0,29,150,0
8,9,1,sayang beneran hadap cowok kaya ganteng pinter...,1,0,31,157,0
9,10,1,pagi mas nya duhjadi gaenak ucap met pagi neti...,4,0,34,169,0


In [54]:
def extract_emoticon(tweet):
    emoticons = [
        ('emotxsenyum', 5),
        ('emotxtawa', 5),
        ('emotxcinta', 5),
        ('emotxkedip', 3),
        ('emotxsedih', -5),
        ('emotxtangis', -5),
        ('emotxejek', 1),
        ('emotxdatar', -4),
        ('emotxjempol', 5)
    ]
    score = 0
    if tweet is not None and type(tweet) == str:
        words = nltk.word_tokenize(tweet)
        for word in words:
            for emot, emot_score in emoticons:
                if word == emot:
                    score += emot_score
        return score
    return 0
        

In [8]:
negatives = pd.read_csv('negative_koto.tsv', header=None, delimiter='\t', names=['word', 'word_score'])
negs = negatives['word'].tolist()
neg_scores = negatives['word_score'].tolist()

lexicon_neg = {}

for i in range(len(negs)):
    lexicon_neg[negs[i]] = neg_scores[i]

def extract_sentiment_lexicon_negative(tweet):
    words = nltk.word_tokenize(tweet)
    score = 0
    for i in range(len(words)):
        if words[i] in lexicon_neg.keys():
            score += lexicon_neg[words[i]]
    return score

def extract_sentiment_lexicon_negative_negation(tweet):
    words = nltk.word_tokenize(tweet)
    negation_score = 0
    for i in range(len(words)):
        if words[i] in lexicon_neg.keys():
            if i > 0 and (words[i-1] == 'tidak' or words[i-1] == 'tak' or words[i-1] == 'tdk'):
                negation_score += lexicon_neg[words[i]]
    return negation_score

In [108]:
positives_vania = pd.read_csv('positif_vania.txt', header=None, names=['word'])
lexicon_positive_vania = set(positives_vania['word'].tolist())

def extract_sentiment_lexicon_positive_vania(tweet):
    if type(tweet) != str:
        return 0
    score = 0
    words = nltk.word_tokenize(tweet)
    for i in range(len(words)):
        if words[i] in lexicon_positive_vania:
            score += 1
            if (i > 0 and words[i-1] == 'sangat') or (i + 1 < len(words) and words[i+1] == 'banget'):
                score += 1
            if (i > 0 and (words[i-1] == 'tidak' or words[i-1] == 'tdk' or words[i-1] == 'tak')):
                score -= 2
#     for lexicon in lexicon_positive_vania:
#         if lexicon in tweet:
#             score += 1
    return score

In [107]:
negative_vania = pd.read_csv('negatif_vania.txt', header=None, names=['word'])
lexicon_negative_vania = set(negative_vania['word'].tolist())

def extract_sentiment_lexicon_negative_vania(tweet):
    if type(tweet) != str:
        return 0
    score = 0
    words = nltk.word_tokenize(tweet)
    for i in range(len(words)):
        if words[i] in lexicon_negative_vania:
            score += 1
            if (i > 0 and words[i-1] == 'sangat') or (i + 1 < len(words)  and words[i+1] == 'banget'):
                score += 1
            if (i > 0 and (words[i-1] == 'tidak' or words[i-1] == 'tdk' or words[i-1] == 'tak')):
                score -= 2
#     for lexicon in lexicon_negative_vania:
#         if lexicon in tweet:
#             score += 1
    return score

In [43]:
positives = pd.read_csv('positive_koto.tsv', header=None, delimiter='\t', names=['word', 'word_score'])
poss = positives['word'].tolist()
pos_scores = positives['word_score'].tolist()

lexicon_pos = {}
for i in range(len(poss)):
    lexicon_pos[poss[i]] = pos_scores[i]
    
def extract_sentiment_lexicon_positive(tweet):
    words = nltk.word_tokenize(tweet)
    score = 0
    for i in range(len(words)):
        if words[i] in lexicon_pos.keys():
            score += lexicon_pos[words[i]]
    return score

def extract_sentiment_lexicon_positive_negation(tweet):
    words = nltk.word_tokenize(tweet)
    negation_score = 0
    for i in range(len(words)):
        if words[i] in lexicon_pos.keys():
            if i > 0 and (words[i-1] == 'tidak' or words[i-1] == 'tak' or words[i-1] == 'tdk'):
                negation_score += lexicon_pos[words[i]]
    return negation_score

In [109]:
def feature_extraction(tweets):
    tweets_temp = tweets.copy()
    tweets_temp['emoticon_score'] = tweets_temp['tweet'].apply(extract_emoticon)
#     tweets_temp['positive_score'] = tweets_temp['tweet'].apply(extract_sentiment_lexicon_positive)
#     tweets_temp['positive_negation_score'] = tweets_temp['tweet'].apply(extract_sentiment_lexicon_positive_negation)
#     tweets_temp['negative_score'] = tweets_temp['tweet'].apply(extract_sentiment_lexicon_negative)
#     tweets_temp['negative_negation_score'] = tweets_temp['tweet'].apply(extract_sentiment_lexicon_negative_negation)
#     tweets_temp['positive_score'] -= tweets_temp['negative_negation_score']
#     tweets_temp['negative_score'] -= tweets_temp['positive_negation_score']
    tweets_temp['negative_score_vania'] = tweets_temp['tweet'].apply(extract_sentiment_lexicon_negative_vania)
    tweets_temp['positive_score_vania'] = tweets_temp['tweet'].apply(extract_sentiment_lexicon_positive_vania)
    tweets_temp['lexicon_score'] = tweets_temp['positive_score_vania'] - tweets_temp['negative_score_vania']
    return tweets_temp

In [110]:
data_feature_extracted = feature_extraction(data_preprocess)

In [111]:
data_feature_extracted.head(10)

Unnamed: 0,id,sentimen,tweet,capital_count,exclamation_count,word_count,char_count,word_capital_count,emoticon_score,negative_score_vania,positive_score_vania,lexicon_score
0,1,1,oks kak semangat iya,0,0,6,33,0,0,0,1,1
1,2,0,kaya orang bodoh bodoh sangat,0,0,8,49,0,0,2,0,-2
2,3,1,umum lulus 100 sujud syukur langsung ambil bun...,1,0,46,219,0,0,0,1,1
3,4,0,reformasi demokrasi negeri nyata tidak aku leb...,7,0,44,252,0,0,1,2,1
4,5,0,macet macet perut kosong mampir soto betawi at...,0,0,18,89,0,0,3,0,-3
5,6,0,nyata ganggu telinga malam mulut juru bicara p...,12,0,46,277,0,0,1,0,-1
6,7,1,mas move on poto poto cerita nyobain face filt...,5,0,28,154,0,0,0,1,1
7,8,1,balik cemburu selip kasih sayang cemburu hubun...,2,0,29,150,0,5,2,3,1
8,9,1,sayang beneran hadap cowok kaya ganteng pinter...,1,0,31,157,0,0,0,3,3
9,10,1,pagi mas nya duhjadi gaenak ucap met pagi neti...,4,0,34,169,0,10,1,0,-1


In [112]:
features = ['emoticon_score', 'lexicon_score']
target = 'sentimen'

X, y = data_feature_extracted[features].values, data_feature_extracted[target].values

In [127]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

classifiers = [
    ('Decission Tree', DecisionTreeClassifier()),
    ('Logistic Regression', LogisticRegression()),
    ('SVM', LinearSVC()),
    ('Multinomial Naive Bayes', MultinomialNB()),
    ('KNN', KNeighborsClassifier()),
    ('Ensemble', GradientBoostingClassifier())
]

from sklearn.model_selection import KFold

train_scores = 0
test_scores = 0

kfold = KFold(n_splits=10, random_state=46)
dt = RidgeClassifier()

for train_index, test_index in kfold.split(X):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    dt.fit(X_train, y_train)
    train_scores += accuracy_score(dt.predict(X_train), y_train)
    test_scores += accuracy_score(dt.predict(X_test), y_test)
    
print(train_scores / 10)
print(test_scores / 10)

0.8200460935841933
0.8200363145708052


In [24]:
temp_data = data_feature_extracted.copy()
temp_data['predicted'] = dt.predict(X)

In [25]:
temp_temp_data = temp_data[['id', 'sentimen', 'predicted', 'tweet']]
temp_temp_data['ls'] = temp_data['lexicon_score']
temp_temp_data['es'] = temp_data['emoticon_score']

temp_temp_data['tweet'] = temp_temp_data['tweet'].str.wrap(1000)

In [26]:
pd.set_option('display.max_colwidth', 5000)

In [27]:
temp_temp_data[temp_temp_data['predicted'] != temp_temp_data['sentimen']].head(40).tail(10)

Unnamed: 0,id,sentimen,predicted,tweet,ls,es
206,207,1,0,jokowi hasil banget deh pokok bikin acara pesta hibur rakyat monas macet manatahaan http t co piqhkoawzg,-1,0
217,218,1,0,hibur hape tidak lihat recehkantwitter iya google assistant ngirimin joke hidup uang kembali,0,0
228,229,0,1,sumpah iya komplen uang debet bca cepat dikembaliin b pn bca hari b pn minggu minggu dibalikin nominal debet gede bca mana that s why i cinta bca your service satisfies me always,2,0
251,252,1,0,asa baca novel jenis novel destar informasi sangat riil kisah nyata cacat hukum,-1,0
257,258,1,0,emas main badminton rekan dwi sulistyono sabet juara kelas double junior piala bupati 2008 2010 semi final single junior 2007 ngiang,0,0
259,260,0,1,tak pajang hadist ayat allah benar niat tak jual ayat hadist benar,2,0
270,271,1,0,muka w mang tidak indah at least w jalan indah myfaceismyproblemnotyours dibikinasikaja familytime qualitytime munduk bal indonesia,0,0
273,274,1,0,alah gede barusan senyum jarak meter lemas malam ketemu iya nc,-1,0
277,278,1,0,poin artis emotxtawa alhamdulillah sehat 2 sibuk ma anak sibuk ma suami sibuk nikah rosa sibuk koreanan emotxtawa,-2,10
279,280,0,1,tunggu anak anteng sabar luas samudra bapak dateng-dateng bikin anak menang hmkuingin marah lampias,1,0


In [38]:
raw_tester_data = pd.read_csv('test_set.csv', delimiter=',', encoding='Latin-1')

In [39]:
raw_tester_data

Unnamed: 0,test_ID,tweet
0,0,"Jadi wanita jangan suka menghancurkan hubungan orang. Jgn bangga berhasil merusak kebahagian orang. Silahkan saja, tapi ga berkah bahagianya nanti hehe."
1,1,sombong apanya kalau sms saja dibls terus
2,2,apadah kamu :p cie cie baik kamu cie bebe cie kiwkiw ;)
3,3,tdrlah besok medical check up semoga lancar â?º wml
4,4,crew serbu bsm seru (at bank syariah mandiri bekasi) [pic] â??
5,5,sian amat ditelantarin:'| tapi semua akan indah pada waktunya nge:') loh
6,6,dan dirikanlah sembahyang tunaikanlah zakat dan tatlah kepada rasul supaya kamu diberi rahmat (24:56)
7,7,kamu pikir saya bandar bayarin makan ! evil dead pokoknya star trek keren tapi
8,8,tidak-kreatif ambil kutipan orang wkwk kalau suka ya ungkapin saja dari pada ditikung ama
9,9,ya jangan dibahas di twiter juga kali ven teman saya tuh wkwk yap buktinya dia masih sayang sama mantanya han


In [40]:
tester_data_ort = generate_ortografi_feature(raw_tester_data)

In [41]:
tester_data_ort

Unnamed: 0,test_ID,tweet,capital_count,exclamation_count,word_count,char_count,word_capital_count
0,0,"Jadi wanita jangan suka menghancurkan hubungan orang. Jgn bangga berhasil merusak kebahagian orang. Silahkan saja, tapi ga berkah bahagianya nanti hehe.",3,0,25,152,0
1,1,sombong apanya kalau sms saja dibls terus,0,0,7,41,0
2,2,apadah kamu :p cie cie baik kamu cie bebe cie kiwkiw ;),0,0,14,55,0
3,3,tdrlah besok medical check up semoga lancar â?º wml,0,0,11,51,0
4,4,crew serbu bsm seru (at bank syariah mandiri bekasi) [pic] â??,0,0,17,62,0
5,5,sian amat ditelantarin:'| tapi semua akan indah pada waktunya nge:') loh,0,0,16,72,0
6,6,dan dirikanlah sembahyang tunaikanlah zakat dan tatlah kepada rasul supaya kamu diberi rahmat (24:56),0,0,16,101,0
7,7,kamu pikir saya bandar bayarin makan ! evil dead pokoknya star trek keren tapi,0,1,14,78,0
8,8,tidak-kreatif ambil kutipan orang wkwk kalau suka ya ungkapin saja dari pada ditikung ama,0,0,14,89,0
9,9,ya jangan dibahas di twiter juga kali ven teman saya tuh wkwk yap buktinya dia masih sayang sama mantanya han,0,0,20,109,0


In [44]:
tester_tweet_normalized = tester_data_ort.copy()
tester_tweet_normalized['tweet'] = tester_tweet_normalized['tweet'].apply(normalisasi)

In [45]:
tester_tweet_normalized['tweet'].values

array(['jadi wanita jangan suka menghancurkan hubungan orang jgn bangga berhasil merusak kebahagian orang silahkan saja  tapi ga berkah bahagianya nanti  emotxtawa ',
       'sombong apanya kalau sms saja dibls terus',
       'apadah kamu  emotxejek  cie cie baik kamu cie bebe cie kiwkiw  emotxkedip ',
       ..., 'serius   ', 'terserah juga oke',
       'sedihan juga saya laptop saya rusak parah   '], dtype=object)

In [49]:
tester_tweet_normalized.to_csv('tester_data_normalized.csv', columns=['test_ID', 'tweet', 'capital_count', 'exclamation_count', 'word_count', 'char_count', 'word_capital_count'], index=False, header=False)

In [52]:
tester_data_formalized = pd.read_csv('tester_data_formalized.csv', encoding='Latin-1')

In [53]:
tester_data_formalized

Unnamed: 0,test_ID,tweet,capital_count,exclamation_count,word_count,char_count,word_capital_count
0,0,jadi wanita jangan suka menghancurkan hubungan orang jgn bangga berhasil merusak kebahagian orang silahkan saja tapi ga berkah bahagianya nanti emotxtawa,3,0,25,152,0
1,1,sombong apanya kalau sms saja dibls terus,0,0,7,41,0
2,2,apadah kamu emotxejek cie cie baik kamu cie bebe cie kiwkiw emotxkedip,0,0,14,55,0
3,3,tdrlah besok medical check up semoga lancar Ã¢ Âº wml,0,0,11,51,0
4,4,crew serbu bsm seru at bank syariah mandiri bekasi pic Ã¢,0,0,17,62,0
5,5,sian amat ditelantarin tapi semua akan indah pada waktunya nge loh,0,0,16,72,0
6,6,dan dirikanlah sembahyang tunaikanlah zakat dan tatlah kepada rasul supaya kamu diberi rahmat 24 56,0,0,16,101,0
7,7,kamu pikir saya bandar bayarin makan evil dead pokoknya star trek keren tapi,0,1,14,78,0
8,8,tidak kreatif ambil kutipan orang emotxtawa kalau suka ya ungkapin saja dari pada ditikung ama,0,0,14,89,0
9,9,ya jangan dibahas di twiter juga kali ven teman saya tuh emotxtawa yap buktinya dia masih sayang sama mantanya han,0,0,20,109,0


In [116]:
# tester_data_preprocess = pre_processing(tester_data_formalized)
tester_data_preprocess = pd.read_csv('tester_data_preprocess.csv', names=['test_ID', 'tweet', 'capital_count', 'exclamation_count', 'word_count', 'char_count', 'word_capital_count'], delimiter=',', encoding='Latin-1')

In [117]:
tester_data_preprocess

Unnamed: 0,test_ID,tweet,capital_count,exclamation_count,word_count,char_count,word_capital_count
0,0,wanita suka hancur hubung orang jgn bangga has...,3,0,25,152,0
1,1,sombong apa sms dibls,0,0,7,41,0
2,2,apadah emotxejek cie cie cie bebe cie kiwkiw e...,0,0,14,55,0
3,3,tdrlah besok medical check up moga lancar wml,0,0,11,51,0
4,4,crew serbu bsm seru at bank syariah mandiri be...,0,0,17,62,0
5,5,sian ditelantarin indah nge loh,0,0,16,72,0
6,6,diri sembahyang tunai zakat tatlah rasul rahma...,0,0,16,101,0
7,7,pikir bandar bayarin makan evil dead pokok sta...,0,1,14,78,0
8,8,tidak kreatif ambil kutip orang emotxtawa suka...,0,0,14,89,0
9,9,ya bahas twiter kali ven teman tuh emotxtawa y...,0,0,20,109,0


In [118]:
tester_data_feature_extracted = feature_extraction(tester_data_preprocess)

In [119]:
tester_data_feature_extracted

Unnamed: 0,test_ID,tweet,capital_count,exclamation_count,word_count,char_count,word_capital_count,emoticon_score,negative_score_vania,positive_score_vania,lexicon_score
0,0,wanita suka hancur hubung orang jgn bangga has...,3,0,25,152,0,5,2,3,1
1,1,sombong apa sms dibls,0,0,7,41,0,0,1,0,-1
2,2,apadah emotxejek cie cie cie bebe cie kiwkiw e...,0,0,14,55,0,4,0,0,0
3,3,tdrlah besok medical check up moga lancar wml,0,0,11,51,0,0,0,1,1
4,4,crew serbu bsm seru at bank syariah mandiri be...,0,0,17,62,0,0,0,2,2
5,5,sian ditelantarin indah nge loh,0,0,16,72,0,0,0,1,1
6,6,diri sembahyang tunai zakat tatlah rasul rahma...,0,0,16,101,0,0,0,1,1
7,7,pikir bandar bayarin makan evil dead pokok sta...,0,1,14,78,0,0,0,1,1
8,8,tidak kreatif ambil kutip orang emotxtawa suka...,0,0,14,89,0,5,0,0,0
9,9,ya bahas twiter kali ven teman tuh emotxtawa y...,0,0,20,109,0,5,0,1,1


In [120]:
features = ['emoticon_score', 'lexicon_score']
Xx = tester_data_feature_extracted[features].values

In [128]:
predicted = dt.predict(Xx)

In [129]:
results = pd.DataFrame()
results['id'] = tester_data_feature_extracted['test_ID']
results['predicted_sentimen'] = predicted

In [130]:
results

Unnamed: 0,id,predicted_sentimen
0,0,1
1,1,0
2,2,1
3,3,1
4,4,1
5,5,1
6,6,1
7,7,1
8,8,1
9,9,1


In [131]:
results.to_csv('results2.csv', header=False, index=False)

In [64]:
tester_data_preprocess.to_csv('tester_data_preprocess.csv', columns=['test_ID', 'tweet', 'capital_count', 'exclamation_count', 'word_count', 'char_count', 'word_capital_count'], index=False, header=False)

In [None]:
tester_data_preprocess.to_csv('tester_data_preprocess.csv', columns=['test_ID', 'tweet', 'capital_count', 'exclamation_count', 'word_count', 'char_count', 'word_capital_count'], index=False, header=False)

In [104]:
tester_data_feature_extracted

Unnamed: 0,id,tweet,capital_count,exclamation_count,word_count,char_count,word_capital_count,emoticon_score,positive_score,positive_negation_score,negative_score,negative_negation_score,negative_score_vania,positive_score_vania
0,0,sila keluarga seokjin gue sayangg bangett plea...,0,0,42,254,0,0,19,0,-35,0,1,3
1,1,hidup segan mati tak,0,1,6,25,0,0,6,0,-9,0,1,0
2,2,pagipagi liat ka ditl ku pagi k manis emotxsenyum,0,0,11,47,0,5,8,0,-6,0,0,1
3,3,emotxdatar pagi ata calon istri bang wowon emo...,0,0,19,91,0,6,13,0,-9,0,0,2
4,4,tay gilir tengkar ngebocorin busuk gamalu ape ...,0,1,19,122,0,0,0,0,-19,0,4,0
5,5,berita via vallen dapet dm main sepak bola kes...,4,0,46,238,1,0,12,0,-30,0,1,0
6,6,orang ga nyindirnyindir bisa ngomong langsung ...,2,0,37,202,0,0,17,0,-14,0,1,0
7,7,gelas semangat penuh cinta emotxsenyum,0,0,6,31,0,5,7,0,-2,0,1,2
8,8,gua cinta banget bang wawan abang gua ya karna...,107,0,29,136,23,8,13,0,-19,0,2,2
9,9,haduh bis kasihan hujan batu kaya hancur kacak...,0,0,16,106,0,0,14,0,-18,0,3,0


In [105]:
tester_data_feature_extracted['tweet'].tolist()

['sila keluarga seokjin gue sayangg bangett please krn bokapnya yg ceoibunya yg ex miss korea tp gue sayang sm seokjin bener bener juang bangtan tau bangtan susah kayak gimana',
 'hidup segan mati tak',
 'pagipagi liat ka ditl ku pagi k manis emotxsenyum',
 'emotxdatar pagi ata calon istri bang wowon emotxsenyum pagi bolonemotxsenyum selamat pagi selamat aktivitas emotxsenyum',
 'tay gilir tengkar ngebocorin busuk gamalu ape ngomngin busuk muka busuk',
 'berita via vallen dapet dm main sepak bola kes goda sexual harassment liat komennya bilang lebay gak publish kek gitu aja parah nyinyir perempuan miris',
 'orang ga nyindirnyindir bisa ngomong langsung ampun serta dg bumbu kesal maaf teuteup ngoreksi hrs kayak gimana',
 'gelas semangat penuh cinta emotxsenyum',
 'gua cinta banget bang wawan abang gua ya karna salah gaul kaya gua emotxsenyum sama emotxtawa',
 'haduh bis kasihan hujan batu kaya hancur kacakacanya miris liat jadi kaya gin']