In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import Sastrawi
import nltk

import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from nltk.tag import CRFTagger
from collections import Counter
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_colwidth', 300)

In [2]:
train_data = pd.read_csv('dataset/train_set.csv', delimiter=',', encoding='Latin')
tester_data = pd.read_csv('dataset/test_set.csv', delimiter=',', encoding='Latin')
tambahan_data = pd.read_csv('tambahan.csv', delimiter=',', encoding='Latin')

In [3]:
emoticons = pd.read_csv('emoticon.txt', delimiter='\t', names=['emoticon', 'emoticon_score'])

In [4]:
emoticon_texts = emoticons['emoticon'].tolist()
emoticon_scores = emoticons['emoticon_score'].tolist()

In [5]:
emoticonset = set()
for i in range(len(emoticon_texts)):
    emoticonset.add((emoticon_texts[i], emoticon_scores[i]))

In [6]:
def ortografi_exception(tweet):
    special_list = ['[USERNAME]', '[URL]', '[SENSITIVE-NO]']
    for sp in special_list:
        tweet = tweet.replace(sp, '')
    return tweet

def extract_ortografi_word_capital_count(tweet):
    words = nltk.word_tokenize(ortografi_exception(tweet))
    count = 0
    for word in words:
        if all([c.isupper() for c in word]):
            count = count + 1
    return count / len(words)

In [7]:
def extract_exclamation_count(tweet):
    tweet = re.sub(r'!{1,}', '!', tweet)
    return sum((1 for c in tweet if c == "!"))

In [8]:
train_data['ortografi'] = train_data['tweet'].apply(extract_ortografi_word_capital_count)
tester_data['ortografi'] = tester_data['tweet'].apply(extract_ortografi_word_capital_count)

train_data['exclamation'] = train_data['tweet'].apply(extract_exclamation_count)
tester_data['exclamation'] = tester_data['tweet'].apply(extract_exclamation_count)


tambahan_data['ortografi'] = tambahan_data['tweet'].apply(extract_ortografi_word_capital_count)
tambahan_data['exclamation'] = tambahan_data['tweet'].apply(extract_exclamation_count)


In [9]:
import re 
def extract_emoticon(tweet):
    score = 0
    for emoticon_text, emoticon_score in emoticonset:
        occurence = 0
        if emoticon_text in tweet:
            score += emoticon_score
        for i in range(len(tweet) - len(emoticon_text) - 1):
            if len(emoticon_text) <= len(tweet) and tweet[i:(i+len(emoticon_text))] == emoticon_text:
                occurence += 1
        score += (occurence * emoticon_score)
    return score

In [10]:
extract_emoticon("cie andien yang lagi bep marah marah mulu :p :) :) :) :) :-) :(")

6

In [11]:
train_data['emoticon_score'] = train_data['tweet'].apply(extract_emoticon)
tester_data['emoticon_score'] = tester_data['tweet'].apply(extract_emoticon)
tambahan_data['emoticon_score'] = tambahan_data['tweet'].apply(extract_emoticon)


In [12]:
kbba_ = pd.read_csv('kbba.txt', delimiter='\t', names=['from', 'to'])
kbba_from = kbba_['from'].tolist()
kbba_to = kbba_['to'].tolist()

kbba_repo = list()
for i in range(len(kbba_from)):
    kbba_repo.append((kbba_from[i], kbba_to[i]))
    
abbr_ = pd.read_csv('singkatan-lib.csv', delimiter=',', names=['from', 'to'])
abbr_from = abbr_['from'].tolist()
abbr_to = abbr_['to'].tolist()

abbr_repo = list()
for i in range(len(abbr_from)):
    abbr_repo.append((abbr_from[i], abbr_to[i]))

noises_ = pd.read_csv('noise.txt', names=['noise'])
noises_repo = noises_['noise'].tolist()

def normalisasi(tweet):
    normal_tw = tweet.lower() #lowercase
    normal_tw_words = nltk.word_tokenize(normal_tw)
    
    normal_tw_words_normalized = [""]
    for word in normal_tw_words:
        match = False
        for kbba_f, kbba_t in kbba_repo:
            if word == kbba_f:
                normal_tw_words_normalized.append(kbba_t)
                match = True
                break
        if not match:
            normal_tw_words_normalized.append(word)
    
    normal_tw = " ".join(normal_tw_words_normalized)

    normal_tw_words = nltk.word_tokenize(normal_tw)
    normal_tw_words_normalized = [""]
    for word in normal_tw_words:
        match = False
        for abbr_f, abbr_t in abbr_repo:
            if word == abbr_f:
                normal_tw_words_normalized.append(abbr_t)
                match = True
                break
        if not match:
            normal_tw_words_normalized.append(word)
    normal_tw = " ".join(normal_tw_words_normalized)

#     normal_tw_words_normalized = []
#     for word in normal_tw_words:
#         match = False
#         for noise in noises_repo:
#             if word == noise:
#                 match = True
#                 break
#         if not match:
#             normal_tw_words_normalized.append(word)
#     normal_tw = " ".join(normal_tw_words_normalized)

    normal_tw = re.sub('(\.){1,}', ' ', normal_tw)
    normal_tw = re.sub('\s+', ' ', normal_tw) # remove extra space
    normal_tw = normal_tw.strip() #trim depan belakang
    normal_tw = re.sub(r'(wk){2,}|(wka){2,}|(ck){2,}|(ha){2,}|(he){2,}', ' emotxtawa ', normal_tw)
    normal_tw = re.sub(r'(hiks)|(kiw){2,}|(hu){2,}', ' emotxtangis ', normal_tw)
    normal_tw   =   re.sub(r'[^\w\s\.]',' ',normal_tw)   #buang punctuation
    normal_tw = re.sub(r'([A-Za-z])\1{1,}\s', r'\1', normal_tw)
    normal_tw = re.sub(r'([A-Za-z])\1{1,}$', r'\1', normal_tw)
    normal_tw = normal_tw.strip()
    return normal_tw

In [13]:
def extract_emoticon_2(tweet):
    score = 0
    words = nltk.word_tokenize(tweet)
    for word in words:
        if word == 'emotxtawa':
            score += 1
        if word == 'emotxtangis':
            score -= 1
    return score

In [14]:
train_data['tweet'] = train_data['tweet'].apply(normalisasi)
tester_data['tweet'] = tester_data['tweet'].apply(normalisasi)

tambahan_data['tweet'] = tambahan_data['tweet'].apply(normalisasi)

In [15]:
train_data['emoticon_score'] = train_data['emoticon_score'] + train_data['tweet'].apply(extract_emoticon_2)
tester_data['emoticon_score'] = tester_data['emoticon_score'] + tester_data['tweet'].apply(extract_emoticon_2)

tambahan_data['emoticon_score'] = tambahan_data['emoticon_score'] + tambahan_data['tweet'].apply(extract_emoticon_2)


In [16]:
train_data.to_csv('train_data_normalized.csv', index=False, header=False)
tester_data.to_csv('tester_data_normalized.csv', index=False, header=False)

tambahan_data.to_csv('tambahan_data_normalized.csv', index=False, header=False, columns=['id', 'sentimen', 'tweet','emoticon_score'])


In [17]:
train_data_formalized = pd.read_csv('dataset/train_data_formalized.csv', encoding='Latin')
tester_data_formalized = pd.read_csv('dataset/tester_data_formalized.csv', encoding='Latin')

# train_data_formalized = train_data
# tester_data_formalized = tester_data

tambahan_data_formalized = pd.read_csv('tambahan_data_formalized.csv', encoding='Latin')
tambahan_data_formalized['ortografi'] = tambahan_data['ortografi']
tambahan_data_formalized['exclamation'] = tambahan_data['exclamation']

train_data_formalized['ortografi'] = train_data['ortografi']
train_data_formalized['exclamation'] = train_data['exclamation']

tester_data_formalized['ortografi'] = tester_data['ortografi']
tester_data_formalized['exclamation'] = tester_data['exclamation']

In [18]:
stopwords = pd.read_csv('stopwords.txt', header=None)[0].values
def remove_stopwords(tweet, stopwords):
    special_list = ['username', 'url', 'sensitive-no']
    token = nltk.word_tokenize(tweet)
    token_afterremoval = []
    for k in token:
        if k not in stopwords and k not in special_list:
            token_afterremoval.append(k)
    str_clean = ' '.join(token_afterremoval)
    return str_clean

In [19]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory


def stemming(tweet):
    token = nltk.word_tokenize(tweet)
    stem_kalimat = []
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    for k in token:
        stem_kata = stemmer.stem(k)
        stem_kalimat.append(stem_kata)
    stem_kalimat_str = ' '.join(stem_kalimat)
    return stem_kalimat_str

In [20]:
def pre_processing(tweets):
    temp_tweets = tweets.copy()
    temp_tweets['tweet'] = temp_tweets['tweet'].apply(lambda tweet: remove_stopwords(tweet, stopwords))
#     temp_tweets['tweet'] = temp_tweets['tweet'].apply(stemming)
    return temp_tweets

In [21]:
train_data_preprocess = pre_processing(train_data_formalized)
tester_data_preprocess = pre_processing(tester_data_formalized)

tambahan_data_preprocess = pre_processing(tambahan_data_formalized)
# train_data_preprocess.to_csv('train_data_preprocessed.csv', index=False)
# tester_data_preprocess.to_csv('tester_data_preprocessed.csv', index=False)

# train_data_preprocess = pd.read_csv('train_data_preprocessed.csv', delimiter=',', encoding='Latin-1', names=['id', 'sentimen', 'tweet', 'capital_count', 'exclamation_count', 'word_count', 'char_count', 'word_capital_count'])

In [22]:
positives = set(pd.read_csv('positif_vania.txt', names=['word'])['word'].tolist())
negatives = set(pd.read_csv('negatif_vania.txt', names=['word'])['word'].tolist())

In [23]:
ct = CRFTagger()
ct.set_model_file("all_indo_man_tag_corpus_model.crf.tagger")

def extract_jj(tweet):
    words = nltk.word_tokenize(tweet)
    tag = ct.tag_sents([words])
    flat_tag = [item for sublist in tag for item in sublist]
    pos_count = Counter([j for i,j in flat_tag])
    return pos_count['JJ']

def extract_neg(tweet):
    words = nltk.word_tokenize(tweet)
    tag = ct.tag_sents([words])
    flat_tag = [item for sublist in tag for item in sublist]
    pos_count = Counter([j for i,j in flat_tag])
    return pos_count['NEG']

In [24]:
jj = set()

def extract_negative_lexicon(tweet):
    score = 0
    words = nltk.word_tokenize(tweet)
    tag = ct.tag_sents([words])
    flat_tag = [item for sublist in tag for item in sublist]
    words_tag = dict()
    for w, tg in flat_tag:
        if tg == 'JJ':
            jj.add(w)
            words_tag[w] = tg
    for i in range(len(words)):
        if i > 1 and words[i-2] == 'tidak' and words[i] in positives:
            score += 1
    for negative in negatives:
        for i in range(len(words)):
            if words[i] == negative:
                if (i > 0):
                    if (words[i-1] != 'tidak'):
                        score += 1
#                         if words[i] in words_tag.keys() and words_tag[words[i]] == 'JJ':
#                             score += 0.5
                        
                else:
                    score += 1
    for positive in positives:
        if ('tidak ' + positive) in tweet:
            score += 1
        if ('jangan ' + positive) in tweet:
            score += 1
    return score

In [25]:
def extract_positive_lexicon(tweet):
    score = 0
    words = nltk.word_tokenize(tweet)
    tag = ct.tag_sents([words])
    flat_tag = [item for sublist in tag for item in sublist]
    words_tag = dict()
    for w, tg in flat_tag:
        if tg == 'JJ':
            jj.add(w)
            words_tag[w] = tg
    for i in range(len(words)):
        if i > 1 and words[i-2] == 'tidak' and words[i] in negatives:
            score += 1
    for positive in positives:
        for i in range(len(words)):
            if words[i] == positive:
                if (i > 0):
                    if (words[i-1] != 'tidak'):
                        score += 1
#                         if words[i] in words_tag.keys() and words_tag[words[i]] == 'JJ':
#                             score += 0.5
                else:
                    score += 1
            
    for negative in negatives:
        if ('tidak ' + negative) in tweet:
            score += 1
        if ('jangan ' + negative) in tweet:
            score += 1
    return score

In [26]:
extract_positive_lexicon('tidak kreatif')

0

In [27]:
def extract_feature(data):
    temp_data = data.copy()
    temp_data['lexicon_pos_score'] = temp_data['tweet'].apply(extract_positive_lexicon)
    temp_data['lexicon_neg_score'] = temp_data['tweet'].apply(extract_negative_lexicon)
    temp_dat_2 = data.copy()
    temp_dat_2['lexicon_score'] = (temp_data['lexicon_pos_score'] - temp_data['lexicon_neg_score'])
    temp_dat_2['lexicon_pos_score'] = temp_data['lexicon_pos_score']
    temp_dat_2['lexicon_neg_score'] = temp_data['lexicon_neg_score']
    temp_dat_2['jj'] = temp_dat_2['tweet'].apply(extract_jj)
    temp_dat_2['neg'] = temp_dat_2['tweet'].apply(extract_neg)
    return temp_dat_2

In [28]:
train_data_extracted = extract_feature(train_data_preprocess)
tester_data_extracted = extract_feature(tester_data_preprocess)
tambahan_data_extracted = extract_feature(tambahan_data_preprocess)

In [29]:
# zero_cond = (train_data_extracted['sentimen'] == 0) & (train_data_extracted['lexicon_score'] == 0) & (train_data_extracted['emoticon_score'] == 0)
# train_data_extracted['lexicon_score'].iloc[zero_cond.values] = -2 

# zero_cond = (tester_data_extracted['lexicon_score'] == 0) & (tester_data_extracted['emoticon_score'] == 0)
# tester_data_extracted['lexicon_score'].iloc[zero_cond.values] = -2 

In [30]:
features = ['lexicon_score', 'emoticon_score']
target = 'sentimen'

X, y = train_data_extracted[features].values, train_data_extracted[target].values
Xx = tester_data_extracted[features].values

In [31]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

classifiers = [
    ('Decission Tree', DecisionTreeClassifier()),
    ('Logistic Regression', LogisticRegression()),
    ('SVM', LinearSVC()),
    ('Multinomial Naive Bayes', MultinomialNB()),
    ('KNN', KNeighborsClassifier()),
    ('Ensemble', GradientBoostingClassifier())
]

from sklearn.model_selection import KFold

train_scores = 0
test_scores = 0

kfold = KFold(n_splits=10, random_state=46)
dt = DecisionTreeClassifier()

for train_index, test_index in kfold.split(X):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    dt.fit(X_train, y_train)
    train_scores += accuracy_score(dt.predict(X_train), y_train)
    test_scores += accuracy_score(dt.predict(X_test), y_test)
    
print(train_scores / 10)
print(test_scores / 10)

0.8282622595128546
0.8258116639736135


In [32]:
predicted = dt.predict(X)
train_data_extracted['predicted'] = predicted

In [33]:
train_data_extracted

Unnamed: 0,id,sentimen,tweet,emoticon_score,ortografi,exclamation,lexicon_score,lexicon_pos_score,lexicon_neg_score,jj,neg,predicted
0,1,1,oks kak semangat iya,0,0.000000,0,2,2,0,0,0,1
1,2,0,kaya orang bodoh bodoh sangat,0,0.000000,0,-2,0,2,0,0,0
2,3,1,diumumkan lulus 100 sujud syukur langsung mengambil bunga menghampiri langsung memeluk menciumku air mata tidak kuasa kubendung mom this is my birthday present for,0,0.000000,0,1,2,1,1,1,1
3,4,0,reformasi demokrasi negeri kenyataannya tidak mengakui kelebihan pemimpin tidak diambil contoh nya generasi muda tolak,0,0.000000,0,2,4,2,1,2,1
4,5,0,macet macetan perut kosong mampir soto betawi gki pamulang pic Ã¢,0,0.000000,0,-2,0,2,0,0,0
5,6,0,pernyataan mengganggu telinga malam mulut juru bicara pemerintah ali mochtar ngabalin pemerintah representasi tuhan muka bumi dekaden nalar demokrasi pandangan dibiarkan menguasai istana tua,0,0.000000,0,-1,1,2,1,0,0
6,7,1,masi move on poto poto ceritanya nyobain face filter hengpong syanggih diajeng astri eka pertiwi gada orangnya gausah dicari,0,0.000000,0,0,0,0,0,0,0
7,8,1,dibalik kecemburuan terselip kasih sayang cemburu hubungan kadar wajar,1,0.000000,0,1,3,2,1,0,1
8,9,1,sayang beneran dihadapkan cowok kaya ganteng pinter serba tetep tidak berpindah,0,0.000000,0,3,3,0,1,1,1
9,10,1,pagi mas nya duhjadi gaenak ucapan selamat pagi netijen twitter tertawa bicara bicara kejauhan alias darimana kota tinggalmu tertawa,0,0.000000,0,3,3,0,1,0,1


In [34]:
train_data_extracted[train_data_extracted['predicted'] != train_data_extracted['sentimen']].head(40).tail(20)

Unnamed: 0,id,sentimen,tweet,emoticon_score,ortografi,exclamation,lexicon_score,lexicon_pos_score,lexicon_neg_score,jj,neg,predicted
138,139,0,belajar agama tuntas nga ngarang hidup isa disebut-disebut dialquran jelek-jelekin isa baca alquran tidak,0,0.0,0,1,2,1,1,1,1
142,143,0,sepenanggungan meninggalnya ramona sari sejawat aktivis salah pejuang tangguh hak kesehatan seksual reproduksi perempuan indonesia semoga maha berkenan memeluk pulang tuhan tuhan mengambil,0,0.0,0,3,4,1,3,0,1
151,152,0,semoga roboh menimpa himbauan keras tidak sembarangan situs sejarah sadis terpaksa pekok berjamaah garansi seumur hidup,0,0.0,0,1,4,3,2,1,1
158,159,1,percaya tuhan rencana orang berbuat kesalahan tidak mengulang,0,0.0,0,0,1,1,0,1,0
171,172,1,buset indosat sangat deh nge berentiin pakai bukanan id indosat super wifi,0,0.0,0,0,0,0,0,0,0
175,176,0,setuju bang bikin hidupnya tidak tenang iya tidak mikirin mencari orderan enak gedeg bikin opik begituan,0,0.0,0,1,2,1,1,2,1
184,185,1,cinta si hitam idung lebar Ã¢,0,0.0,0,0,1,1,1,0,0
192,193,0,sehat hidupin data seluler muncul notif kanan data seluler otomatis mati berulang padal kuotanya msih 1 gb tidak dipakai respon respon iya,0,0.0,0,1,2,1,3,1,1
217,218,1,hiburan hape tidak lihat recehkantwitter iya google assistant ngirimin joke hidupku uang kembalian,0,0.0,0,0,0,0,0,1,0
228,229,0,sumpah iya komplen uang kedebet bca cepat dikembaliin b pn bca sehari b pn berminggu minggu dibalikin nominal kedebet gedean bca kemana that s why i cinta bca your service satisfies me always,0,0.1,0,2,2,0,2,0,1


In [35]:
tester_predicted = dt.predict(Xx)

In [36]:
tester_predicted

array([0, 0, 0, ..., 0, 0, 0])

In [37]:
tester_data_extracted['predicted'] = tester_predicted

In [38]:
tester_data_extracted

Unnamed: 0,tweetID,tweet,emoticon_score,ortografi,exclamation,lexicon_score,lexicon_pos_score,lexicon_neg_score,jj,neg,predicted
0,0,wanita jangan suka menghancurkan hubungan orang jangan bangga berhasil merusak kebahagian orang silahkan tidak berkah bahagianya he,0,0.000000,0,-1,4,5,0,1,0
1,1,sombong apanya sms dibls,0,0.000000,0,-1,0,1,0,0,0
2,2,apadah p cie cie cie bebe cie kiwkiw,0,0.000000,0,0,0,0,1,0,0
3,3,tdrlah besok medical check up semoga lancar Ã¢ Âº wml,0,0.000000,0,2,2,0,1,0,1
4,4,crew serbu bsm seru bang syariah mandiri bekasi pic Ã¢,0,0.000000,0,1,2,1,0,0,1
5,5,sian ditelantarin indah nge lho,1,0.000000,0,1,1,0,1,0,1
6,6,dirikanlah sembahyang tunaikanlah zakat tatlah rasul rahmat 24 56,0,0.000000,0,1,1,0,0,0,1
7,7,pikir bandar bayarin makan evil dead pokoknya star trek keren,0,0.000000,1,2,2,0,0,0,1
8,8,tidak kreatif ambil kutipan orang tertawa suka iya ungkapin ditikung,0,0.000000,0,1,2,1,1,1,1
9,9,iya jangan dibahas twiter kali ven teman tertawa iya buktinya sayang mantanya han,0,0.000000,0,2,2,0,0,0,1


In [39]:
tester_data_extracted.to_csv('results9.csv', header=False, index=False, columns=['test_ID', 'predicted'])

In [40]:
extract_emoticon("cie andien yang lagi bep marah marah mulu:p :) :) :) :)")

4

In [41]:
extract_positive_lexicon('tidak kreatif ambil kutipan orang tertawa suka iya ungkapin ditikung')

2

In [42]:
extract_positive_lexicon('tidak kreatif')

0

In [43]:
train_data

Unnamed: 0,id,sentimen,tweet,ortografi,exclamation,emoticon_score
0,1,1,oks kak semangat iya kalian kalian,0.000000,0,0
1,2,0,sekarang harus kaya orang bodoh lagi bodoh sangat,0.000000,0,0
2,3,1,begitu diumumkan lulus 100 mereka semua sujud syukur dan langsung mengambil bunga saat dia menghampiri langsung memeluk menciumku air mata tidak kuasa kubendung mom this is my birthday present for u url,0.000000,0,0
3,4,0,username username katanya bapak reformasi dan demokrasi di negeri kita ini tapi kenyataannya sebaliknya tidak mau mengakui kelebihan seseorang pemimpin macam apa itu tidak ada yang bisa diambil contoh dari nya kami sebagai generasi muda tolak,0.000000,0,0
4,5,0,macet macetan perut kosong akhirnya mampir dahulu soto betawi di gki pamulang pic â,0.000000,0,0
5,6,0,pernyataan paling mengganggu telinga malam ini keluar dari mulut juru bicara pemerintah ali mochtar ngabalin pemerintah adalah representasi tuhan di muka bumi makin dekaden nalar demokrasi kita kalau pandangan begitu dibiarkan menguasai istana ternyata sama saja dengan pak tua itu itu,0.000000,0,0
6,7,1,masi belum move on dari poto poto ini ceritanya lagi nyobain face filter hengpong syanggih punya diajeng astri eka pertiwi gada orangnya gausah dicari,0.000000,0,0
7,8,1,dibalik kecemburuan terselip rasa kasih sayang yang dalam maka dari itu cemburu diperlukan dalam setiap hubungan tetapi tetap dalam kadar yang wajar tentunya,0.000000,0,1
8,9,1,kalau sudah sayang beneran itu mau dihadapkan sama cowok yang lebih kaya ganteng pinter serba dan lain lain itu pasti akan tetep tidak mau berpindah,0.000000,0,0
9,10,1,username pagi juga mas nya duhjadi gaenak dapat ucapan selamat pagi dari netijen twitter tertawa ngomong ngomong dari kejauhan berapa kamu ini alias darimana kota tempat tinggalmu tertawa,0.000000,0,0


In [44]:
train_data_extracted

Unnamed: 0,id,sentimen,tweet,emoticon_score,ortografi,exclamation,lexicon_score,lexicon_pos_score,lexicon_neg_score,jj,neg,predicted
0,1,1,oks kak semangat iya,0,0.000000,0,2,2,0,0,0,1
1,2,0,kaya orang bodoh bodoh sangat,0,0.000000,0,-2,0,2,0,0,0
2,3,1,diumumkan lulus 100 sujud syukur langsung mengambil bunga menghampiri langsung memeluk menciumku air mata tidak kuasa kubendung mom this is my birthday present for,0,0.000000,0,1,2,1,1,1,1
3,4,0,reformasi demokrasi negeri kenyataannya tidak mengakui kelebihan pemimpin tidak diambil contoh nya generasi muda tolak,0,0.000000,0,2,4,2,1,2,1
4,5,0,macet macetan perut kosong mampir soto betawi gki pamulang pic Ã¢,0,0.000000,0,-2,0,2,0,0,0
5,6,0,pernyataan mengganggu telinga malam mulut juru bicara pemerintah ali mochtar ngabalin pemerintah representasi tuhan muka bumi dekaden nalar demokrasi pandangan dibiarkan menguasai istana tua,0,0.000000,0,-1,1,2,1,0,0
6,7,1,masi move on poto poto ceritanya nyobain face filter hengpong syanggih diajeng astri eka pertiwi gada orangnya gausah dicari,0,0.000000,0,0,0,0,0,0,0
7,8,1,dibalik kecemburuan terselip kasih sayang cemburu hubungan kadar wajar,1,0.000000,0,1,3,2,1,0,1
8,9,1,sayang beneran dihadapkan cowok kaya ganteng pinter serba tetep tidak berpindah,0,0.000000,0,3,3,0,1,1,1
9,10,1,pagi mas nya duhjadi gaenak ucapan selamat pagi netijen twitter tertawa bicara bicara kejauhan alias darimana kota tinggalmu tertawa,0,0.000000,0,3,3,0,1,0,1


In [45]:
train_data_preprocessed = pd.read_csv('train_data_preprocessed.csv', encoding='Latin')
tester_data_preprocessed = pd.read_csv('tester_data_preprocessed.csv', encoding='Latin')

In [46]:
train_data_preprocessed['ortografi'] = train_data_extracted['ortografi']
train_data_preprocessed['exclamation'] = train_data_extracted['exclamation']
train_data_preprocessed['emoticon_score'] = train_data_extracted['emoticon_score']
train_data_preprocessed['lexicon_score'] = train_data_extracted['lexicon_score']
train_data_preprocessed['jj'] = train_data_extracted['jj']
train_data_preprocessed['neg'] = train_data_extracted['neg']
train_data_preprocessed['lexicon_pos_score'] = train_data_extracted['lexicon_pos_score']
train_data_preprocessed['lexicon_neg_score'] = train_data_extracted['lexicon_neg_score']
train_data_preprocessed['lexicon_neg_score'] = train_data_extracted['lexicon_neg_score']

train_data_preprocessed.to_csv('tdp.csv', encoding='Latin', index=False)

In [47]:
tester_data_preprocessed['ortografi'] = tester_data_extracted['ortografi']
tester_data_preprocessed['exclamation'] = tester_data_extracted['exclamation']
tester_data_preprocessed['emoticon_score'] = tester_data_extracted['emoticon_score']
tester_data_preprocessed['lexicon_score'] = tester_data_extracted['lexicon_score']
tester_data_preprocessed['jj'] = tester_data_extracted['jj']
tester_data_preprocessed['neg'] = tester_data_extracted['neg']
tester_data_preprocessed['lexicon_pos_score'] = tester_data_extracted['lexicon_pos_score']
tester_data_preprocessed['lexicon_neg_score'] = tester_data_extracted['lexicon_neg_score']
tester_data_preprocessed.to_csv('tdp_test.csv', encoding='Latin', index=False)

In [48]:
tambahan_data_extracted.to_csv('tambahan_preprocess.csv', encoding='Latin', index=False)