# Preprocessing


## Import Packages and Data

In [1]:
import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import googletrans

import time

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Training split: semicolon-separated CSV; preview the first rows.
data_train = pd.read_csv(filepath_or_buffer="../Data/data_train.csv", sep=";")
data_train.head(5)

Unnamed: 0,Tweet,HS,Abusive,HS_Individual,HS_Group,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Weak,HS_Moderate,HS_Strong
0,- kdg will be kind enough to show you how to l...,0,1,0,0,0,0,0,0,0,0,0,0
1,Ternyata komunis juga bisa nangis',0,0,0,0,0,0,0,0,0,0,0,0
2,USER USER Kenapa harus bom ya? Kek yg benar nu...,0,0,0,0,0,0,0,0,0,0,0,0
3,USER Sumpaaah kaya kalo abis IAA dilanjut olim...,0,0,0,0,0,0,0,0,0,0,0,0
4,RT USER USER USER USER USER fungsi media sosia...,1,0,1,0,0,0,0,0,1,1,0,0


In [3]:
# Test split: semicolon-separated CSV; preview the first rows.
data_test = pd.read_csv(filepath_or_buffer="../Data/data_test.csv", sep=";")
data_test.head(5)

Unnamed: 0,Tweet,HS,Abusive,HS_Individual,HS_Group,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Weak,HS_Moderate,HS_Strong
0,GANTENG TAPI BERENGSEK!!! BUAT APA?!!\nWHO DO ...,1,1,1,0,0,0,0,0,1,1,0,0
1,Sakit jiwa kali yah..,0,0,0,0,0,0,0,0,0,0,0,0
2,"Does insulting one for being ""Cina"", ""Kafir"", ...",0,1,0,0,0,0,0,0,0,0,0,0
3,USER USER Karena partaimu pdip itu bajingan',1,1,0,1,0,0,0,0,1,0,1,0
4,USER USER Jgn salah itu janji jokowi yg di tun...,1,1,1,0,0,0,0,0,1,1,0,0


In [4]:
# Validation split: semicolon-separated CSV; preview the first rows.
data_val = pd.read_csv(filepath_or_buffer="../Data/data_val.csv", sep=";")
data_val.head(5)

Unnamed: 0,Tweet,HS,Abusive,HS_Individual,HS_Group,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Weak,HS_Moderate,HS_Strong
0,So much berengsek people in the world,1,1,0,1,0,0,0,0,1,0,1,0
1,"So, how can I get my REKBER?? rekening berengsek.",0,1,0,0,0,0,0,0,0,0,0,0
2,RT USER: dasar bajingan!\xf0\x9f\x98\xa0 URL,0,1,0,0,0,0,0,0,0,0,0,0
3,USER Krn cebong tdk akn mengakui pria lajang y...,1,1,0,1,0,0,0,0,1,0,1,0
4,"RT USER: ga ada urusannya, monyet. gua broken ...",1,1,1,0,0,0,0,0,1,1,0,0


## Preprocessing

### Text Cleaning and Preparation

1. Menghilangkan simbol khusus seperti `\r`, `\n`, tanda kutip ganda (`"`), URL, dan byte code (misalnya `\xf0`)

In [5]:
def cleaning(data):
    """Strip escape sequences, HTML entities, URLs, quotes and byte codes from tweets.

    Reads ``data['Tweet']`` and stores the cleaned text in a new
    ``data['Tweet_Parsed_1']`` column. The frame is modified in place and is
    also returned.

    Every ``str.replace`` call states ``regex=`` explicitly. pandas 2.0 changed
    the default from ``True`` to ``False``; relying on the default would turn
    the URL and byte-code patterns into literal strings that never match.
    """
    # Tokens replaced by one space each, matched literally and in this order.
    literal_tokens = [
        "\\r",   # carriage return spelled out as backslash + "r"
        "\\n",   # newline spelled out as backslash + "n"
        "\n",    # real newline
        "\r",    # real carriage return
        "&amp",
        "&gt",
        "&lt",
        "    ",  # run of four spaces
    ]
    url_pattern = r'''(?i)\b((?:https|http?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''
    # Byte codes spelled out as text, e.g. "\xf0". The original character
    # class was "a-zA-z", which also matched "[", "\", "]", "^", "_" and "`".
    byte_code_pattern = r"\\x[a-zA-Z0-9]{2}"

    parsed = data['Tweet']
    for token in literal_tokens:
        parsed = parsed.str.replace(token, " ", regex=False)
    parsed = parsed.str.replace(url_pattern, " ", regex=True)
    parsed = parsed.str.replace('"', '', regex=False)
    parsed = parsed.str.replace(byte_code_pattern, "", regex=True)

    data['Tweet_Parsed_1'] = parsed
    return data

In [6]:
# Step 1 on every split: adds the Tweet_Parsed_1 column.
data_train, data_test, data_val = [
    cleaning(split) for split in (data_train, data_test, data_val)
]

2. Mengubah teks menjadi huruf kecil (lower case)

In [7]:
def lowerCase(data):
    """Add ``Tweet_Parsed_2``: the lower-cased version of ``Tweet_Parsed_1``.

    The frame is updated in place and returned.
    """
    lowered = data['Tweet_Parsed_1'].str.lower()
    data['Tweet_Parsed_2'] = lowered
    return data

In [8]:
# Step 2 on every split: adds the Tweet_Parsed_2 column.
data_train, data_test, data_val = [
    lowerCase(split) for split in (data_train, data_test, data_val)
]

3. Hapus simbol dan tanda baca

In [9]:
def removePunct(data):
    """Expand common English contractions, then strip symbols and punctuation.

    Reads ``Tweet_Parsed_2`` and writes ``Tweet_Parsed_3``. The frame is
    modified in place and returned.

    ``regex=`` is passed explicitly on every replacement. Since pandas 2.0 the
    default is ``regex=False``, which would make the final character-class
    pattern a literal string and leave all punctuation in place.
    """
    # Applied literally and in order: "can't" and "ain't" must run before the
    # generic "n't" rule.
    contractions = [
        ("'s", ""),
        ("can't", "can not"),
        ("ain't", "am not"),
        ("n't", " not"),
        ("'re", " are"),
    ]

    parsed = data['Tweet_Parsed_2']
    for short_form, expanded in contractions:
        parsed = parsed.str.replace(short_form, expanded, regex=False)
    # Collapse every run of non-alphanumeric characters into one space.
    parsed = parsed.str.replace('[^a-zA-Z0-9]+', ' ', regex=True)

    data['Tweet_Parsed_3'] = parsed
    return data

In [10]:
# Step 3 on every split: adds the Tweet_Parsed_3 column.
data_train, data_test, data_val = [
    removePunct(split) for split in (data_train, data_test, data_val)
]

4.1. Normalisasi kata

In [11]:
# Slang-to-standard dictionary. The file is read without a header row, so the
# two column names are supplied here.
# NOTE(review): the preview shows mojibake ('Â½ramayana'), which suggests the
# file may not really be latin-1 encoded — confirm the source encoding.
kamus_normal = pd.read_csv(
    "../Data/kamusnormalisasi.csv",
    names=["non-standard word", "standard word"],
    header=None,
    encoding='latin-1',
)
kamus_normal.head(5)

Unnamed: 0,non-standard word,standard word
0,27-Jun,dua puluh tujuh juni
1,01curang,01 curang
2,01thechampion,01 the champion
3,02wintheelection,02 win the election
4,Â½ramayana,ramayana


In [12]:
# Two parallel lists: entry i of `nonstdword` is replaced by entry i of `stdword`.
nonstdword, stdword = [
    kamus_normal[column].to_numpy().tolist()
    for column in ('non-standard word', 'standard word')
]

In [13]:
# Display the non-standard side of the dictionary (this echoes the whole list).
nonstdword

['27-Jun',
 '01curang',
 '01thechampion',
 '02wintheelection',
 'Â½ramayana',
 '10000hours',
 '10harilagigantipresiden',
 '10harilagilebaranakalsehat',
 '10harimenujumenang',
 '10jt',
 '10k',
 '10k',
 '10kk',
 '10rb',
 '10th',
 '120k',
 '130thipuanniversary',
 '13rb',
 '145k',
 '149rb',
 '14rb',
 '15jt',
 '16daysagainstviolenceagainstwomen',
 '16gb',
 '16jam',
 '16th',
 '17an',
 '17aprilprabowopresiden',
 '1jt',
 '1kg',
 '1periode',
 '1st',
 '1t',
 '2009vs2019',
 '200an',
 '200rbu',
 '2018gantigubernur',
 '2019asalbukanjokowi',
 '2019changepresident',
 '2019changepresidents',
 '2019forjkw',
 '2019g',
 '2019ga',
 '2019gaadaun',
 '2019ganti',
 '2019gantikacung',
 '2019gantip',
 '2019gantipesinden',
 '2019gantipre',
 '2019gantipres',
 '2019gantipresid',
 '2019gantipreside',
 '2019gantipresiden',
 '2019gantipresidenbaru',
 '2019gantipresidenkacung',
 '2019gantirezim',
 '2019gantisistem',
 '2019gantitelor',
 '2019jokowiend',
 '2019jokowilagi',
 '2019jokowilanjutkan',
 '2019jokowilengser',
 

In [14]:
# Display the standard-form side of the dictionary (this echoes the whole list).
stdword

['dua puluh tujuh juni',
 '01 curang',
 '01 the champion',
 '02 win the election',
 'ramayana',
 '10000 hours',
 '10 hari lagi ganti presiden',
 '10 hari lagi lebaran akal sehat',
 '10 hari menuju menang',
 'sepuluh juta',
 '10000',
 'sepuluh ribu',
 'sepuluh juta',
 'sepuluh ribu',
 'sepuluh tahun',
 'seratus dua puluh ribu',
 '130th ipu anniversary',
 'tiga belas ribu',
 'seratus empat puluh lima ribu',
 'seratus empat puluh sembilan ribu',
 'empat belas ribu',
 'lima belas juta',
 '16 days againts violence againts women',
 'enam belas giga byte',
 'enam belas jam',
 'enam belas tahun',
 'tujuh belasan',
 '17 april prabowo presiden',
 'satu juta',
 'satu kilogram',
 'satu periode',
 'pertama',
 'satu triliun',
 '2009 versus 2019',
 'dua ratusan',
 'dua ratus ribu',
 '2018 ganti gubernur',
 '2019 asal bukan jokowi',
 '2019 change president',
 '2020 change president',
 '2019 for jokowi',
 '2019 ganti presiden',
 '2019 ganti presiden',
 '2019 tidak ada ujian nasional',
 '2019 ganti pres

In [15]:
def _build_normalization_lookup(stdword_, nonstdword_):
    """Map each non-standard word to its standard form; the first occurrence wins."""
    lookup = {}
    for nonstd, std in zip(nonstdword_, stdword_):
        # setdefault keeps the earliest pairing, matching list.index() semantics
        # when the dictionary lists the same non-standard word more than once.
        lookup.setdefault(nonstd, std)
    return lookup


def _normalize_tokens(text, lookup):
    """Replace every space-separated token of ``text`` that has an entry in ``lookup``."""
    return ' '.join(str(lookup.get(token, token)) for token in text.split(" "))


def normalize_text(text, stdword_, nonstdword_):
    """Replace non-standard words in ``text`` with their standard form.

    Parameters
    ----------
    text : str
        Text whose tokens are separated by single spaces.
    stdword_ : list
        Standard forms, parallel to ``nonstdword_``.
    nonstdword_ : list
        Non-standard words to look for. When a word is listed more than once,
        the first entry is used.

    Returns
    -------
    str
        ``text`` with every matching token replaced; other tokens are unchanged.
    """
    return _normalize_tokens(text, _build_normalization_lookup(stdword_, nonstdword_))


def normalizeData(data, stdword_, nonstdword_):
    """Add ``Tweet_Parsed_4``: ``Tweet_Parsed_3`` with non-standard words normalised.

    The lookup table is built once for the whole frame, not once per token, so
    each token costs a dictionary lookup rather than a scan of the word list.
    The frame is modified in place and returned.
    """
    lookup = _build_normalization_lookup(stdword_, nonstdword_)
    data['Tweet_Parsed_4'] = data['Tweet_Parsed_3'].map(
        lambda com: _normalize_tokens(com, lookup)
    )
    return data

In [16]:
# Step 4.1 on every split: adds the Tweet_Parsed_4 column.
data_train, data_test, data_val = [
    normalizeData(split, stdword, nonstdword)
    for split in (data_train, data_test, data_val)
]

In [19]:
# Checkpoint: write the splits as they stand after normalisation.
data_train.to_csv(path_or_buf="../Data/data_train_normalizazionprocess_checkpoint.csv", sep=";", index=False)
data_test.to_csv(path_or_buf="../Data/data_test_normalizazionprocess_checkpoint.csv", sep=";", index=False)
data_val.to_csv(path_or_buf="../Data/data_val_normalizazionprocess_checkpoint.csv", sep=";", index=False)

4.2. Menerjemahkan bahasa Inggris ke bahasa Indonesia (opsional)

In [20]:
# Independent copies for the translated variant of the pipeline, so the
# untranslated frames are left untouched.
data_train_trans, data_test_trans, data_val_trans = [
    split.copy() for split in (data_train, data_test, data_val)
]

In [26]:
# Translated tweets, one Excel workbook per split.
train_trans, test_trans, val_trans = [
    pd.read_excel(workbook)
    for workbook in (
        "../Data/train_translated.xlsx",
        "../Data/test_translated.xlsx",
        "../Data/val_translated.xlsx",
    )
]

def translateData(data, trans_data):
    """Overwrite ``Tweet_Parsed_4`` with the lower-cased ``Tweet_trans`` column.

    The values come from ``trans_data['Tweet_trans']``. Because a Series is
    assigned, pandas pairs rows by index label. ``data`` is modified in place
    and returned.
    """
    translated_lower = trans_data['Tweet_trans'].str.lower()
    data['Tweet_Parsed_4'] = translated_lower
    return data

In [28]:
# Replace the normalised text with the translated text in the *_trans frames.
data_train_trans, data_test_trans, data_val_trans = [
    translateData(split, translated)
    for split, translated in (
        (data_train_trans, train_trans),
        (data_test_trans, test_trans),
        (data_val_trans, val_trans),
    )
]

In [None]:
# data_train_trans['Tweet_Parsed_4'] = data_train_trans['Tweet_Parsed_4'].str.lower()
# data_test_trans['Tweet_Parsed_4'] = data_test_trans['Tweet_Parsed_4'].str.lower()

4.3. Menghilangkan angka

In [29]:
def removeNumb(data):
    """Replace every run of non-letter characters in ``Tweet_Parsed_4`` with one space.

    Digits are removed along with any other character outside ``a-z``/``A-Z``.
    The column is overwritten in place and the frame is returned.

    ``regex=True`` is explicit because pandas 2.0 made ``regex=False`` the
    default, under which the pattern would be searched for as a literal string
    and nothing would be removed.
    """
    data['Tweet_Parsed_4'] = data['Tweet_Parsed_4'].str.replace('[^a-zA-Z]+', ' ', regex=True)
    return data

In [19]:
# Step 4.3 on the untranslated splits (rewrites Tweet_Parsed_4).
data_train, data_test, data_val = [
    removeNumb(split) for split in (data_train, data_test, data_val)
]

In [30]:
# Step 4.3 on the translated splits (rewrites Tweet_Parsed_4).
data_train_trans, data_test_trans, data_val_trans = [
    removeNumb(split)
    for split in (data_train_trans, data_test_trans, data_val_trans)
]

4.4. Menghilangkan kata tidak bermakna

In [31]:
# Filler / interjection words to strip; the file has one word per row and no header.
meaningless = pd.read_csv(
    "../Data/new_stopword.csv", header=None, names=['stopword']
)['stopword'].tolist()

In [32]:
# Display the filler-word list (this echoes the whole list).
meaningless

['aduh',
 'ah',
 'alah',
 'alamak',
 'aw',
 'bbbbbbitcdjaj',
 'beh',
 'beuh',
 'bih',
 'bla',
 'blukkk',
 'brak',
 'cih',
 'ckck',
 'coy',
 'cyuuqqqq',
 'dah',
 'deh',
 'dei',
 'deng',
 'dih',
 'doang',
 'dong',
 'duh',
 'eak',
 'eh',
 'eits',
 'euy',
 'f1',
 'f8affdf64544',
 'gasnhsjsjsajja',
 'geez',
 'ggg',
 'ggggg',
 'gih',
 'hah',
 'haha',
 'hai',
 'halo',
 'hehe',
 'hem',
 'hih',
 'hihi',
 'hiks',
 'hu',
 'hue',
 'huft',
 'huh',
 'huhu',
 'ih',
 'isbskakdb',
 'jeng',
 'jiah',
 'jos',
 'kah',
 'kan',
 'ko',
 'kok',
 'kshssksj',
 'kxkskdck',
 'la',
 'lah',
 'lalalala',
 'loh',
 'mah',
 'mengg',
 'meow',
 'msnakdnskdnskdndjfn',
 'muah',
 'nah',
 'ngahajhaab',
 'nge',
 'nxnsjdjjsz',
 'nya',
 'oh',
 'oi',
 'oops',
 'pft',
 'pun',
 'reeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee',
 'se',
 'shhshddhdhdjd',
 'shshshh',
 'sih',
 'sip',
 'sjsijdksk',
 'sjsksksk',
 'tenetnetnettenet',
 'toh',
 'tuh',
 'uh',
 'uhuk',
 'uhuy',
 'ups',
 'utututut',
 'waduh',
 'wah',
 'weh',
 'wey',
 'wkwk',

In [33]:
def removeMeaningless(data,meaningless_):
    """Delete every whole-word occurrence of the given filler words from ``Tweet_Parsed_4``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Tweet_Parsed_4`` text column; the column is overwritten
        in place.
    meaningless_ : iterable
        Words to remove. Entries that are not non-empty strings (for example
        NaN from a blank CSV row) are skipped.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.

    Notes
    -----
    * All words are removed in one pass using a single alternation, not one
      full column pass per word.
    * Words are passed through ``re.escape`` so they are matched as literal
      text, never as regex syntax.
    * ``regex=True`` is explicit because pandas 2.0 made ``regex=False`` the
      default, under which ``\\bword\\b`` would be searched for literally.
    """
    # dict.fromkeys drops duplicates while keeping the caller's order.
    escaped_words = [
        re.escape(word)
        for word in dict.fromkeys(meaningless_)
        if isinstance(word, str) and word
    ]
    if not escaped_words:
        # Nothing to remove; an empty alternation would match at every word boundary.
        return data

    pattern = r"\b(?:" + "|".join(escaped_words) + r")\b"
    data['Tweet_Parsed_4'] = data['Tweet_Parsed_4'].str.replace(pattern, '', regex=True)
    return data

In [23]:
# Step 4.4 on the untranslated splits (rewrites Tweet_Parsed_4).
data_train, data_test, data_val = [
    removeMeaningless(split, meaningless)
    for split in (data_train, data_test, data_val)
]

In [34]:
# Step 4.4 on the translated splits (rewrites Tweet_Parsed_4).
data_train_trans, data_test_trans, data_val_trans = [
    removeMeaningless(split, meaningless)
    for split in (data_train_trans, data_test_trans, data_val_trans)
]

5. Menghilangkan imbuhan (*stemming*)

In [35]:
# Stemmer built through Sastrawi's factory (used as the Indonesian stemmer below).
factory = StemmerFactory()
stemmerID = factory.create_stemmer()
# WordNet lemmatizer from NLTK (used as the English lemmatizer below).
wordnet_lemmatizer = WordNetLemmatizer()

In [36]:
def stemming(text, stemmer_id, stemmer_en):
    """Stem and lemmatize every space-separated token of ``text``.

    Each token first goes through ``stemmer_id.stem``. The result is then
    passed through ``stemmer_en.lemmatize`` three times, as a verb, a noun and
    an adjective, in that order. The processed tokens are joined back with
    single spaces.
    """
    def _reduce(token):
        reduced = stemmer_id.stem(token)
        for part_of_speech in ("v", "n", "a"):
            reduced = stemmer_en.lemmatize(reduced, pos=part_of_speech)
        return reduced

    return ' '.join(str(_reduce(token)) for token in text.split(" "))

def stemData(data, stemmer_id, stemmer_en):
    """Add ``Tweet_Parsed_5``: ``Tweet_Parsed_4`` with every row passed through ``stemming``.

    The frame is modified in place and returned.
    """
    def _stem_row(com):
        return stemming(com, stemmer_id, stemmer_en)

    data['Tweet_Parsed_5'] = data['Tweet_Parsed_4'].map(_stem_row)
    return data

In [26]:
# Step 5 on the untranslated splits: adds the Tweet_Parsed_5 column.
data_train, data_test, data_val = [
    stemData(split, stemmerID, wordnet_lemmatizer)
    for split in (data_train, data_test, data_val)
]

In [37]:
# Step 5 on the translated splits: adds the Tweet_Parsed_5 column.
data_train_trans, data_test_trans, data_val_trans = [
    stemData(split, stemmerID, wordnet_lemmatizer)
    for split in (data_train_trans, data_test_trans, data_val_trans)
]

6. Remove Stopword

In [38]:
# Make sure the NLTK "stopwords" corpus used in the next cell is available locally.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASLAB\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [39]:
# Indonesian + English NLTK stop words, plus three Twitter-specific tokens.
stop_words = [
    *stopwords.words('indonesian'),
    *stopwords.words('english'),
    "rt",
    "retweet",
    "url",
]

In [40]:
# Peek at nine entries; the slice starts at index 1, so the first stop word is not shown.
stop_words[1:10]

['adalah',
 'adanya',
 'adapun',
 'agak',
 'agaknya',
 'agar',
 'akan',
 'akankah',
 'akhir']

In [41]:
# Size of the combined list (Indonesian + English + the three custom tokens).
len(stop_words)

940

In [42]:
def removeStopword(data,stop_words_):
    """Add ``Tweet_Parsed_6``: ``Tweet_Parsed_5`` with every whole-word stop word deleted.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Tweet_Parsed_5`` text column. ``Tweet_Parsed_6`` is
        added or overwritten in place.
    stop_words_ : iterable
        Stop words to remove. Entries that are not non-empty strings are
        skipped.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.

    Notes
    -----
    * All stop words are removed in one pass using a single alternation, not
      one full column pass per word (the list built in this notebook has
      several hundred entries).
    * Words are passed through ``re.escape`` so they are matched as literal
      text, never as regex syntax.
    * ``regex=True`` is explicit because pandas 2.0 made ``regex=False`` the
      default, under which ``\\bword\\b`` would be searched for literally.
    * Removed words leave their surrounding spaces behind; collapsing them is
      left to a later step.
    """
    without_stopwords = data['Tweet_Parsed_5']

    # dict.fromkeys drops duplicates while keeping the caller's order.
    escaped_words = [
        re.escape(word)
        for word in dict.fromkeys(stop_words_)
        if isinstance(word, str) and word
    ]
    if escaped_words:
        pattern = r"\b(?:" + "|".join(escaped_words) + r")\b"
        without_stopwords = without_stopwords.str.replace(pattern, '', regex=True)

    data['Tweet_Parsed_6'] = without_stopwords
    return data

In [32]:
# Step 6 on the untranslated splits: adds the Tweet_Parsed_6 column.
data_train, data_test, data_val = [
    removeStopword(split, stop_words)
    for split in (data_train, data_test, data_val)
]

In [43]:
# Step 6 on the translated splits: adds the Tweet_Parsed_6 column.
data_train_trans, data_test_trans, data_val_trans = [
    removeStopword(split, stop_words)
    for split in (data_train_trans, data_test_trans, data_val_trans)
]

7. Membersihkan spasi berlebih (clean space)

In [44]:
def clean_space(text):
    """Collapse runs of spaces in ``text`` and drop leading/trailing spaces.

    Only the space character is treated as a separator.
    """
    return ' '.join(str(piece) for piece in text.split(' ') if piece != '')

def clean_space_data(data):
    """Add ``Tweet_Parsed_7``: ``Tweet_Parsed_6`` with ``clean_space`` applied to every row.

    The frame is modified in place and returned.
    """
    data['Tweet_Parsed_7'] = data['Tweet_Parsed_6'].map(clean_space)
    return data

In [34]:
# Step 7 on the untranslated splits: adds the Tweet_Parsed_7 column.
data_train, data_test, data_val = [
    clean_space_data(split) for split in (data_train, data_test, data_val)
]

In [45]:
# Step 7 on the translated splits: adds the Tweet_Parsed_7 column.
data_train_trans, data_test_trans, data_val_trans = [
    clean_space_data(split)
    for split in (data_train_trans, data_test_trans, data_val_trans)
]

## Preprocessing Result

### Data Train Without Translation

In [41]:
# Training row 6: raw tweet.
data_train.loc[6, 'Tweet']

"ISRAEL UNFORGIVEN !!! YOU'RE THE REAL TERRORIST !! LAKNAT ALLAH MENANTIMU ZIONIS !! #PrayForGaza #PrayForPalestine"

In [42]:
# Training row 6 after step 1 (cleaning).
data_train.loc[6, 'Tweet_Parsed_1']

"ISRAEL UNFORGIVEN !!! YOU'RE THE REAL TERRORIST !! LAKNAT ALLAH MENANTIMU ZIONIS !! #PrayForGaza #PrayForPalestine"

In [43]:
# Training row 6 after step 2 (lower-casing).
data_train.loc[6, 'Tweet_Parsed_2']

"israel unforgiven !!! you're the real terrorist !! laknat allah menantimu zionis !! #prayforgaza #prayforpalestine"

In [44]:
# Training row 6 after step 3 (contractions expanded, punctuation removed).
data_train.loc[6, 'Tweet_Parsed_3']

'israel unforgiven you are the real terrorist laknat allah menantimu zionis prayforgaza prayforpalestine'

In [45]:
# Training row 6 after step 4 (normalisation, digit and filler-word removal).
data_train.loc[6, 'Tweet_Parsed_4']

'israel unforgiven you are the real terrorist laknat allah menantimu zionis pray for gaza pray for palestine'

In [46]:
# Training row 6 after step 5 (stemming / lemmatisation).
data_train.loc[6, 'Tweet_Parsed_5']

'israel unforgiven you be the real terrorist laknat allah nanti zionis pray for gaza pray for palestine'

In [47]:
# Training row 6 after step 6 (stop words removed; gaps still present).
data_train.loc[6, 'Tweet_Parsed_6']

'israel unforgiven    real terrorist laknat allah  zionis pray  gaza pray  palestine'

In [48]:
# Training row 6 after step 7 (extra spaces collapsed).
data_train.loc[6, 'Tweet_Parsed_7']

'israel unforgiven real terrorist laknat allah zionis pray gaza pray palestine'

### Data Test Without Translation

In [90]:
# Test row 60: raw tweet.
data_test.loc[60, 'Tweet']

"Fuck you, i'd raid for giratina three times, but i didn't get anyhing anjing kau bangsat,,,,"

In [91]:
# Test row 60 after step 1 (cleaning).
data_test.loc[60, 'Tweet_Parsed_1']

"Fuck you, i'd raid for giratina three times, but i didn't get anyhing anjing kau bangsat,,,,"

In [92]:
# Test row 60 after step 2 (lower-casing).
data_test.loc[60, 'Tweet_Parsed_2']

"fuck you, i'd raid for giratina three times, but i didn't get anyhing anjing kau bangsat,,,,"

In [93]:
# Test row 60 after step 3 (contractions expanded, punctuation removed).
data_test.loc[60, 'Tweet_Parsed_3']

'fuck you i d raid for giratina three times but i did not get anyhing anjing kau bangsat '

In [94]:
# Test row 60 after step 4 (normalisation, digit and filler-word removal).
data_test.loc[60, 'Tweet_Parsed_4']

'fuck you i di raid for giratina three times but i did not get anything anjing kamu bangsat '

In [95]:
# Test row 60 after step 5 (stemming / lemmatisation).
data_test.loc[60, 'Tweet_Parsed_5']

'fuck you i di raid for giratina three time but i do not get anything anjing kamu bangsat '

In [96]:
# Test row 60 after step 6 (stop words removed; gaps still present).
data_test.loc[60, 'Tweet_Parsed_6']

'fuck    raid  giratina three time     get anything anjing  bangsat '

In [97]:
# Test row 60 after step 7 (extra spaces collapsed).
data_test.loc[60, 'Tweet_Parsed_7']

'fuck raid giratina three time get anything anjing bangsat'

### Data Train With Translation

In [46]:
# Translated pipeline, training row 0: raw tweet.
data_train_trans.loc[0, 'Tweet']

"- kdg will be kind enough to show you how to look classy (gamau pny tmn kampungan)\n- drinks only the finest wine or no drinking\n- when he's being modest, will accompany you to watch movies\n- doesnt listen to you that much but will know when ur not okay bcs he actually cares abt u"

In [47]:
# Translated pipeline, training row 0 after step 1 (cleaning).
data_train_trans.loc[0, 'Tweet_Parsed_1']

"- kdg will be kind enough to show you how to look classy (gamau pny tmn kampungan) - drinks only the finest wine or no drinking - when he's being modest, will accompany you to watch movies - doesnt listen to you that much but will know when ur not okay bcs he actually cares abt u"

In [48]:
# Translated pipeline, training row 0 after step 2 (lower-casing).
data_train_trans.loc[0, 'Tweet_Parsed_2']

"- kdg will be kind enough to show you how to look classy (gamau pny tmn kampungan) - drinks only the finest wine or no drinking - when he's being modest, will accompany you to watch movies - doesnt listen to you that much but will know when ur not okay bcs he actually cares abt u"

In [49]:
# Translated pipeline, training row 0 after step 3 (punctuation removed).
data_train_trans.loc[0, 'Tweet_Parsed_3']

' kdg will be kind enough to show you how to look classy gamau pny tmn kampungan drinks only the finest wine or no drinking when he being modest will accompany you to watch movies doesnt listen to you that much but will know when ur not okay bcs he actually cares abt u'

In [50]:
# Translated pipeline, training row 0 after step 4 (translated text, digits and filler words removed).
data_train_trans.loc[0, 'Tweet_Parsed_4']

'kadang akan cukup baik untuk menunjukkan kepada anda bagaimana terlihat berkelas tidak mau punya teman kampungan hanya minum anggur terbaik orang tidak minum ketika dia sedang sederhana akan menemani anda untuk menonton film tidak mendengarkan anda banyak tetapi akan tahu kapan anda tidak baik baik saja karena dia sebenarnya peduli tentang kamu'

In [51]:
# Translated pipeline, training row 0 after step 5 (stemming / lemmatisation).
data_train_trans.loc[0, 'Tweet_Parsed_5']

'kadang akan cukup baik untuk tunjuk kepada anda bagaimana lihat kelas tidak mau punya teman kampung hanya minum anggur baik orang tidak minum ketika dia sedang sederhana akan tani anda untuk tonton film tidak dengar anda banyak tetapi akan tahu kapan anda tidak baik baik saja karena dia benar peduli tentang kamu'

In [52]:
# Translated pipeline, training row 0 after step 6 (stop words removed; gaps still present).
data_train_trans.loc[0, 'Tweet_Parsed_6']

'kadang         lihat kelas    teman kampung  minum anggur  orang  minum    sederhana  tani   tonton film  dengar               peduli  '

In [53]:
# Translated pipeline, training row 0 after step 7 (extra spaces collapsed).
data_train_trans.loc[0, 'Tweet_Parsed_7']

'kadang lihat kelas teman kampung minum anggur orang minum sederhana tani tonton film dengar peduli'

## Fixing Data

In [54]:
def fixData(data, stemming=True, stopword=True):
    """Return a copy of ``data`` that keeps one preprocessing stage as ``Tweet_Parsed``.

    The stage that is kept:

    * ``stopword == True``  -> ``Tweet_Parsed_7`` (``stemming`` is then ignored)
    * ``stemming == True``  -> ``Tweet_Parsed_5``
    * otherwise             -> ``Tweet_Parsed_4``

    The other ``Tweet_Parsed_1`` .. ``Tweet_Parsed_7`` columns are dropped.
    The input frame itself is not modified.
    """
    if stopword == True:
        kept = 'Tweet_Parsed_7'
    elif stemming == True:
        kept = 'Tweet_Parsed_5'
    else:
        kept = 'Tweet_Parsed_4'

    stage_columns = [
        'Tweet_Parsed_1', 'Tweet_Parsed_2', 'Tweet_Parsed_3', 'Tweet_Parsed_4',
        'Tweet_Parsed_5', 'Tweet_Parsed_6', 'Tweet_Parsed_7',
    ]
    discarded = [column for column in stage_columns if column != kept]
    return data.drop(discarded, axis=1).rename(columns={kept: 'Tweet_Parsed'})

In [99]:
# "full" variant: keep the final stage (stemmed, stop words removed, spaces cleaned).
data_train_full, data_test_full, data_val_full = [
    fixData(split, stemming=True, stopword=True)
    for split in (data_train, data_test, data_val)
]

In [100]:
# "nostopword" variant: keep the stemmed text without stop-word removal.
data_train_nostopword, data_test_nostopword, data_val_nostopword = [
    fixData(split, stemming=True, stopword=False)
    for split in (data_train, data_test, data_val)
]

In [101]:
# "nostemstop" variant: keep the text before stemming and stop-word removal.
data_train_nostemstop, data_test_nostemstop, data_val_nostemstop = [
    fixData(split, stemming=False, stopword=False)
    for split in (data_train, data_test, data_val)
]

In [55]:
# Translated "full" variant: keep the final stage.
data_train_trans_full, data_test_trans_full, data_val_trans_full = [
    fixData(split, stemming=True, stopword=True)
    for split in (data_train_trans, data_test_trans, data_val_trans)
]

In [56]:
# Translated "nostopword" variant: keep the stemmed text (Tweet_Parsed_5)
# without stop-word removal, for all three splits.
data_train_trans_nostopword = fixData(data_train_trans,stemming=True,stopword=False)
data_test_trans_nostopword = fixData(data_test_trans,stemming=True,stopword=False)
# Fixed: the validation split used to be built with stopword=True, which made
# it identical to the "full" variant and inconsistent with train/test.
data_val_trans_nostopword = fixData(data_val_trans,stemming=True,stopword=False)

In [57]:
# Translated "nostemstop" variant: keep the text before stemming and stop-word
# removal (Tweet_Parsed_4), for all three splits.
data_train_trans_nostemstop = fixData(data_train_trans,stemming=False,stopword=False)
data_test_trans_nostemstop = fixData(data_test_trans,stemming=False,stopword=False)
# Fixed: the validation split used to be built with stemming=True, stopword=True,
# i.e. the fully processed text, which did not match train/test.
data_val_trans_nostemstop = fixData(data_val_trans,stemming=False,stopword=False)

## Empty String

In [58]:
def handleEmptyString(data):
    """Replace empty ``Tweet_Parsed`` values with a single space.

    Only rows whose text is exactly ``''`` are touched. The frame is modified
    in place and returned.
    """
    is_empty = data['Tweet_Parsed'] == ''
    data.loc[is_empty, ['Tweet_Parsed']] = " "
    return data

In [103]:
# Fill empty texts in the "full" variant.
data_train_full, data_test_full, data_val_full = [
    handleEmptyString(split)
    for split in (data_train_full, data_test_full, data_val_full)
]

In [104]:
# Fill empty texts in the "nostopword" variant.
data_train_nostopword, data_test_nostopword, data_val_nostopword = [
    handleEmptyString(split)
    for split in (data_train_nostopword, data_test_nostopword, data_val_nostopword)
]

In [105]:
# Fill empty texts in the "nostemstop" variant.
data_train_nostemstop, data_test_nostemstop, data_val_nostemstop = [
    handleEmptyString(split)
    for split in (data_train_nostemstop, data_test_nostemstop, data_val_nostemstop)
]

In [59]:
# Fill empty texts in the translated "full" variant.
data_train_trans_full, data_test_trans_full, data_val_trans_full = [
    handleEmptyString(split)
    for split in (data_train_trans_full, data_test_trans_full, data_val_trans_full)
]

In [60]:
# Fill empty texts in the translated "nostopword" variant.
data_train_trans_nostopword, data_test_trans_nostopword, data_val_trans_nostopword = [
    handleEmptyString(split)
    for split in (
        data_train_trans_nostopword,
        data_test_trans_nostopword,
        data_val_trans_nostopword,
    )
]

In [61]:
# Fill empty texts in the translated "nostemstop" variant.
data_train_trans_nostemstop, data_test_trans_nostemstop, data_val_trans_nostemstop = [
    handleEmptyString(split)
    for split in (
        data_train_trans_nostemstop,
        data_test_trans_nostemstop,
        data_val_trans_nostemstop,
    )
]

## Save Data

In [106]:
# Persist the "full" variant (semicolon-separated, without the index).
data_train_full.to_csv(path_or_buf="../Data/data_train_full_preprocessed.csv", sep=";", index=False)
data_test_full.to_csv(path_or_buf="../Data/data_test_full_preprocessed.csv", sep=";", index=False)
data_val_full.to_csv(path_or_buf="../Data/data_val_full_preprocessed.csv", sep=";", index=False)

In [107]:
# Persist the "nostopword" variant (semicolon-separated, without the index).
data_train_nostopword.to_csv(path_or_buf="../Data/data_train_nostopword_preprocessed.csv", sep=";", index=False)
data_test_nostopword.to_csv(path_or_buf="../Data/data_test_nostopword_preprocessed.csv", sep=";", index=False)
data_val_nostopword.to_csv(path_or_buf="../Data/data_val_nostopword_preprocessed.csv", sep=";", index=False)

In [108]:
# Persist the "nostemstop" variant (semicolon-separated, without the index).
data_train_nostemstop.to_csv(path_or_buf="../Data/data_train_nostemstop_preprocessed.csv", sep=";", index=False)
data_test_nostemstop.to_csv(path_or_buf="../Data/data_test_nostemstop_preprocessed.csv", sep=";", index=False)
data_val_nostemstop.to_csv(path_or_buf="../Data/data_val_nostemstop_preprocessed.csv", sep=";", index=False)

In [62]:
# Persist the translated "full" variant (semicolon-separated, without the index).
data_train_trans_full.to_csv(path_or_buf="../Data/data_train_trans_full_preprocessed.csv", sep=";", index=False)
data_test_trans_full.to_csv(path_or_buf="../Data/data_test_trans_full_preprocessed.csv", sep=";", index=False)
data_val_trans_full.to_csv(path_or_buf="../Data/data_val_trans_full_preprocessed.csv", sep=";", index=False)

In [63]:
# Persist the translated "nostopword" variant (semicolon-separated, without the index).
data_train_trans_nostopword.to_csv(path_or_buf="../Data/data_train_trans_nostopword_preprocessed.csv", sep=";", index=False)
data_test_trans_nostopword.to_csv(path_or_buf="../Data/data_test_trans_nostopword_preprocessed.csv", sep=";", index=False)
data_val_trans_nostopword.to_csv(path_or_buf="../Data/data_val_trans_nostopword_preprocessed.csv", sep=";", index=False)

In [64]:
# Persist the translated "nostemstop" variant (semicolon-separated, without the index).
data_train_trans_nostemstop.to_csv(path_or_buf="../Data/data_train_trans_nostemstop_preprocessed.csv", sep=";", index=False)
data_test_trans_nostemstop.to_csv(path_or_buf="../Data/data_test_trans_nostemstop_preprocessed.csv", sep=";", index=False)
data_val_trans_nostemstop.to_csv(path_or_buf="../Data/data_val_trans_nostemstop_preprocessed.csv", sep=";", index=False)