# Importing Review Data

In [1]:
# Short identifiers of the review sources; each one is also the stem of a
# text file under reviews/ (e.g. "damas" -> reviews/damas.txt).
places = ["pahlawan_trip","pak_man","trowulan","damas","gun","cak_man","samut","presiden"]

# Raw review contents, one bytes object per place, in the same order as `places`.
reviews = []

for place in places:
    review_path = "reviews/" + place + ".txt"
    # Files are read in binary mode; decoding happens later during cleaning.
    with open(review_path, "rb") as handle:
        raw_bytes = handle.read()
    reviews.append(raw_bytes)

# Cleaning Review Data

In [2]:
import re
import string

# Accumulator for the cleaned review texts (filled by the loop below the function).
clean_phase_1 = []

def clean_text_phase1(text):
    """Normalize one raw review into lowercase, space-separated words.

    Steps, in order: decode UTF-8 bytes (if given bytes), lowercase, drop
    anything enclosed in square brackets, turn line breaks and punctuation
    into spaces, drop every token that contains a digit, and collapse runs
    of spaces into a single space.

    Parameters
    ----------
    text : bytes or str
        The raw review. Bytes are decoded as UTF-8; a str is used as-is,
        which keeps the function safe to call on already-decoded text
        (for example when the notebook cell is re-run).

    Returns
    -------
    str
        The cleaned text. Leading/trailing single spaces are not stripped.

    Raises
    ------
    UnicodeDecodeError
        If ``text`` is bytes that are not valid UTF-8.
    """
    # decode bytes to str; already-decoded text passes through untouched
    if isinstance(text, (bytes, bytearray)):
        text = text.decode('utf-8')

    # parse text to lowercase
    text = text.lower()
    # remove text inside square brackets (non-greedy, within a single line,
    # which is why this runs before line breaks are replaced)
    text = re.sub(r'\[.*?\]', '', text)
    # sub \r and \n with space
    text = text.replace('\r', ' ').replace('\n', ' ')
    # sub punctuation with space
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    # remove every token that contains at least one digit (e.g. "rp15000", "2x")
    text = re.sub(r'\w*\d\w*', '', text)
    # sub multiple space to single space
    text = re.sub(r' +', ' ', text)
    return text

# Run the phase-1 cleaner over every review, preserving order.
clean_phase_1.extend(clean_text_phase1(raw_review) for raw_review in reviews)

# From here on `reviews` refers to the cleaned texts.
reviews = clean_phase_1

# Combining Corpus Data

In [3]:
import pandas as pd
# Use the full option name: pandas only accepts partial names while they stay
# unambiguous, so the short form can break when new options are added.
pd.set_option('display.max_colwidth', 150)

# Human-readable place names, listed in the same order as `places`.
fullname_places = ["Bakso Bakar Pahlawan Trip","Bakso Bakar Pak Man","Bakso Trowulan","Bakso Damas","Bakso Gun","Bakso Kota Cak Man","Bakso Pak Samut","Bakso Presiden"]

# The three lists are paired purely by position, so a length mismatch would
# silently attach reviews or names to the wrong place.
assert len(places) == len(reviews) == len(fullname_places), \
    "places, reviews and fullname_places must have the same length"

# Map each place identifier to its review text.
reviews_by_place = dict(zip(places, reviews))

# One-column frame indexed by place identifier.
data = pd.Series(reviews_by_place).to_frame(name='reviews')

# assign() returns a new frame, so data_df never shares state with `data`.
data_df = data.assign(full_name=fullname_places)

data_df

Unnamed: 0,reviews,full_name
pahlawan_trip,kalo ke kota malang pasti salah satu makanan yang akan dicari adalah bakso banyak jenis bakso saat ini yang disajikan ada bakso keju bakso urat ba...,Bakso Bakar Pahlawan Trip
pak_man,malang identik dengan bakso nya dan ini adalah salah satu bakso yang legendaris di kota malang bakso bakar nya top bgt dagingnya kerasa sambalnya ...,Bakso Bakar Pak Man
trowulan,perbiji rupiah pilihan bakso bakar rasa manis dan pedas dengan bakso halus dan kasar urat disantap dengan kuah yang segar enak mantap maknyusss ma...,Bakso Trowulan
damas,ini untuk kedua kalinya makan di tempat ini pertama kali makan disini sangat nikmat dan pembelinya tidak serame sekarang kedua kali makan disini m...,Bakso Damas
gun,saya mencoba bakso gun di matos malang kita bisa pilih bakso pangsit atau tahu dll pilihan sesuai keinginan saya ambil macam total harga rp sangat...,Bakso Gun
cak_man,cita rasa yg khas dan penyajiannya yg masih original walau sedikit modern bakso ini menggambarkan ke khasan citarasa baso malang yg termasyhur pil...,Bakso Kota Cak Man
samut,rumah makan ini berada di sebuah ruko di kawasan perumahan tidar lahan parker tidak cukup luas untuk kendaraan roda empat rumah makannya tidak ter...,Bakso Pak Samut
presiden,salah satu tujuan kuliner legendaris di kota malang adalah bakso president letaknya di pinggir rel kereta tepatnya jl batanghari no rampal celaket...,Bakso Presiden


In [4]:
# Pickle the corpus DataFrame (review text plus full place name,
# indexed by place identifier) to corpus.pkl for later use
data_df.to_pickle("corpus.pkl")

# Document-Term Matrix

## Indonesian Stopwords

In [5]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Dictionary.ArrayDictionary import ArrayDictionary

def indonesianStopwordAndStemmer(text):
    """Remove stop words from an Indonesian review text.

    Despite the function name, no stemming is applied: only stop words are
    removed. The text first goes through the stop word remover created by
    Sastrawi's ``StopWordRemoverFactory`` and then through
    ``additionalStopWord`` for the project-specific extras.

    Parameters
    ----------
    text : str
        Review text to filter.

    Returns
    -------
    str
        The text with stop words removed.
    """
    # No stemmer is built here: stemming is disabled, and constructing one
    # on every call was pure overhead.
    stop_word_remover = StopWordRemoverFactory().create_stop_word_remover()
    output = stop_word_remover.remove(text)

    # additional, project-specific stop words
    return additionalStopWord(output)

# Extra stop words removed on top of the generic Indonesian ones so the
# remaining vocabulary is more meaningful for these reviews. Kept as a
# frozenset: built once, duplicate-free, constant-time membership checks.
ADDITIONAL_STOPWORDS = frozenset([
    'yg', 'sy', 'pak', 'di', 'yang', 'nya', 'cak', 'man', 'tidak',
    'ga', 'jadi', 'gun', 'damas', 'kota', 'bakwan', 'samut', 'samud',
    'pilih', 'banyak', 'bakar', 'sangat', 'tempat', 'pilihan',
    'mencoba', 'saya',
])

def additionalStopWord(text):
    """Drop the project-specific stop words from ``text``.

    The text is split on single spaces, every token that exactly matches an
    entry of ``ADDITIONAL_STOPWORDS`` (case-sensitive) is discarded, and the
    remaining tokens are joined back with single spaces. Empty tokens
    produced by consecutive spaces are kept, so spacing around surviving
    words is unchanged.

    Parameters
    ----------
    text : str
        Space-separated text to filter.

    Returns
    -------
    str
        The filtered text.
    """
    kept_words = [word for word in text.split(' ') if word not in ADDITIONAL_STOPWORDS]
    return ' '.join(kept_words)

# Stop-word-filtered version of every cleaned review, in the same order as `reviews`.
indonesian_reviews = [indonesianStopwordAndStemmer(cleaned_review) for cleaned_review in reviews]

## The DTM: split each review into word counts

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Count how often every vocabulary term occurs in each filtered review.
cv = CountVectorizer()
data_cv = cv.fit_transform(indonesian_reviews)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on old installations.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Document-term matrix: one row per place, one column per vocabulary term.
data_dtm = pd.DataFrame(data_cv.toarray(), index=places, columns=feature_names)
data_dtm

Unnamed: 0,abal,abis,abm,ac,ada,adalah,adany,adanya,aduhai,agak,...,wisatawan,wlwpn,worth,wow,yaa,yach,yah,yamin,yin,yummy
pahlawan_trip,0,0,0,0,3,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
pak_man,0,0,0,0,2,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,1
trowulan,1,1,2,0,3,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
damas,0,1,0,1,2,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
gun,0,0,0,0,2,2,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
cak_man,0,0,0,0,2,1,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
samut,0,0,0,0,3,1,0,0,0,0,...,0,0,0,0,0,0,2,0,0,0
presiden,0,0,0,0,2,0,0,0,1,1,...,0,0,0,1,1,1,0,0,0,0


In [7]:
# Pickle the document-term matrix (one row per place, one column per
# vocabulary term) to dtm.pkl for later use
data_dtm.to_pickle("dtm.pkl")

In [91]:
# Persist the stop-word-filtered review texts to clean_data.pkl for later use
import pickle

# The context manager closes (and therefore flushes) the file even if
# pickling fails, so the handle is never left open.
with open('clean_data.pkl', 'wb') as clean_data_file:
    pickle.dump(indonesian_reviews, clean_data_file)