In [9]:
import pandas as pd
from nltk.tokenize import word_tokenize
import string

In [10]:
# Read the reviews CSV and label its column "Review"; because `names` is passed without
# `header`, pandas treats the first line of the file as data rather than as a header row.
df = pd.read_csv("new_reviews.csv", names=["Review"])
# Row count before any cleaning.
len(df)

4798

### Data Cleansing - Drop NaN values and duplicate reviews

In [11]:
# Discard every row that contains a NaN, then renumber the index from 0
# (drop=True keeps the old index from being added back as a column).
df = df.dropna().reset_index(drop=True)
# Rows remaining after the NaN filter.
len(df)

4159

In [12]:
# Keep only the first occurrence of each repeated row, then renumber the index from 0.
df = df.drop_duplicates().reset_index(drop=True)
# Rows remaining after de-duplication.
len(df)

864

### Data Cleansing - Preprocessing

In [13]:
# In the cells visible here, `re` is referenced only by the disabled regex helper below;
# the active emoji removal goes through `demoji`.
import re
import demoji
# Download demoji's emoji data; the cell output shows it being cached to
# .demoji/codes.json under the user's home directory.
# NOTE(review): newer demoji releases reportedly bundle this data and deprecate
# download_codes() - confirm against the installed version before removing this call.
demoji.download_codes()

# Disabled regex-based alternative that strips four Unicode emoji ranges.
# It is not used anywhere in the visible cells, which call demoji.replace() instead.
# def deEmojify(text):
#     regrex_pattern = re.compile(pattern = "["
#         u"\U0001F600-\U0001F64F"  # emoticons
#         u"\U0001F300-\U0001F5FF"  # symbols & pictographs
#         u"\U0001F680-\U0001F6FF"  # transport & map symbols
#         u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
#                            "]+", flags = re.UNICODE)
#     return regrex_pattern.sub(r'',text)

Downloading emoji data ...
... OK (Got response in 0.74 seconds)
Writing emoji data to C:\Users\rachm\.demoji\codes.json ...
... OK


In [15]:
# Walk the first two reviews through each cleaning stage and print the intermediate text.
for sentence in df["Review"].iloc[:2]:
    print("Before:", sentence)
    # Stage 1: strip emoji from the raw review.
    sentence = demoji.replace(sentence)
    print()
    print("Hapus Emoji:", sentence)
    # Stage 2: whitespace tokenisation; punctuation is not removed, so it stays
    # attached to the neighbouring word.
    sentence = sentence.split()
    print()
    print("Tokenize:", sentence)
    print()

Before: Tiap makan kesini ngga pernah cuma 1 atau 2 porsi, pasti nambah terus, karena emang pas banget rasanya di lidah. Black peppernya yang paling enak disini.  Emang selalu waiting list, cuma sei sapi lamalera absolutely worth to wait! Recommended!

Hapus Emoji: Tiap makan kesini ngga pernah cuma 1 atau 2 porsi, pasti nambah terus, karena emang pas banget rasanya di lidah. Black peppernya yang paling enak disini.  Emang selalu waiting list, cuma sei sapi lamalera absolutely worth to wait! Recommended!

Tokenize: ['Tiap', 'makan', 'kesini', 'ngga', 'pernah', 'cuma', '1', 'atau', '2', 'porsi,', 'pasti', 'nambah', 'terus,', 'karena', 'emang', 'pas', 'banget', 'rasanya', 'di', 'lidah.', 'Black', 'peppernya', 'yang', 'paling', 'enak', 'disini.', 'Emang', 'selalu', 'waiting', 'list,', 'cuma', 'sei', 'sapi', 'lamalera', 'absolutely', 'worth', 'to', 'wait!', 'Recommended!']

Before: Se’i sapi sambel Lu’at nya dabest!   Pertama kesini pas jam 8 malam tp udh close order dan cuma bisa take awa

## Tokenized

In [16]:
import time
from tqdm import tqdm

# Build the token table: one row per whitespace-separated token, tagged with the
# 1-based number of the review it came from ("Kalimat #").
#
# Rows are collected in a plain list and turned into a DataFrame once at the end.
# Growing a DataFrame inside the loop (DataFrame.append / per-token pd.concat) is
# quadratic, is unavailable on pandas 2.x, and pd.concat raises on an empty list,
# which happens whenever a review has no tokens left after emoji removal.
token_rows = []
for kalimat, sentence in enumerate(tqdm(df["Review"], desc="Loading..."), start=1):
    # Lower-case first, then strip emoji; split() breaks on any run of whitespace,
    # so punctuation stays attached to the neighbouring word.
    sentence = demoji.replace(sentence.lower())
    token_rows.extend((kalimat, word) for word in sentence.split())

# An empty token_rows still yields a frame with the two expected columns.
df_hasil = pd.DataFrame(token_rows, columns=['Kalimat #', 'Word'])

Loading...: 100%|████████████████████████████████████████████████████████████████████| 864/864 [00:29<00:00, 29.40it/s]


In [17]:
# Display the token table: one row per token with the number of the review it belongs to.
df_hasil

Unnamed: 0,Kalimat #,Word
0,1,tiap
1,1,makan
2,1,kesini
3,1,ngga
4,1,pernah
...,...,...
51275,864,even
51276,864,they
51277,864,left
51278,864,the


In [18]:
# Write the token table to CSV; index=False leaves the row index out of the file.
df_hasil.to_csv('reviews_tokenized.csv',index=False)

## Per Kalimat

In [26]:
import time
from tqdm import tqdm

# Sentence-level preprocessing: lower-case each review and strip its emoji,
# keeping one cleaned string per review (no tokenisation here).
preprocessed = [
    demoji.replace(review.lower())
    for review in tqdm(df["Review"], desc="Loading...")
]

# Single-column frame holding the cleaned reviews in their original order.
df_sentence = pd.DataFrame({'text': preprocessed})

Loading...: 100%|███████████████████████████████████████████████████████████████████| 864/864 [00:01<00:00, 609.56it/s]


In [27]:
# Display the cleaned reviews (one lower-cased, emoji-free string per row).
df_sentence

Unnamed: 0,text
0,tiap makan kesini ngga pernah cuma 1 atau 2 po...
1,se’i sapi sambel lu’at nya dabest! pertama k...
2,makanan yang sebenarnya simple tapi enak bange...
3,"enak bgtttt, sambel enak, daging sho good gata..."
4,rasa sei nya blm ada yg ngalahin
...,...
859,cozy place with poor service. pelayan menginfo...
860,"instagramable place menurut gw, hrga makanan &..."
861,foto dan harga di aplikasi kurang jelas
862,i like the ambience here... and love the food ...


In [28]:
# Write the cleaned reviews to CSV; index=False leaves the row index out of the file.
df_sentence.to_csv('reviews_preprocessed.csv',index=False)

## Split Half

In [29]:
# Sanity check of the midpoint. True division returns a float here; the actual
# slicing in the following cells uses integer division (//).
len(df_sentence)/2

432.0

In [30]:
# First half: the rows positioned before the integer midpoint.
df_sentence.iloc[:len(df_sentence)//2]

Unnamed: 0,text
0,tiap makan kesini ngga pernah cuma 1 atau 2 po...
1,se’i sapi sambel lu’at nya dabest! pertama k...
2,makanan yang sebenarnya simple tapi enak bange...
3,"enak bgtttt, sambel enak, daging sho good gata..."
4,rasa sei nya blm ada yg ngalahin
...,...
427,mau makan sehat tapi enak dan murah? makan ras...
428,"i love thier concept! first, we have to know i..."
429,the best vegan dining i ever eat.. semua makan...
430,"pesen nasi timbel. enak bgt ""daging""nyaa kenye..."


In [31]:
# Second half: the rows from the integer midpoint onward (this half receives the
# extra row when the row count is odd).
df_sentence.iloc[len(df_sentence)//2:]

Unnamed: 0,text
432,the best burger i had in a while. this place w...
433,burger dari brother jonn & sons menurut aku ja...
434,tidak seenak yang aku bayangkan rekomen dari o...
435,seneng banget menemukan tempat burger di bandu...
436,burger terenak di bandung! lagi liburan cravin...
...,...
859,cozy place with poor service. pelayan menginfo...
860,"instagramable place menurut gw, hrga makanan &..."
861,foto dan harga di aplikasi kurang jelas
862,i like the ambience here... and love the food ...


In [32]:
# Split the cleaned reviews at the integer midpoint and write each half to its own CSV.
# When the row count is odd, the second file gets the extra row.
midpoint = len(df_sentence) // 2
first_half = df_sentence.iloc[:midpoint]
second_half = df_sentence.iloc[midpoint:]
first_half.to_csv('reviews_preprocessed_firsthalf.csv', index=False)
second_half.to_csv('reviews_preprocessed_secondhalf.csv', index=False)