### Import Libraries

In [18]:
import re
import emoji
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from kiwipiepy import Kiwi
from sklearn.utils import shuffle

In [19]:
nltk.download("stopwords")
nltk.download("punkt")
kiwi = Kiwi()
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nimisha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/nimisha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [20]:
stopwords_en = set(stopwords.words("english"))
stopwords_kr = ['도', '는', '다', '의', '가', '이', '은', '한', '에', '하', '고', '을', '를', '인', '듯', '과', '와', '네', '들', '듯', '지', '임', '게']

## Load Dataset

In [21]:
df_en = pd.read_csv("../data/english_only_data.csv")
df_ko = pd.read_csv("../data/korean_only_data.csv")

In [22]:
df_en["language"] = "en"
df_ko["language"] = "ko"

df = pd.concat([df_en, df_ko], ignore_index=True)
df = shuffle(df, random_state=70)
df.head(10)

Unnamed: 0,Comment,Sentiment,language
328712,They should refuse to move.... Gaza is their ...,Negative,en
431204,Where can I find the source code?,Negative,en
556515,전투의 재미도 없고 모험하는 맛도 지금에서 하기에는 타 게임에 많이 밀린다. 오직 ...,negative,ko
101150,Newsmax\nReal Americas Voice \nOne America New...,Negative,en
158853,Dude is doing anything and everything except t...,Negative,en
838334,아플 때 먹어서 좋았어요 하루 종일 토하고 나서 먹었죠 좋았습니다,positive,ko
665596,차돌 3갈비 1 갈비탕 사이다,positive,ko
131240,I'm Nigerian myself (Igbo) and I can really re...,Positive,en
33957,The narcissist Trump is slowly losing control ...,Negative,en
820871,커피 맛 괜찮아요 일행은 베이커리도 먹었는데 괜찮다고 합니다. 이 일대에서 분위기 ...,positive,ko


In [23]:
df["Comment"].isnull().sum()

np.int64(0)

In [24]:
df["Sentiment"] = df["Sentiment"].replace({"Negative": "negative", "Positive": "positive"})
df["Sentiment"].value_counts()

Sentiment
negative    440000
positive    440000
Name: count, dtype: int64

In [25]:
df["Sentiment"] = df["Sentiment"].replace({"negative": 0, "positive": "1"})
df["Sentiment"].value_counts()

Sentiment
0    440000
1    440000
Name: count, dtype: int64

### Traditional Preprocessing

In [12]:
def preprocess_for_traditional(text, language):
    text = emoji.demojize(text) 
    text = text.lower()

    text = re.sub(r"[^a-zA-Z0-9가-힣\s]", "", text)

    if language == "en":
        tokens = nltk.word_tokenize(text, language="english", preserve_line=True)
        tokens = [stemmer.stem(t) for t in tokens if t not in stopwords_en] 
    elif language == "ko":
        tokens = [word.form for word in kiwi.tokenize(text)]
        tokens = [stemmer.stem(t) for t in tokens if t not in stopwords_kr]

    return " ".join(tokens)

In [13]:
df["clean_comment"] = df.apply(
    lambda row: preprocess_for_traditional(row["Comment"], row["language"]), axis=1
)

In [14]:
df.head()

Unnamed: 0,Comment,Sentiment,language,clean_comment
328712,They should refuse to move.... Gaza is their ...,0,en,refus move gaza home
431204,Where can I find the source code?,0,en,find sourc code
556515,전투의 재미도 없고 모험하는 맛도 지금에서 하기에는 타 게임에 많이 밀린다. 오직 ...,0,ko,전투 재미 없 모험 맛 지금 에서 기 타 게임 많이 밀리 ᆫ다 오직 퀘스트 깨 재미...
101150,Newsmax\nReal Americas Voice \nOne America New...,0,en,newsmax real america voic one america news oan...
158853,Dude is doing anything and everything except t...,0,en,dude anyth everyth except talk wife


In [16]:
df = df[["clean_comment", "Sentiment", "language"]]
df.rename(columns={"clean_comment": "Comment", "language": "Language"}, inplace=True)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={"clean_comment": "Comment", "language": "Language"}, inplace=True)


Unnamed: 0,Comment,Sentiment,Language
328712,refus move gaza home,0,en
431204,find sourc code,0,en
556515,전투 재미 없 모험 맛 지금 에서 기 타 게임 많이 밀리 ᆫ다 오직 퀘스트 깨 재미...,0,ko
101150,newsmax real america voic one america news oan...,0,en
158853,dude anyth everyth except talk wife,0,en


In [17]:
df.to_csv("../data/traditional_preprocessed_data.csv", index=False)