# Text Data Preparation 

## Plot:
1. Convert all strings to lowercase
2. Tokenization
3. Filter nonstandard Indonesian words with Kamus Alay
4. Convert back to string
5. Normalize words (removing punctuation, non-ascii, whitespaces, etc)
6. Translate to English using Google Translate
7. Normalize words
8. Tokenization -> **Output with stopwords**
9. Convert back to string
10. Lemmatization and remove stopwords using spaCy -> **Output without stopwords**

In [None]:
import functools
import re
import string

from googletrans import Translator
from nltk.tokenize import word_tokenize

# Google Translate client (googletrans), shared by the translation step below.
# The Google Translate API allows about 5 calls/s, so errors are expected when
# many rows are translated back to back.
translator = Translator()
# NOTE(review): presumably makes googletrans raise on a failed request instead
# of returning an unusable result; the attribute spelling appears to be
# version-dependent -- confirm against the installed googletrans release.
translator.raise_Exception = True

# Load the Kamus Alay table and build a lookup used to replace nonstandard
# Indonesian words: first CSV column -> second CSV column.
standard_id_word = pd.read_csv("new_kamusalay.csv")
standard_id_word_dict = {}

# Select the two columns explicitly by position with .iloc. Indexing an
# iterrows() row with row[0] relies on positional fallback for string-labelled
# Series, which pandas has deprecated. setdefault keeps the first mapping seen
# when a word appears more than once, as before.
for nonstandard_word, standard_word in zip(
    standard_id_word.iloc[:, 0], standard_id_word.iloc[:, 1]
):
    standard_id_word_dict.setdefault(nonstandard_word, standard_word)

## Preprocessing steps

In [None]:
def get_lowercase_text(df, review_column='content'):
    """Add a ``string_text`` column with the lowercased review text.

    Args:
        df: DataFrame holding the raw reviews. It is modified in place.
        review_column: Name of the column containing the review text.
            Defaults to ``'content'``, the column this pipeline has always
            read, so existing callers are unaffected.

    Returns:
        The same DataFrame, with ``string_text`` added.
    """
    df['string_text'] = df[review_column].str.lower()
    return df
    
def tokenize_text(text):
    """Split ``text`` into a list of tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens
    
def get_tokenized_text(df):
    """Add a ``token`` column holding the tokenized ``string_text`` of each row.

    The DataFrame is modified in place and returned.
    """
    lowercase_reviews = df['string_text']
    df['token'] = lowercase_reviews.apply(tokenize_text)
    return df

def standardize_text(text):
    """Replace each token found in ``standard_id_word_dict`` with its mapped form.

    ``text`` is an iterable of tokens; tokens without a dictionary entry are
    kept unchanged. Returns a new list.
    """
    lookup = standard_id_word_dict.get
    return [lookup(token, token) for token in text]
    
def get_standardized_text(df):
    """Add a ``standardized_token`` column by standardizing each ``token`` list.

    The DataFrame is modified in place and returned.
    """
    token_lists = df['token']
    df['standardized_token'] = token_lists.apply(standardize_text)
    return df
    
def get_string(df):
    """Add a ``standardized_string`` column: each ``standardized_token`` list
    joined back into one space-separated string.

    The DataFrame is modified in place and returned.
    """
    token_lists = df['standardized_token']
    df['standardized_string'] = [
        ' '.join(str(token) for token in tokens) for tokens in token_lists
    ]
    return df
    
# Contraction rewrites, applied in this order. The bare "cant" rule is anchored
# on word boundaries so it only rewrites the standalone word: unanchored it
# also rewrote the inside of longer words ("cantik" -> "cannot ik",
# "significant" -> "significannot").
_CONTRACTION_RULES = (
    (re.compile(r"can't"), "cannot "),
    (re.compile(r"\bcant\b"), "cannot "),
    (re.compile(r"what's"), "what is "),
    (re.compile(r"i'm"), "i am "),
    (re.compile(r"'s"), " is "),
    (re.compile(r"'ve"), " have "),
    (re.compile(r"n't"), " not "),
    (re.compile(r"'re"), " are "),
    (re.compile(r"'d"), " would "),
    (re.compile(r"'ll"), " will "),
)
_DIGIT_RUN = re.compile(r"\d+")
_WHITESPACE_RUN = re.compile(r"\s+")
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def normalize_text(text):
    """Return ``text`` with contractions expanded and noise characters removed.

    Steps, in order:
      1. Expand a fixed set of English contractions.
      2. Replace every non-ASCII character with ``?`` (removed again in step 4).
      3. Drop all digits.
      4. Drop all ASCII punctuation.
      5. Collapse whitespace runs to single spaces and strip both ends.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string (possibly empty).
    """
    for pattern, replacement in _CONTRACTION_RULES:
        text = pattern.sub(replacement, text)

    # 'replace' turns each non-ASCII character into '?', which the punctuation
    # pass below then strips.
    text = text.encode('ascii', 'replace').decode('ascii')
    text = _DIGIT_RUN.sub("", text)
    text = text.translate(_PUNCTUATION_TABLE)
    text = _WHITESPACE_RUN.sub(" ", text)
    return text.strip()
    
def get_normalized_text(df):
    """Add a ``normalized_text`` column by normalizing ``standardized_string``.

    The DataFrame is modified in place and returned.
    """
    standardized_strings = df['standardized_string']
    df['normalized_text'] = standardized_strings.apply(normalize_text)
    return df
    
def get_translated_text(df):
    """Add an ``en_with_stopwords`` column with the English translation of
    ``normalized_text``.

    Each row is passed to ``translator.translate`` with ``dest='en'``; the
    ``text`` attribute of every result is stored. The DataFrame is modified in
    place and returned.
    """
    translations = df['normalized_text'].apply(translator.translate, dest='en')
    df['en_with_stopwords'] = translations.apply(lambda result: getattr(result, 'text'))
    return df
    
def get_normalized_en_with_stopwords(df):
    """Add a ``normalized_en_with_stopwords`` column by normalizing the
    translated text in ``en_with_stopwords``.

    The DataFrame is modified in place and returned.
    """
    translated_text = df['en_with_stopwords']
    df['normalized_en_with_stopwords'] = translated_text.apply(normalize_text)
    return df
    
def get_tokenized_en_with_stopwords(df):
    """Add an ``en_token_with_stopwords`` column by tokenizing
    ``normalized_en_with_stopwords``.

    The DataFrame is modified in place and returned.
    """
    normalized_english = df['normalized_en_with_stopwords']
    df['en_token_with_stopwords'] = normalized_english.apply(tokenize_text)
    return df
    
def get_en_string(df):
    """Add an ``en_string_with_stopwords`` column: each
    ``en_token_with_stopwords`` list joined into one space-separated string.

    The DataFrame is modified in place and returned.
    """
    english_token_lists = df['en_token_with_stopwords']
    df['en_string_with_stopwords'] = [
        ' '.join(str(token) for token in tokens) for tokens in english_token_lists
    ]
    return df
    
@functools.lru_cache(maxsize=None)
def _load_spacy_model(model_name):
    """Load a spaCy pipeline once per model name and reuse it afterwards."""
    return spacy.load(model_name)


def __get_lemmas(text):
    """Return the lemmas of ``text``, skipping stopwords, punctuation and pronouns.

    Args:
        text: English string to analyse.

    Returns:
        A list with the ``lemma_`` of every token that is not a stopword, not
        punctuation, and not tagged ``PRON``.
    """
    # This function runs once per DataFrame row; loading the model inside it
    # reloaded 'en_core_web_lg' from disk for every row. The cached loader
    # pays that cost only on the first call.
    nlp = _load_spacy_model('en_core_web_lg')
    return [
        token.lemma_
        for token in nlp(text)
        if not token.is_stop and not token.is_punct and token.pos_ != 'PRON'
    ]
    
def get_text_without_stopwords(df):
    """Add an ``en_token_without_stopwords`` column holding the filtered lemmas
    of each ``en_string_with_stopwords`` value.

    The DataFrame is modified in place and returned.
    """
    english_strings = df['en_string_with_stopwords']
    df['en_token_without_stopwords'] = english_strings.apply(__get_lemmas)
    return df

## Compiling all preprocessing steps

In [None]:
def preprocess(df):
    """Run every preprocessing step on ``df`` in order and return the result.

    Each step receives the DataFrame returned by the previous one and adds its
    own output column.
    """
    steps = (
        get_lowercase_text,
        get_tokenized_text,
        get_standardized_text,
        get_string,
        get_normalized_text,
        get_translated_text,
        get_normalized_en_with_stopwords,
        get_tokenized_en_with_stopwords,
        get_en_string,
        get_text_without_stopwords,
    )
    for step in steps:
        df = step(df)
    return df

In [None]:
# Load the reviews to preprocess; the pipeline reads the text from the
# 'content' column.
df = pd.read_csv('PlayStoreReview.csv')

In [None]:
# Run the full pipeline; each step adds its output column to df.
df = preprocess(df)