### Setup

#### Importing libraries

In [0]:
# Basic libraries
import os
import pickle
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
from google.colab import drive
drive.mount('/content/drive')

# Text Pre-Processing
# Tokenize
import re
nltk.download('punkt')

# Lemmatization
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Stopword Removal
import spacy
# from spacy.lang.id.stop_words import STOP_WORDS
!python3 -m spacy download en
from nltk.corpus import stopwords
nltk.download('stopwords')

# N-gram
from nltk import ngrams
from sklearn.feature_extraction.text import CountVectorizer

#### Defining functions

In [0]:
def tokenize(sentence, min_length=4):
    """Turn raw text into a list of lowercase tokens.

    Steps: coerce the input to ``str``, replace every run of non-word
    characters and underscores with a space, delete digits, split with
    ``nltk.word_tokenize``, lowercase, and drop short tokens.

    Parameters
    ----------
    sentence : object
        Text to tokenize. Non-string values are converted with ``str()``.
        Missing values (``None`` or a float NaN) yield an empty list instead
        of the literal tokens ``'none'`` / ``'nan'``.
    min_length : int, optional
        Minimum number of characters a token must have to be kept.
        Defaults to 4, i.e. tokens of 3 characters or fewer are dropped.

    Returns
    -------
    list of str
        The surviving tokens, lowercased, in their original order.
    """
    # Missing cell: None, or NaN (the only float that is not equal to itself)
    if sentence is None or (isinstance(sentence, float) and sentence != sentence):
        return []

    text = str(sentence)

    # Replace punctuation, symbols and underscores with a space
    text = re.sub(r'[\W_]+', ' ', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Tokenize
    tokens = nltk.word_tokenize(text)

    # Lowercase and keep only tokens with at least `min_length` characters
    return [word.lower() for word in tokens if len(word) >= min_length]

def remove_stopwords(sentence):
    """Filter stop words out of a sequence of tokens.

    Each element of ``sentence`` is compared, through its ``str()`` form,
    against the module-level ``stop_words`` collection. Elements that are not
    found there are returned, unchanged and in their original order, as a new
    list.
    """
    return [token for token in sentence if str(token) not in stop_words]
    
def make_bigrams(texts):
    """Return the result of indexing the global ``bigram_mod`` with ``texts``.

    NOTE(review): ``bigram_mod`` is not defined in the visible part of this
    notebook -- presumably a trained phrase model; it must exist in the kernel
    before this function is called.
    """
    bigrams = bigram_mod[texts]
    return bigrams

def make_trigrams(texts):
    """Index ``bigram_mod`` with ``texts``, then ``trigram_mod`` with that result.

    NOTE(review): neither ``bigram_mod`` nor ``trigram_mod`` is defined in the
    visible part of this notebook -- presumably trained phrase models; both
    must exist in the kernel before this function is called.
    """
    bigrams = bigram_mod[texts]
    return trigram_mod[bigrams]

def get_wordnet_pos(word):
    """Return the WordNet POS constant to use when lemmatizing ``word``.

    The word is tagged on its own with ``nltk.pos_tag``. The upper-cased first
    letter of the resulting tag selects adjective (J), noun (N), verb (V) or
    adverb (R); any other tag falls back to noun.
    """
    tagged = nltk.pos_tag([word])
    initial = tagged[0][1][0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unmapped tag resolve to noun
    return wordnet.NOUN

def lemmatization_nltk(word):
    """Lemmatize a single word with NLTK's ``WordNetLemmatizer``.

    The part of speech handed to the lemmatizer is whatever
    ``get_wordnet_pos(word)`` returns. A new lemmatizer instance is created
    on every call.
    """
    return WordNetLemmatizer().lemmatize(word, get_wordnet_pos(word))

def lemmatization_spacy(sentence, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize a tokenized sentence with the global spaCy pipeline ``nlp``.

    The tokens in ``sentence`` are joined with single spaces and passed to
    ``nlp``. The lemma (``token.lemma_``) of every resulting token whose POS
    tag (``token.pos_``) appears in ``allowed_postags`` is returned, in order.

    POS tag reference: https://spacy.io/api/annotation
    """
    parsed = nlp(" ".join(sentence))
    return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

# spaCy English pipeline, loaded with the parser and NER components disabled.
# NOTE(review): 'en' is a spaCy 2.x shortcut name; spaCy 3+ expects a full
# package name such as 'en_core_web_sm' -- confirm against the installed version.
nlp = spacy.load('en', disable=['parser', 'ner'])

# Stop word list: NLTK's English stop words plus a few extra terms
stop_words = stopwords.words('english') + [
    'from', 'subject', 're', 'edu', 'use', 'after', 'able', 'at',
]

#### Loading data

In [0]:
# Working folder on the mounted Google Drive
# (the folder name below is a placeholder -- replace it with your own)
root = '/content/drive/My Drive/isi_nama_folder_disini'

# Read the input CSV (placeholder file name) into a DataFrame
df = pd.read_csv(root+'/nama_file.csv')

# Report the frame's dimensions, then preview its first rows
print('Row: ' + str(len(df.index)) + '\n' + 'Column: ' + str(len(df.columns)))
df.head()


### Text Pre-Processing

#### Tokenize, Stopwords, Lemmatize

In [0]:
col = 'example' # Name of the column where the text data is stored in the csv file

# Tokenize
df['tokenized'] = df[col].apply(tokenize)

# Remove stopword
df['nostops'] = df.tokenized.apply(remove_stopwords)

# Lemmatize using spacy (keeps nouns and verbs only)
df['lemmatized'] = df.nostops.apply(lemmatization_spacy, allowed_postags=['NOUN', 'VERB'])

# Lemmatize using NLTK (alternative to the spaCy step above)
# df['lemmatized'] = df.nostops.apply(
#     lambda words: [lemmatization_nltk(word) for word in words])

print('Row: ' + str(df.shape[0]) + '\n' + 'Column: ' + str(df.shape[1]))

# Save file to Google Drive
df.to_csv(root+'/nama_file_baru.csv')

# Preview the result. This has to stay the last statement of the cell: a bare
# expression is only rendered when it is the final one.
df.head()