In [17]:
import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import re
import os
import time

In [18]:
# Keep the English stop words in a set so the per-token membership test in
# remove_stopwords is a hash lookup rather than a list scan.
stop_words = set(stopwords.words('english'))

In [19]:
# Global vocabulary: add_to_dictionary appends each token it has not seen yet,
# and save_dictionary writes the list out one token per line.
dictionary = []

In [20]:
# Load the raw CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("data/dataset/train-balanced-sarcasm.csv")

In [21]:
# Keep only the two text columns and the `label` column.
df_new = df[['parent_comment','comment','label']]

In [22]:
# Draw a fixed-size, seeded sample so the notebook is reproducible. Sampling
# from `df` (not from `df_new` itself) keeps this cell idempotent on re-run.
SAMPLE_SIZE = 100000
RANDOM_SEED = 42
df_new = df[['parent_comment', 'comment', 'label']].sample(
    n=SAMPLE_SIZE, random_state=RANDOM_SEED
)

In [23]:
# Sanity check: one row per sampled comment, three selected columns.
df_new.shape

(100000, 3)

In [24]:
# Preview the first few sampled rows.
df_new.head()

Unnamed: 0,parent_comment,comment,label
651304,"They got a few alright players, getting pretty...",I think you forgot the,1
721274,So Kanye?,"Yes, exactly like Kanye.",1
270775,im super drunk but im prty sure thast not why ...,Nazi gangbanger tattoos.,0
302972,I don't understand the VICE reporters closing ...,"They weren't talking about Ramadi right there,...",0
752252,This kills the toilet.,F,0


In [25]:
def remove_stopwords(tokens):
    """Lower-case ``tokens`` and drop those present in ``stop_words``.

    Returns a new list of lower-cased tokens; the input is left untouched.
    """
    lowered_tokens = (token.lower() for token in tokens)
    return [token for token in lowered_tokens if token not in stop_words]

In [26]:
def get_pos_tag(token):
    """Return the WordNet part-of-speech constant for a single token.

    The token is tagged on its own with ``nltk.pos_tag`` and the first letter
    of the resulting tag selects the WordNet category. Tags that do not start
    with N, V, J or R fall back to ``wordnet.NOUN``.
    """
    tag = nltk.pos_tag([token])[0][1]
    wordnet_pos_by_initial = {
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'J': wordnet.ADJ,
        'R': wordnet.ADV,
    }
    return wordnet_pos_by_initial.get(tag[:1], wordnet.NOUN)

In [27]:
def lemmatize(tokens):
    """Lemmatize every token in place and return the same list object.

    Each entry is replaced by its WordNet lemma, using the part of speech that
    ``get_pos_tag`` reports for that token.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    for position, word in enumerate(tokens):
        part_of_speech = str(get_pos_tag(word))
        tokens[position] = wordnet_lemmatizer.lemmatize(word, pos=part_of_speech)
    return tokens

In [28]:
# Hash index mirroring ``dictionary`` so membership tests do not scan the list.
# 'owner' remembers which list object the index was built from.
_dictionary_index = {'owner': None, 'tokens': set()}


def add_to_dictionary(tokens):
    """Append every token not yet present to the global ``dictionary`` list.

    First-seen order is preserved. Membership is checked against a set that
    mirrors ``dictionary``; scanning the list for every token made building
    the vocabulary quadratic in its size.

    Args:
        tokens: Iterable of hashable tokens (strings in this notebook).
    """
    index = _dictionary_index
    # Rebuild the mirror if `dictionary` was rebound (e.g. the cell defining it
    # was re-run) or modified by code other than this function.
    if index['owner'] is not dictionary or len(index['tokens']) != len(dictionary):
        index['owner'] = dictionary
        index['tokens'] = set(dictionary)
    seen = index['tokens']
    for token in tokens:
        if token not in seen:
            seen.add(token)
            dictionary.append(token)

In [29]:
def save_dictionary(path='data/processed/dictionary.txt'):
    """Write the global ``dictionary`` to ``path``, one token per line.

    Args:
        path: Destination file. Defaults to the location the notebook checks
            before deciding whether to rebuild the dictionary.
    """
    parent_dir = os.path.dirname(path)
    if parent_dir:
        # Create the output folder first: without it open() raises
        # FileNotFoundError and the freshly built dictionary is never saved.
        os.makedirs(parent_dir, exist_ok=True)
    with open(path, 'w', encoding='utf-8') as file:
        file.writelines("%s\n" % word for word in dictionary)

In [30]:
def create_dictionary(dataset):
    """Build the global dictionary from ``dataset`` and save it to disk.

    For every row the ``parent_comment`` and ``comment`` texts are stripped of
    non-letter characters, joined, tokenized, stop-word filtered and
    lemmatized; the resulting tokens are passed to ``add_to_dictionary``.
    ``save_dictionary`` is called once after all rows are processed.

    Args:
        dataset: DataFrame with ``parent_comment`` and ``comment`` columns.
    """
    non_letters = re.compile(r'[^a-zA-Z]')
    # Iterating the two columns directly avoids building a Series per row,
    # which is what iterrows() does.
    for parent_comment, comment in zip(dataset['parent_comment'], dataset['comment']):
        # Missing values arrive as NaN; str(NaN) is "nan", which would add a
        # spurious "nan" token to the dictionary. Treat them as empty text.
        parent_text = '' if pd.isna(parent_comment) else str(parent_comment)
        comment_text = '' if pd.isna(comment) else str(comment)
        combined = non_letters.sub(' ', parent_text) + " " + non_letters.sub(' ', comment_text)
        tokens_comment = word_tokenize(combined)
        tokens_comment = remove_stopwords(tokens_comment)
        tokens_comment = lemmatize(tokens_comment)
        add_to_dictionary(tokens_comment)
    save_dictionary()

In [31]:
# Build the dictionary once and cache it on disk. When the cache already
# exists, load it so `dictionary` is populated on a fresh kernel as well
# (otherwise it would stay empty and every later cell would see 0 tokens).
dictionary_path = 'data/processed/dictionary.txt'
if not os.path.isfile(dictionary_path):
    starttime = time.time()
    create_dictionary(df_new)
    endtime = time.time()
    print("Time to create dictionary")
    print(endtime - starttime)
else:
    with open(dictionary_path, encoding='utf-8') as file:
        dictionary = [line.rstrip('\n') for line in file if line.strip()]

Time to create dictionary
467.5977520942688


In [32]:
# Number of distinct tokens currently held in the in-memory dictionary.
len(dictionary)

67735