In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import re
import os
import time

In [2]:
stop_words = set(stopwords.words('english'))

In [3]:
dictionary = []

In [4]:
embeddings = {}

In [5]:
df = pd.read_csv("data/dataset/train-balanced-sarcasm.csv")

In [6]:
df_new = df[['parent_comment','comment','label']]

In [7]:
df_new = df_new.sample(100000)

In [8]:
df_new.shape

(100000, 3)

In [9]:
df_new.head()

Unnamed: 0,parent_comment,comment,label
216199,SSSSSNEK,"Having lived in Germany for a bit, if I saw Ge...",1
237378,"If I'm Lue, Shump just played his final mins t...",for sure this game,0
191594,"dati pa, ngayon lang ako nag post dahil na lat...",Ingat ka sa sinasabi mo baka may magalit kasi ...,1
884645,Denmark.,Tolerance towards hate groups,0
705286,My guess is the only reason they made it a blu...,Yea...i would have been totally pissed if I go...,1


In [10]:
def remove_stopwords(tokens):
    tokens_wo_stopwords = []
    for i in range(0,len(tokens)):
        if tokens[i].lower() not in stop_words:
            tokens_wo_stopwords.append(tokens[i].lower())
    return tokens_wo_stopwords

In [11]:
def get_pos_tag(token):
    pos_tag = nltk.pos_tag([token])[0][1]
    if pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [12]:
def lemmatize(tokens):
    lemmatizer = WordNetLemmatizer()
    for i in range(0,len(tokens)):
        tokens[i] = lemmatizer.lemmatize(tokens[i],pos=str(get_pos_tag(tokens[i])))
    return tokens

In [13]:
def add_to_dictionary(tokens):
    for token in tokens:
        if token not in dictionary:
            dictionary.append(token)

In [14]:
def save_dictionary():
    with open('data/processed/dictionary.txt','w') as file:
        file.writelines("%s\n" % word for word in dictionary)

In [15]:
def read_dictionary():
    with open('data/processed/dictionary.txt','r') as file:
        temp = file.read().splitlines()
        for i in range(0,len(temp)):
            dictionary.append(temp[i])

In [22]:
def preprocess(sentence):
    processed_sentence = re.sub(r'[^a-zA-Z]', ' ', sentence)
    tokens_comment = word_tokenize(processed_sentence)
    tokens_comment = remove_stopwords(tokens_comment)
    tokens_comment = lemmatize(tokens_comment)
    return tokens_comment

In [23]:
def create_dictionary(dataset):
    for index,row in dataset.iterrows():
        tokens_comment = preprocess(str(row['parent_comment']) + " " + str(row['comment']))
        add_to_dictionary(tokens_comment)
    save_dictionary()

In [18]:
def populate_embeddings_dict():
    starttime = time.time()
    with open('data/processed/glove.6B.300d.txt','r') as file:
        for line in file:
            values = line.split()
            word = values[0]
            word_embedding = np.asarray(values[1:])
            embeddings[word] = word_embedding
    endtime = time.time()
    print("Time taken to load embeddings:- ")
    print(endtime - starttime)

In [19]:
def embedding_lookup(x,embedding_dim=300):
    if(len(embeddings) == 0):
        populate_embeddings_dict()
    embedding = []
    for i in range(0,len(x)):
        if(x[i] in embeddings):
            embedding.append(embeddings[x[i]])
        else:
            zero_arr = np.zeros(embedding_dim).tolist()
            embedding.append(zero_arr)
    embedding = np.array(embedding)
    return embedding

In [20]:
if not os.path.isfile('data/processed/dictionary.txt'):
    starttime = time.time()
    create_dictionary(df_new)
    endtime = time.time()
    print("Time to create dictionary")
    print(endtime - starttime)
else:   
    read_dictionary()

Time to create dictionary
430.11281394958496


In [21]:
len(dictionary)

67028

In [26]:
embeddings = embedding_lookup(preprocess(df_new['comment'].iloc[0]))

In [29]:
embeddings

array([['-0.36633', '0.48371', '-0.31369', ..., '0.22848', '0.023664',
        '-0.075187'],
       ['0.046596', '0.18904', '-0.46218', ..., '-0.54401', '0.72344',
        '0.0443'],
       ['0.082004', '-0.12691', '-0.03035', ..., '0.23209', '0.19432',
        '0.18716'],
       ...,
       ['-0.14124', '-0.11836', '-0.30782', ..., '-0.19883', '-0.061105',
        '0.11568'],
       ['-0.24586', '-0.28032', '-0.23196', ..., '0.20135', '0.11195',
        '0.099126'],
       ['0.34831', '0.13124', '0.088767', ..., '0.055823', '0.30498',
        '-0.036958']], dtype='<U11')

In [27]:
preprocess(df_new['comment'].iloc[0])

['live',
 'germany',
 'bit',
 'saw',
 'german',
 'color',
 'follow',
 's',
 'get',
 'little',
 'nervous']

In [30]:
embeddings[7]

array(['0.31001', '0.046907', '-0.31283', '-0.2605', '-0.3749',
       '-0.13487', '-0.2176', '0.59326', '-0.30687', '-1.4446',
       '-0.26943', '0.45318', '-0.14138', '0.030587', '0.13575',
       '-0.11702', '-0.049775', '0.001526', '-0.064243', '-0.060033',
       '0.099257', '-0.11476', '-0.18558', '0.045222', '0.36369',
       '0.078655', '0.0026155', '0.37286', '0.47996', '-0.2032',
       '-0.37088', '0.68873', '-0.50702', '-0.12339', '-0.7473',
       '-0.15392', '0.47928', '-0.51113', '-0.1744', '-0.0010949',
       '-0.17066', '0.13301', '-0.97459', '0.44952', '-0.21822',
       '-0.4853', '0.12379', '-0.12521', '-0.3585', '0.0077671',
       '-0.16394', '0.1331', '-0.018957', '0.078365', '0.15956',
       '0.22007', '0.2083', '-0.45311', '0.062373', '-0.18905', '-0.1078',
       '0.41859', '0.043782', '-0.084046', '-0.023192', '0.20653',
       '-0.13351', '-0.086689', '0.15332', '-0.22299', '-0.2856',
       '0.2663', '0.030985', '0.089068', '0.12446', '-0.10969', '0.2539