In [29]:
import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import re
import os
import time
import tensorflow_hub as hub

ModuleNotFoundError: No module named 'tensorflow_hub'

In [2]:
stop_words = set(stopwords.words('english'))

In [4]:
dictionary = []

In [5]:
embeddings = {}

In [6]:
df = pd.read_csv("data/dataset/train-balanced-sarcasm.csv")

In [7]:
df_new = df[['parent_comment','comment','label']]

In [8]:
df_new = df_new.sample(100000)

In [10]:
df_new.shape

(100000, 3)

In [11]:
df_new.head()

Unnamed: 0,parent_comment,comment,label
782258,Dear Libertarians I wish that you were all dea...,"Cool story, bro.",0
617258,I am a person who is unhappy with Starbound. I...,That and outside of the main quest there is ab...,0
646561,"So I calculated that on a website im 5' 11"" an...",Not a lot more but a small surplus of a few hu...,0
157433,Socialists who argue that one need only look a...,"No, no, no, we just didn't try hard enough!",1
298149,"It's written in stone. Don't blame ""people lik...","Yeah, well, we are working on carving a new me...",0


In [12]:
def remove_stopwords(tokens):
    tokens_wo_stopwords = []
    for i in range(0,len(tokens)):
        if tokens[i].lower() not in stop_words:
            tokens_wo_stopwords.append(tokens[i].lower())
    return tokens_wo_stopwords

In [13]:
def get_pos_tag(token):
    pos_tag = nltk.pos_tag([token])[0][1]
    if pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [14]:
def lemmatize(tokens):
    lemmatizer = WordNetLemmatizer()
    for i in range(0,len(tokens)):
        tokens[i] = lemmatizer.lemmatize(tokens[i],pos=str(get_pos_tag(tokens[i])))
    return tokens

In [15]:
def add_to_dictionary(tokens):
    for token in tokens:
        if token not in dictionary:
            dictionary.append(token)

In [16]:
def save_dictionary():
    with open('data/processed/dictionary.txt','w') as file:
        file.writelines("%s\n" % word for word in dictionary)

In [17]:
def read_dictionary():
    with open('data/processed/dictionary.txt','r') as file:
        temp = file.read().splitlines()
        for i in range(0,len(temp)):
            dictionary.append(temp[i])

In [18]:
def preprocess(sentence):
    processed_sentence = re.sub(r'[^a-zA-Z]', ' ', sentence)
    tokens_comment = word_tokenize(processed_sentence)
    tokens_comment = remove_stopwords(tokens_comment)
    tokens_comment = lemmatize(tokens_comment)
    return tokens_comment

In [19]:
def create_dictionary(dataset):
    for index,row in dataset.iterrows():
        tokens_comment = preprocess(str(row['parent_comment']) + " " + str(row['comment']))
        add_to_dictionary(tokens_comment)
    save_dictionary()

In [20]:
def populate_embeddings_dict():
    starttime = time.time()
    with open('data/processed/glove.6B.300d.txt','r') as file:
        for line in file:
            values = line.split()
            word = values[0]
            word_embedding = np.asarray(values[1:])
            embeddings[word] = word_embedding
    endtime = time.time()
    print("Time taken to load embeddings:- ")
    print(endtime - starttime)

In [21]:
def embedding_lookup(x,embedding_dim=300):
    if(len(embeddings) == 0):
        populate_embeddings_dict()
    embedding = []
    for i in range(0,len(x)):
        if(x[i] in embeddings):
            embedding.append(embeddings[x[i]])
        else:
            zero_arr = np.zeros(embedding_dim).tolist()
            embedding.append(zero_arr)
    embedding = np.array(embedding)
    return embedding

In [22]:
if not os.path.isfile('data/processed/dictionary.txt'):
    starttime = time.time()
    create_dictionary(df_new)
    endtime = time.time()
    print("Time to create dictionary")
    print(endtime - starttime)
else:   
    read_dictionary()

In [23]:
len(dictionary)

67028

In [24]:
embeddings = embedding_lookup(preprocess(df_new['comment'].iloc[0]))

Time taken to load embeddings:- 
35.096932888031006


In [25]:
embeddings

array([['0.44874', '-0.4454', '-0.20424', '-0.15572', '-0.17863',
        '-0.17937', '0.16693', '0.19144', '0.1905', '-1.2896', '0.18248',
        '-0.21426', '0.085121', '-0.0058638', '-0.026263', '0.0016149',
        '-0.36555', '-0.28236', '0.17898', '0.7489', '0.32092',
        '0.74883', '0.081232', '-0.058681', '-0.20367', '-0.38068',
        '0.18391', '0.09356', '-0.06925', '-0.47984', '-0.49404',
        '0.058204', '-0.4334', '-0.26631', '-0.99599', '0.29223',
        '-0.023785', '0.18912', '-0.36762', '0.34751', '0.048832',
        '0.036862', '-0.35062', '0.10833', '0.4946', '-0.039863',
        '0.39128', '-0.020769', '-0.12705', '0.046382', '-0.21406',
        '-0.59181', '0.21041', '-0.56966', '-0.24769', '0.38859',
        '-0.28034', '0.034968', '0.18841', '0.024812', '0.3951',
        '0.35182', '0.3994', '0.16125', '-0.44957', '0.24849', '0.23305',
        '0.45207', '-0.06272', '-0.17149', '-0.17172', '0.068708',
        '0.11551', '0.25054', '-0.28774', '-0.16684

In [26]:
preprocess(df_new['comment'].iloc[0])

['cool', 'story', 'bro']

In [None]:
elmo = hub.Module("https://tfhub.dev/google/elmo/2",trainable=True)

In [None]:
def get_elmo_embeddings(tokens):
    embeddings = elmo(tokens,signature='default',as_dict=True)["elmo"]
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        return sess.run(tf.reduce_mean(embeddings,1))