In [127]:
import numpy as np
import pandas as pd
from pprint import pprint

import gensim
from gensim.utils import simple_preprocess
from gensim import corpora, models

from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from symspellpy.symspellpy import SymSpell, Verbosity
from sklearn.model_selection import train_test_split

import collections
import torch
import torch.nn as nn
from torch import optim
import random
import time

[nltk_data] Downloading package wordnet to /home/ashwin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load the labelled tweets, discard rows with missing values, and store the
# id and both label columns as 32-bit integers.
df = (
    pd.read_csv('data/labeled_prelim.csv')
    .dropna()
    .astype({'Id': np.int32, 'Relevancy': np.int32, 'Urgency': np.int32})
)
df.head()

Unnamed: 0,Id,Text,Relevancy,Urgency
0,247434,More millions in #Afghanistan even with ZERO a...,0,0
1,294115,These are the last post my brother made on soc...,2,1
2,24622,In @cityofcc listening to local officials abou...,0,0
3,37807,So so so damn proud of @5ugarcane who is tirel...,3,0
4,37386,How can you help with #Harvey disaster respons...,0,0


In [9]:
def create_symspell(max_edit_distance, prefix_length, freq_file_path):
    """Build a SymSpell object and populate it from a file.

    Args:
        max_edit_distance: maximum dictionary edit distance given to SymSpell.
        prefix_length: prefix length given to SymSpell.
        freq_file_path: path handed to ``SymSpell.create_dictionary``.

    Returns:
        The populated ``SymSpell`` object, or ``None`` (after printing a
        message) when ``create_dictionary`` reports failure.
    """
    # Pass the settings by keyword: some symspellpy releases take
    # ``initial_capacity`` as the first positional parameter, so positional
    # arguments could end up bound to the wrong settings.
    sym_spell = SymSpell(max_dictionary_edit_distance=max_edit_distance,
                         prefix_length=prefix_length)

    # NOTE(review): create_dictionary expects a plain-text corpus; a
    # "term count" frequency file is normally read with load_dictionary --
    # confirm which kind of file callers pass here.
    if not sym_spell.create_dictionary(freq_file_path):
        print("Corpus file not found")
        return None

    return sym_spell

def process_tweet(tweet, tknzr, sym_spell):
    """Turn a raw tweet into a space-separated string of cleaned tokens.

    The tweet is tokenized with ``tknzr.tokenize``; 'RT' tokens are dropped,
    a leading '#' is stripped, and tokens starting with 'http' become 'link'.
    Only purely alphabetic tokens that are not stop words are kept, lowercased
    and lemmatized as verbs.

    ``sym_spell`` is accepted but not used: no word segmentation or spell
    correction is performed.
    """
    def _normalize(token):
        # hashtag -> bare word
        if token[0] == '#':
            return token[1:]
        # any URL -> the LINK keyword
        if token.startswith('http'):
            return 'link'
        return token

    # drop the retweet annotation, normalize everything else
    cleaned = [_normalize(tok) for tok in tknzr.tokenize(tweet) if tok != 'RT']

    # keep alphabetic, non-stop-word tokens in lowercase
    kept = []
    for tok in cleaned:
        if not tok.isalpha():
            continue
        lowered = tok.lower()
        if lowered in stop_words:
            continue
        kept.append(lowered)

    # lemmatization (converts all words to root form for standardization)
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(word, pos='v') for word in kept)

In [10]:
# Spell correction is currently disabled: the SymSpell object is not built, and
# the cells that call process_tweet pass None in its place.
#sym_spell = create_symspell(2,7,'data/frequency_dictionary_en_82_765.txt')
# Tweet-aware tokenizer shared by every process_tweet call; strip_handles drops
# @user mentions and reduce_len shortens long runs of a repeated character.
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)

In [11]:
# Preview the preprocessing on the first remaining tweet.  Positional access
# (.iloc) is used because the frame went through dropna(): a label lookup of 0
# raises KeyError if the row labelled 0 was one of the dropped rows.
doc_sample = df['Text'].iloc[0]
process_tweet(doc_sample, tknzr, None)

'millions afghanistan even zero attack isis sympathizers invest texas nation build harvey texasflood'

In [6]:
#list of embeddings
vec_length = 25
# number of rows preallocated for the embedding table
vocab_size = 1193514
embeddings = np.zeros((vocab_size, vec_length))

#two-way map, index->word and word->index
glove = {}

index = 0
glove_path = 'data/glove.twitter.27B/glove.twitter.27B.%dd.txt' % vec_length
# Read explicitly as UTF-8 so decoding does not depend on the platform's
# default locale encoding.
with open(glove_path, encoding='utf-8') as f:
    for line_no, l in enumerate(f, start=1):
        line = l.split()
        # a valid row is the word followed by exactly vec_length numbers
        if len(line) != vec_length + 1:
            print('skipping malformed line %d (%d fields)' % (line_no, len(line)))
            continue

        # never write past the preallocated table
        if index >= vocab_size:
            print('embedding table full after %d rows; stopping at line %d'
                  % (index, line_no))
            break

        try:
            # np.float was removed from NumPy; np.float64 is the same dtype
            vector = np.array(line[1:]).astype(np.float64)
        except ValueError:
            # report the unparsable row and keep loading the rest
            print('skipping unparsable line %d: %r' % (line_no, line))
            continue

        word = line[0]
        embeddings[index] = vector
        glove[index] = word
        glove[word] = index
        index += 1

empty line


In [13]:
#sanity checks
# Second-to-last preallocated row of the embedding table; a non-zero vector
# here suggests the loader filled the table (nearly) to the end.
print(embeddings[1193512])
# glove is a two-way map, so index 74 -> word and 'if' -> index should agree.
print(glove[74])
print(glove['if'])
# Embedding vector stored at index 74.
print(embeddings[74])

[-2.5807   -1.0965   -0.59056   1.1178   -0.30615  -0.44198  -1.377
 -2.3494    2.0436   -0.15692   2.6962    1.033     0.81358  -1.7224
  0.066939 -0.71714   1.0608   -0.43463   2.1178    0.65876   0.62825
 -1.2018    1.7123    0.79867   0.32424 ]
if
74
[ 0.18243  0.70534 -0.34209 -0.10779 -0.72721 -0.58802  1.7457  -0.13666
 -0.61576  0.15336 -0.19019  0.70282 -5.725   -0.20901 -0.33692  0.16916
  0.35872 -0.9871   0.45495 -0.36607  0.62973  0.11066  0.31315  0.08787
 -0.88679]


In [14]:
# Pull the tweet text and the two label columns out of the frame as numpy arrays.
text, relevancy, urgency = (
    df[column].values for column in ('Text', 'Relevancy', 'Urgency')
)

In [142]:
class RelevancyClassifier(nn.Module):
    """Feed-forward classifier: Linear -> LeakyReLU -> Linear.

    Args:
        index: accepted for call compatibility; not used by the model.
        embeddings: accepted for call compatibility; not used by the model.
        embed_len: size of each input vector.
        num_classes: number of output scores per input.
        hidden_size: width of the hidden layer (defaults to 30).
    """

    def __init__(self, index, embeddings, embed_len, num_classes, hidden_size=30):
        super(RelevancyClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.embed_len = embed_len
        self.fc1 = nn.Linear(embed_len, self.hidden_size)
        self.nl = nn.LeakyReLU()
        self.fc2 = nn.Linear(self.hidden_size, num_classes)

    def forward(self, x):
        """Return unnormalized class scores for a batch of input vectors."""
        return self.fc2(self.nl(self.fc1(x)))


def _tweet_vector(tweet, embeddings, index):
    """Average the embeddings of a tweet's in-vocabulary tokens.

    Returns a float tensor of shape (1, embedding_dim), or None when none of
    the whitespace-separated tokens is a key of ``index``.
    """
    vectors = [embeddings[index[tok]] for tok in tweet.split() if tok in index]
    if not vectors:
        return None
    mean_vec = np.asarray(np.mean(vectors, axis=0)).reshape(1, -1)
    return torch.from_numpy(mean_vec).float()


def train_relevancy_classifier(train_exs, train_labels, embeddings, index,
                               epochs=10, lr=.0001):
    """Train a binary RelevancyClassifier on averaged word embeddings.

    Each tweet is represented by the mean embedding of its tokens found in
    ``index``; tweets with no such token are left out of training.  Labels
    greater than 0 are collapsed to class 1, everything else is class 0.  The
    caller's label array is not modified.

    Args:
        train_exs: sequence of preprocessed, space-separated tweets.
        train_labels: sequence of integer labels aligned with ``train_exs``.
        embeddings: 2-D table of embedding vectors, indexable by row.
        index: mapping from token to its row in ``embeddings``.
        epochs: number of passes over the training data.
        lr: Adam learning rate.

    Returns:
        The trained model.  If training is interrupted with Ctrl-C, the
        partially trained model is returned instead.
    """
    num_classes = 2
    # Built before the try block so the interrupt handler always has a model
    # to return.
    rc = RelevancyClassifier(index, embeddings, len(embeddings[0]), num_classes)
    optimizer = optim.Adam(rc.parameters(), lr=lr)
    loss = nn.CrossEntropyLoss()

    # The tweet vectors do not change between epochs, so build them once.
    examples = []
    for tweet, label in zip(train_exs, train_labels):
        x = _tweet_vector(tweet, embeddings, index)
        if x is None:
            continue
        y = torch.tensor([1 if label > 0 else 0], dtype=torch.long)
        examples.append((x, y))

    try:
        for epoch in range(epochs):
            random.shuffle(examples)
            total_loss = 0.0
            for x, y in examples:
                rc.zero_grad()
                logits = rc(x)
                cur_loss = loss(logits, y)
                # .item() keeps a plain float instead of a tensor that holds
                # on to its autograd graph
                total_loss += cur_loss.item()
                cur_loss.backward()
                optimizer.step()
            # no usable tweets -> report nan rather than dividing by zero
            avg_loss = total_loss / len(examples) if examples else float('nan')
            print("Avg loss on epoch %i: %f" % (epoch, avg_loss))
    except KeyboardInterrupt:
        # manual stop: fall through and hand back what has been trained so far
        pass
    return rc

In [154]:
# Run every tweet through process_tweet (spell checker disabled -> None).
processed_tweets = df['Text'].apply(process_tweet, args=(tknzr, None))

In [155]:
# Hold out 33% of the tweets for validation.  random_state pins the shuffle so
# the split, and the scores computed from it, are the same on every run.
# NOTE(review): the labels used here are `urgency`, while the model trained on
# this split is named a relevancy classifier -- confirm the intended target.
X_train, X_test, y_train, y_test = train_test_split(
    processed_tweets.values, urgency, test_size=0.33, random_state=42)

In [156]:
# Fit the classifier on the training split, using the GloVe table and its
# word -> row lookup.
model = train_relevancy_classifier(
    train_exs=X_train,
    train_labels=y_train,
    embeddings=embeddings,
    index=glove,
)

<class 'numpy.ndarray'>
Avg loss on epoch 0: 0.475863
Avg loss on epoch 1: 0.323312
Avg loss on epoch 2: 0.310519
Avg loss on epoch 3: 0.305674
Avg loss on epoch 4: 0.301835
Avg loss on epoch 5: 0.298196
Avg loss on epoch 6: 0.295057
Avg loss on epoch 7: 0.291967
Avg loss on epoch 8: 0.289537
Avg loss on epoch 9: 0.288159


In [157]:
# Validation accuracy of the trained model on the held-out tweets.
val_tweets = X_test
# Binarize into a new array (label > 0 -> 1) so y_test itself is left unchanged.
val_labels = (np.asarray(y_test) > 0).astype(np.int64)
num_correct = 0
# Inference only: no gradients are needed.
with torch.no_grad():
    for cur_tweet, cur_label in zip(val_tweets, val_labels):
        cur_embed = [embeddings[glove[tok]] for tok in cur_tweet.split() if tok in glove]
        if len(cur_embed) == 0:
            # No known token: the tweet cannot be scored.  It stays in the
            # denominator below, so it counts as misclassified.
            continue
        x = torch.from_numpy(np.asarray(np.mean(cur_embed, axis=0)).reshape(1, -1)).float()
        logits = model(x)
        if int(logits.argmax(dim=1).item()) == cur_label:
            num_correct += 1
print(num_correct/len(val_tweets))

0.8922764227642277
