In [29]:
import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm
import re
import time
import pickle
pd.set_option('display.max_colwidth', 200)

In [101]:
df = pd.read_csv('train.csv')

In [102]:
df.columns.tolist()

['id', 'label', 'tweet']

In [103]:
df.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!


In [104]:
# The 'id' column carries no signal for the classifier, so discard it.
df = df.drop(columns=['id'])

In [105]:
df.head()

Unnamed: 0,label,tweet
0,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone
1,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/
2,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu
3,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/
4,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!


In [106]:
df = df[['tweet', 'label']]

In [107]:
df.head()

Unnamed: 0,tweet,label
0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone,0
1,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/,0
2,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu,0
3,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/,0
4,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!,1


In [108]:
df.shape

(7920, 2)

In [109]:
df_train = df.copy()

In [110]:
df_test = pd.read_csv('test.csv')

In [111]:
df_test.columns

Index(['id', 'tweet'], dtype='object')

In [112]:
# Keep only the tweet text in the test frame.
df_test = df_test.drop(columns=['id'])

In [113]:
df_test.shape

(1953, 1)

In [114]:
df_train['label'].value_counts(normalize=True)
# 1 represents negative review, 0 ==> positive

0    0.744192
1    0.255808
Name: label, dtype: float64

In [115]:
# Number of training tweets that contain a URL.
len(df_train[df_train['tweet'].str.contains('http')])

4359

In [116]:
## Remove URL Links

def _strip_urls(text):
    """Return ``text`` with every 'http...' run of non-whitespace characters removed."""
    return re.sub(r'http\S+', '', text)


# The same cleaning step is applied to both splits; the result goes into a new column.
for frame in (df_train, df_test):
    frame['clean_tweet'] = frame['tweet'].apply(_strip_urls)

In [117]:
## Remove punctuation

# Characters to strip. The apostrophe, comma, hyphen and period are not in this set,
# so they are kept. The backslash is escaped explicitly ('\\') so the literal
# contains no invalid escape sequence.
punctuation = '!"#$%&()*+_\\/:;<=>?@[]^`{|}~'

# Build the deletion table once instead of rebuilding a set for every character.
_punctuation_table = str.maketrans('', '', punctuation)

df_train['clean_tweet'] = df_train['clean_tweet'].str.translate(_punctuation_table)
df_test['clean_tweet'] = df_test['clean_tweet'].str.translate(_punctuation_table)

In [118]:
## Convert text to lowercase

# Lower-case every cleaned tweet in both splits.
for frame in (df_train, df_test):
    frame['clean_tweet'] = frame['clean_tweet'].apply(str.lower)

In [119]:
## Remove whitespaces

def _collapse_whitespace(text):
    """Trim ``text`` and squeeze every whitespace run down to a single space."""
    words = text.split()
    return " ".join(words)


for frame in (df_train, df_test):
    frame['clean_tweet'] = frame['clean_tweet'].apply(_collapse_whitespace)

In [120]:
## Remove numbers

# regex=True is passed explicitly so "[0-9]" is treated as a character class
# rather than a literal string. Each split is cleaned from its OWN column:
# the test frame must not be overwritten with the training text.
df_train['clean_tweet'] = df_train['clean_tweet'].str.replace(r"[0-9]", "", regex=True)
df_test['clean_tweet'] = df_test['clean_tweet'].str.replace(r"[0-9]", "", regex=True)

In [121]:
## Text Normalization

# Load the "en_core_web_md" spaCy model with the 'parser' and 'ner' components disabled;
# the lemmatization helper defined in this cell only reads token.lemma_.
nlp = spacy.load("en_core_web_md", disable=['parser', "ner"])


def lemmatization(text):
    """Lemmatize every document in ``text`` with the module-level spaCy pipeline ``nlp``.

    Parameters
    ----------
    text : iterable of str
        Documents to normalise (the notebook passes a pandas Series of cleaned tweets).

    Returns
    -------
    list of str
        One string per input document, in input order, built by joining the
        ``lemma_`` of each token with single spaces.
    """
    # nlp.pipe streams the documents through the pipeline in batches instead of
    # invoking the whole pipeline once per document; the resulting lemmas are the same.
    return [" ".join(token.lemma_ for token in doc) for doc in nlp.pipe(text)]

In [122]:
df_train['clean_tweet'] = lemmatization(df_train['clean_tweet'])
df_test['clean_tweet'] = lemmatization(df_test['clean_tweet'])

In [123]:
df_train.head(6)

Unnamed: 0,tweet,label,clean_tweet
0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone,0,fingerprint pregnancy test android app beautiful cute health iger iphoneonly iphonesia iphone
1,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/,0,finally a transparant silicon case thank to -PRON- uncle yay sony xperia s sonyexperias …
2,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu,0,-PRON- love this would -PRON- go talk makememorie unplug relax iphone smartphone wifi connect ...
3,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/,0,-PRON- be wire i know -PRON- be george i be make that way iphone cute daventry home
4,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!,1,what amazing service apple will not even talk to -PRON- about a question i have unless i pay -PRON- . for -PRON- stupid support
5,iPhone software update fucked up my phone big time Stupid iPhones,1,iphone software update fuck up -PRON- phone big time stupid iphone


In [124]:
## TRANSFER LEARNING

In [125]:
# !pip install tensorflow-hub

Collecting tensorflow-hub
[?25l  Downloading https://files.pythonhosted.org/packages/9e/f0/3a3ced04c8359e562f1b91918d9bde797c8a916fcfeddc8dc5d673d1be20/tensorflow_hub-0.3.0-py2.py3-none-any.whl (73kB)
[K    100% |████████████████████████████████| 81kB 1.5MB/s ta 0:00:01
Installing collected packages: tensorflow-hub
Successfully installed tensorflow-hub-0.3.0


In [131]:
import tensorflow_hub as hub
import tensorflow as tf
elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

In [133]:
# just a random sentence

x = ['Rosted ants are a popular snack in Columbia']

## Extract ELMo features
embeddings = elmo(x, signature="default", as_dict=True)["elmo"]

embeddings.shape

TensorShape([Dimension(1), Dimension(8), Dimension(1024)])

The output is a 3 dimensional tensor of shape (1, 8, 1024):

The first dimension of this tensor represents the number of training samples. This is 1 in our case
The second dimension represents the length, in tokens, of the longest string in the input list of strings. Since we have only 1 string in our input list, the size of the 2nd dimension is equal to the number of tokens in that string — 8
The third dimension is equal to the length of the ELMo vector

Hence, **every word in the input sentence has an ELMo vector of size 1024.**

Let’s go ahead and extract ELMo vectors for the cleaned tweets in the train and test datasets. However, to arrive at the vector representation of an entire tweet, we will take the mean of the ELMo vectors of constituent terms or tokens of the tweet.

In [135]:
def elmo_vectors(x):
    """Return one averaged ELMo vector per string in ``x``.

    ``x`` must expose ``.tolist()`` (the notebook passes a pandas Series of
    cleaned tweets). The per-token "elmo" output of the module-level ``elmo``
    hub module is averaged over axis 1 (the token axis), and the evaluated
    result of that reduction is returned.

    A fresh ``tf.Session`` is opened on every call, and the global variables
    and tables are initialised inside it before the reduction is evaluated.
    """
    sentences = x.tolist()
    token_embeddings = elmo(sentences, signature="default", as_dict=True)["elmo"]

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        # average the ELMo features of all tokens in each sentence
        sentence_means = tf.reduce_mean(token_embeddings, 1)
        return session.run(sentence_means)

You might run out of computational resources (memory) if you use the above function to extract embeddings for the tweets in one go. As a workaround, split both train and test set into batches of 100 samples each. Then, pass these batches sequentially to the function elmo_vectors( ).

In [138]:
# Rows per batch handed to elmo_vectors(); the last batch may be shorter.
batch_size = 100

list_train = [
    df_train[start:start + batch_size]
    for start in range(0, len(df_train), batch_size)
]
list_test = [
    df_test[start:start + batch_size]
    for start in range(0, len(df_test), batch_size)
]

Now, we will iterate through these batches and extract the ELMo vectors. Let me warn you, this will take a long time.

In [None]:
# Extract ELMo embeddings, one batch at a time

elmo_train = [elmo_vectors(x['clean_tweet']) for x in list_train]
# The test vectors must be computed from the test batches, not the training ones.
elmo_test = [elmo_vectors(x['clean_tweet']) for x in list_test]

Once we have all the vectors, we can concatenate them back to a single array:

In [None]:
elmo_train_new = np.concatenate(elmo_train, axis = 0) 
elmo_test_new = np.concatenate(elmo_test, axis = 0)

Save these arrays, as it took a long time to compute the ELMo vectors for them. We will save them as pickle files:

In [None]:
# save elmo_train_new
# (the with-block closes the file even if pickle.dump raises)
with open("elmo_train_03272019.pickle", "wb") as pickle_out:
    pickle.dump(elmo_train_new, pickle_out)

# save elmo_test_new
with open("elmo_test_03272019.pickle", "wb") as pickle_out:
    pickle.dump(elmo_test_new, pickle_out)

In [None]:
# Load back the ELMo vectors
# NOTE: pickle.load can execute arbitrary code from the file; only load pickles
# that were written by this notebook.

# load elmo_train_new
with open("elmo_train_03272019.pickle", "rb") as pickle_in:
    elmo_train_new = pickle.load(pickle_in)

# load elmo_test_new
with open("elmo_test_03272019.pickle", "rb") as pickle_in:
    elmo_test_new = pickle.load(pickle_in)

In [None]:
# Model Building with ELMo

from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(elmo_train_new, df_train['label'], random_state=42, test_size=0.2) 

Since our objective is to set a baseline score, we will build a simple logistic regression model using ELMo vectors as features:



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

lreg = LogisticRegression()
lreg.fit(X_train, y_train)

In [None]:
pred_valid = lreg.predict(X_valid)

In [None]:
# Call the metric (rather than rebinding its name to a tuple) to get the
# F1 score on the validation split.
f1_score(y_valid, pred_valid)

In [None]:
# Make predictions on the test set with the fitted logistic regression model

pred_test = lreg.predict(elmo_test_new)

In [None]:
# # prepare submission dataframe
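# # ('id' was dropped from df_test earlier, so it is re-read from test.csv here)
# sub = pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'label': pred_test})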

# # write predictions to a CSV file
# sub.to_csv("sub_lreg.csv", index=False)