<a href="https://colab.research.google.com/github/sahug/ds-bert/blob/main/ELMo%20NLP%20-%20Extracting%20Features%20from%20Text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**ELMo NLP - Extracting Features from Text**

**Dataset**

Here’s a breakdown of the dataset we have:

The train set contains 7,920 tweets

The test set contains 1,953 tweets


In [1]:
#Import Libraries

import pandas as pd
import numpy as np
import spacy

from tqdm import tqdm

import re
import time
import pickle

pd.set_option("display.max_colwidth", 200)

In [2]:
#Read and Inspect Data

train = pd.read_csv("/content/sample_data/train_2kmZucJ.csv")
test = pd.read_csv("/content/sample_data/test_oJQbWVk.csv")

train.shape, test.shape

((7920, 3), (1953, 2))

In [3]:
train.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!


In [4]:
#1 represents a negative tweet while 0 represents a non-negative tweet.
train["label"].value_counts(normalize = True)

0    0.744192
1    0.255808
Name: label, dtype: float64

In [5]:
#Data Preprocessing

#Remove URLs
train["clean_tweet"] = train["tweet"].apply(lambda x: re.sub("http\S+", "", x))
test["clean_tweet"] = test["tweet"].apply(lambda x: re.sub("http\S+", "", x))

#Remove Puntuations
puntuation = '!"#$%&()*+-/:;<=>?@[\\]^_`{|}~'

train["clean_tweet"] = train["clean_tweet"].apply(lambda x: "".join(ch for ch in x if ch not in set(puntuation)))
test["clean_tweet"] = test["clean_tweet"].apply(lambda x: "".join(ch for ch in x if ch not in set(puntuation)))

#Convert text To Lowercase
train["clean_tweet"] = train["clean_tweet"].str.lower()
test["clean_tweet"] = test["clean_tweet"].str.lower()

#Remove Numbers
train["clean_tweet"] = train["clean_tweet"].str.replace("[0-9]", " ")
test["clean_tweet"] = test["clean_tweet"].str.replace("[0-9]", " ")

#Remove Whitespace
train['clean_tweet'] = train['clean_tweet'].apply(lambda x:' '.join(x.split()))
test['clean_tweet'] = test['clean_tweet'].apply(lambda x: ' '.join(x.split()))



In [6]:
#Lemmatization -  This helps in reducing a word to its base form. For example, the base form of the words ‘produces’, ‘production’, and ‘producing’ is ‘product’

#import spaCy's language model

nlp = spacy.load("en", disable=["parser", "ner"])

#Funtion to lemmentize test
def lemmantization(texts):
  output = []
  for i in texts:
    s = [token.lemma_ for token in nlp(i)]
    output.append(" ".join(s))
  return output

In [7]:
#Lemmantize Tweets

train["clean_tweet"] = lemmantization(train["clean_tweet"])
test["clean_tweet"] = lemmantization(test["clean_tweet"])

In [8]:
train.head()

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone,fingerprint pregnancy test android apps beautiful cute health iger iphoneonly iphonesia iphone
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/,finally a transparant silicon case thank to -PRON- uncle yay sony xperia s sonyexperias …
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu,-PRON- love this would -PRON- go talk makememorie unplug relax iphone smartphone wifi connect ...
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/,-PRON- be wire i know -PRON- be george i be make that way iphone cute daventry home
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!,what amazing service apple will not even talk to -PRON- about a question i have unless i pay -PRON- . for -PRON- stupid support


**Pretrained Models from Tensorhub**

In [None]:
!pip install tensorflow==1.15

In [None]:
!pip install tensorflow-hub

**Preparing ELMo Vectors**

In [11]:
import tensorflow_hub as hub
import tensorflow as tf

elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

In [13]:
#Testing with a random sentence

x = ["Roasted ants are popular snack in Columbia"]

#Extract ELMo Features
embeddings = elmo(x, signature="default", as_dict=True)["elmo"]

#The first dimension of this tensor represents the number of training samples. This is 1 in our case
#The second dimension represents the maximum length of the longest string in the input list of strings. Since we have only 1 string in our input list, the size of the 2nd dimension is equal to the length of the string – 8
#The third dimension is equal to the length of the ELMo vector

embeddings.shape

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


TensorShape([Dimension(1), Dimension(7), Dimension(1024)])

In [14]:
def elmo_vectors(x):
  embeddings = elmo(x.tolist(), signature="default", as_dict=True)["elmo"]

  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())

    #Return average of ELMo features
    return sess.run(tf.reduce_mean(embeddings, 1))

In [15]:
list_train = [train[i:i+100] for i in range(0,train.shape[0],100)]
list_test = [test[i:i+100] for i in range(0,test.shape[0],100)]

In [None]:
# Extract ELMo embeddings
elmo_train = [elmo_vectors(x['clean_tweet']) for x in list_train]
elmo_test = [elmo_vectors(x['clean_tweet']) for x in list_test]

In [17]:
elmo_train_new = np.concatenate(elmo_train, axis = 0)
elmo_test_new = np.concatenate(elmo_test, axis = 0)

In [18]:
# Save elmo_train_new
pickle_out = open("elmo_train_03032019.pickle","wb")
pickle.dump(elmo_train_new, pickle_out)
pickle_out.close()

# Save elmo_test_new
pickle_out = open("elmo_test_03032019.pickle","wb")
pickle.dump(elmo_test_new, pickle_out)
pickle_out.close()

In [19]:
# Load elmo_train_new
pickle_in = open("elmo_train_03032019.pickle", "rb")
elmo_train_new = pickle.load(pickle_in)

# Load elmo_train_new
pickle_in = open("elmo_test_03032019.pickle", "rb")
elmo_test_new = pickle.load(pickle_in)

**Model**

In [20]:
from sklearn.model_selection import train_test_split

xtrain, xvalid, ytrain, yvalid = train_test_split(elmo_train_new, 
                                                  train['label'],  
                                                  random_state=42, 
                                                  test_size=0.2)

In [28]:
xtrain

array([[ 0.10796256, -0.15025751,  0.08917782, ..., -0.02896453,
         0.21496482,  0.13388969],
       [ 0.09576906, -0.04906286, -0.00841576, ..., -0.0099304 ,
         0.12691543,  0.0287101 ],
       [-0.0070042 , -0.15437905, -0.01979056, ..., -0.11072069,
         0.292789  ,  0.00396855],
       ...,
       [-0.01730759,  0.04019286,  0.02190867, ..., -0.08021908,
        -0.05449635, -0.0276353 ],
       [ 0.09427078,  0.01278635, -0.02669737, ...,  0.05523895,
         0.03871485,  0.06402151],
       [-0.09971473, -0.08113606,  0.05070456, ...,  0.09827308,
         0.09978132,  0.02583575]], dtype=float32)

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

lreg = LogisticRegression(solver='lbfgs', max_iter=100)
lreg.fit(xtrain, ytrain)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [23]:
preds_valid = lreg.predict(xvalid)

In [24]:
f1_score(yvalid, preds_valid)

0.7625899280575541

In [25]:
# Make predictions on test set
preds_test = lreg.predict(elmo_test_new)

In [26]:
preds_test

array([1, 1, 1, ..., 0, 1, 0])