Importing Libraries



In [1]:
#Importing Libraries
import re
import time
import pickle
import spacy
import numpy as np
import pandas as pd
from tqdm import tqdm

Read and Inspect Data

In [2]:
#read data
train_dataset = pd.read_csv("/content/train.csv")
test_dataset = pd.read_csv("/content/test.csv")

In [5]:
#checkng the shape of train and test dataset
train_dataset.shape, test_dataset.shape

((7920, 3), (1953, 2))

The train set has 7,920 tweets while the test set has only 1,953.

In [7]:
#checking the class distribution in the train set
train_dataset['label'].value_counts(normalize = True)

0    0.744192
1    0.255808
Name: label, dtype: float64

In [9]:
#first five rows of the dataset
train_dataset.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


Text Cleaning and Preprocessing

In [11]:
# remove URL's from train and test
train_dataset['clean_tweet'] = train_dataset['tweet'].apply(lambda x: re.sub(r'http\S+', '', x))
test_dataset['clean_tweet'] = test_dataset['tweet'].apply(lambda x: re.sub(r'http\S+', '', x))

In [13]:
# remove punctuation marks
punctuation = '!"#$%&()*+-/:;<=>?@[\\]^_`{|}~'

train_dataset['clean_tweet'] = train_dataset['clean_tweet'].apply(lambda x: ''.join(ch for ch in x if ch not in set(punctuation)))
test_dataset['clean_tweet'] = test_dataset['clean_tweet'].apply(lambda x: ''.join(ch for ch in x if ch not in set(punctuation)))

In [15]:
# convert text to lowercase
train_dataset['clean_tweet'] = train_dataset['clean_tweet'].str.lower()
test_dataset['clean_tweet'] = test_dataset['clean_tweet'].str.lower()

In [16]:
#remove numbers
train_dataset['clean_tweet'] = train_dataset['clean_tweet'].str.replace("[0-9]"," ")
test_dataset['clean_tweet'] = test_dataset['clean_tweet'].str.replace("[0-9]"," ")

  train_dataset['clean_tweet'] = train_dataset['clean_tweet'].str.replace("[0-9]"," ")
  test_dataset['clean_tweet'] = test_dataset['clean_tweet'].str.replace("[0-9]"," ")


In [17]:
#remove whitespace
train_dataset['clean_tweet'] = train_dataset['clean_tweet'].apply(lambda x: ' '.join(x.split()))
test_dataset['clean_tweet'] = test_dataset['clean_tweet'].apply(lambda x: ' '.join(x.split()))

Text Normalization

In [19]:
# import spaCy's language model
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# function to lemmatize text
def lemmatization(texts):
    output = []
    for i in texts:
        s = [token.lemma_ for token in nlp(i)]
        output.append(' '.join(s))
    return output

In [20]:
train_dataset['clean_tweet'] = lemmatization(train_dataset['clean_tweet'])
test_dataset['clean_tweet'] = lemmatization(test_dataset['clean_tweet'])

In [21]:
train_dataset.sample(10)

Unnamed: 0,id,label,tweet,clean_tweet
7288,7289,0,It's beautiful ... #samsung #galaxy #note #tab...,it be beautiful ... samsung galaxy note tablet...
6778,6779,1,@oliyoung but I'm still #hating on them for no...,oliyoung but I be still hate on they for no gt...
4677,4678,1,"Dear, #Apple. You've made me reset my password...","dear , apple . you 've make I reset my passwor..."
7066,7067,1,You #suck #apple . All this hype for the #Beat...,you suck apple . all this hype for the beatle ...
1491,1492,0,Posing Curiously..!! #cute #angel #cutiepie #g...,pose curiously .. cute angel cutiepie girlnext...
4887,4888,1,"No seriously, I don't mind a report I finished...","no seriously , I do not mind a report I finish..."
3022,3023,0,Don't kiss girl without permission https://you...,do not kiss girl without permission via youtub...
3141,3142,1,Apple better replace my piece of $&@*# phone c...,apple well replace my piece of phone cuz I wan...
4575,4576,0,Delicious fresh apple juice. . #goodmorning #m...,delicious fresh apple juice . . goodmorning mo...
7073,7074,0,$&@*# yeah #gradpresent #apple #mac #book #pro...,yeah gradpresent apple mac book pro excited fu...


**Installing Tensorflow Hub**

In [22]:
!pip install "tensorflow>=1.7.0"



In [23]:
!pip install tensorflow-hub



Preparing ELMo Vectors

In [46]:
import tensorflow_hub as hub
import tensorflow as tf

In [47]:
elmo = hub.load("https://tfhub.dev/google/elmo/3")

Elmo vectors for Random sentences

In [48]:
# Define your input text as a list of strings
input_text = ["This is a sample sentence.", "ELMo embeddings are powerful."]

In [49]:
# Get ELMo embeddings
elmo_result = elmo.signatures["default"](tf.constant(input_text))["elmo"]

In [51]:
elmo_result.shape

TensorShape([2, 5, 1024])

In [50]:
# Create a TensorFlow session
with tf.compat.v1.Session() as session:
    session.run(tf.compat.v1.global_variables_initializer())
    session.run(tf.compat.v1.tables_initializer())

    # Run the session to obtain ELMo embeddings
    embeddings_result = session.run(elmo_result)

Creating function for ELMO-Vectors

In [52]:
def elmo_vectors(x):
  elmo_result = elmo.signatures["default"](tf.constant(input_text))["elmo"]
  # Create a TensorFlow session
  with tf.compat.v1.Session() as session:
    session.run(tf.compat.v1.global_variables_initializer())
    session.run(tf.compat.v1.tables_initializer())

    # Run the session to obtain ELMo embeddings
    embeddings_result = session.run(elmo_result)
    return embeddings_result

Extract ELMO embeddings

In [53]:
#spliting both train and test set into batches of 100 samples
list_train = [train_dataset[i:i+100] for i in range(0,train_dataset.shape[0],100)]
list_test = [test_dataset[i:i+100] for i in range(0,test_dataset.shape[0],100)]

In [54]:
# Extract ELMo embeddings
elmo_train = [elmo_vectors(x['clean_tweet']) for x in list_train]
elmo_test = [elmo_vectors(x['clean_tweet']) for x in list_test]

In [55]:
# concatenating vectors back to a single array
elmo_train_new = np.concatenate(elmo_train, axis = 0)
elmo_test_new = np.concatenate(elmo_test, axis = 0)

In [56]:
elmo_train_new

array([[[-0.5142067 , -0.46363446,  0.06954876, ..., -0.5121533 ,
         -0.07905366, -0.04618377],
        [-0.27039832, -0.51222134,  0.11160287, ..., -0.41459325,
          0.44053242,  0.59563017],
        [-0.18552758,  0.07125243,  0.41650227, ...,  0.4526219 ,
         -0.11935171, -0.23624256],
        [-0.49269325, -0.2930713 ,  1.0139275 , ...,  0.33318982,
          0.11209002, -0.02819705],
        [-0.25487682,  0.26428893, -0.16695958, ..., -0.11661101,
          0.19026297,  0.98159707]],

       [[ 0.22764051, -0.0571156 ,  0.10660114, ..., -0.30185682,
         -0.8543476 , -0.10723631],
        [-0.38947773,  1.1483682 ,  0.02387787, ...,  0.31108117,
          0.08831009, -0.07550527],
        [ 0.15515377,  0.15371701, -0.16215982, ..., -0.11710428,
          0.28011465,  0.01975572],
        [-0.0251216 ,  0.31629717, -0.40936878, ...,  0.14916626,
         -0.0158485 ,  0.1849309 ],
        [-0.0284084 , -0.04353216,  0.04130162, ...,  0.02583168,
         -0.01

In [57]:
elmo_test_new

array([[[-0.5142067 , -0.46363446,  0.06954876, ..., -0.5121533 ,
         -0.07905366, -0.04618377],
        [-0.27039832, -0.51222134,  0.11160287, ..., -0.41459325,
          0.44053242,  0.59563017],
        [-0.18552758,  0.07125243,  0.41650227, ...,  0.4526219 ,
         -0.11935171, -0.23624256],
        [-0.49269325, -0.2930713 ,  1.0139275 , ...,  0.33318982,
          0.11209002, -0.02819705],
        [-0.25487682,  0.26428893, -0.16695958, ..., -0.11661101,
          0.19026297,  0.98159707]],

       [[ 0.22764051, -0.0571156 ,  0.10660114, ..., -0.30185682,
         -0.8543476 , -0.10723631],
        [-0.38947773,  1.1483682 ,  0.02387787, ...,  0.31108117,
          0.08831009, -0.07550527],
        [ 0.15515377,  0.15371701, -0.16215982, ..., -0.11710428,
          0.28011465,  0.01975572],
        [-0.0251216 ,  0.31629717, -0.40936878, ...,  0.14916626,
         -0.0158485 ,  0.1849309 ],
        [-0.0284084 , -0.04353216,  0.04130162, ...,  0.02583168,
         -0.01