# Sentiment analysis using ELMo as feature extractor

## import dependencies

In [None]:
import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm
import re
import time
import pickle
from sklearn.model_selection import train_test_split
import tensorflow_hub as hub
import tensorflow as tf
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score
from sklearn.preprocessing import LabelEncoder

In [None]:
# Display the installed TensorFlow version so the run can be reproduced.
tf.__version__

'2.12.0'

## import dataset
We are going to use the Twitter US Airline Sentiment dataset for sentiment analysis.

https://www.kaggle.com/datasets/crowdflower/twitter-airline-sentiment

In [None]:
# Read the airline tweets CSV and keep only the tweet text and its sentiment.
tweets_url = r'https://raw.githubusercontent.com/satyajeetkrjha/kaggle-Twitter-US-Airline-Sentiment-/master/Tweets.csv'
wanted_columns = ['text', 'airline_sentiment']

all_tweets = pd.read_csv(tweets_url)
df = all_tweets[wanted_columns]
df.columns = ['text', 'airline_sentiment']

# Number of rows loaded
len(df)

14640

In [None]:
# Work on the first 1000 tweets only (presumably to keep ELMo feature
# extraction cheap). .copy() makes this an independent DataFrame rather than a
# slice of the full frame, so the column assignments in later cells no longer
# trigger pandas' SettingWithCopyWarning.
df = df.head(1000).copy()

In [None]:
# Map the sentiment strings to integer class ids. The fitted encoder is kept
# in `labelencoder` so ids can be translated back to names later.
labelencoder = LabelEncoder()
labelencoder.fit(df['airline_sentiment'])
df['label'] = labelencoder.transform(df['airline_sentiment'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = labelencoder.fit_transform(df['airline_sentiment'])


## data preprocessing
Only minor preprocessing is needed, to help the model learn faster on this specific task.

In [None]:
# Characters stripped from every tweet (currently only the '@' of user mentions).
punctuation = '@'

# Clean the raw tweets with vectorised pandas string methods in one chain.
# The deletion table is built once, instead of rebuilding a set of the
# unwanted characters for every character of every tweet.
df['clean_text'] = (
    df['text']
    .str.replace(r'http\S+', '', regex=True)            # remove URLs
    .str.translate(str.maketrans('', '', punctuation))  # remove unwanted characters
    .str.lower()                                        # convert text to lowercase
    .str.split()                                        # split on any run of whitespace ...
    .str.join(' ')                                      # ... and re-join with single spaces
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_text'] = df['text'].apply(lambda x: re.sub(r'http\S+', '', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_text'] = df['clean_text'].apply(lambda x: ''.join(ch for ch in x if ch not in set(punctuation)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_text'] = df['cl

In [None]:
# Preview the first ten rows to compare the raw text with the cleaned text.
df.head(10)

Unnamed: 0,text,airline_sentiment,label,clean_text
0,@VirginAmerica What @dhepburn said.,neutral,1,virginamerica what dhepburn said.
1,@VirginAmerica plus you've added commercials t...,positive,2,virginamerica plus you've added commercials to...
2,@VirginAmerica I didn't today... Must mean I n...,neutral,1,virginamerica i didn't today... must mean i ne...
3,@VirginAmerica it's really aggressive to blast...,negative,0,virginamerica it's really aggressive to blast ...
4,@VirginAmerica and it's a really big bad thing...,negative,0,virginamerica and it's a really big bad thing ...
5,@VirginAmerica seriously would pay $30 a fligh...,negative,0,virginamerica seriously would pay $30 a flight...
6,"@VirginAmerica yes, nearly every time I fly VX...",positive,2,"virginamerica yes, nearly every time i fly vx ..."
7,@VirginAmerica Really missed a prime opportuni...,neutral,1,virginamerica really missed a prime opportunit...
8,"@virginamerica Well, I didn't…but NOW I DO! :-D",positive,2,"virginamerica well, i didn't…but now i do! :-d"
9,"@VirginAmerica it was amazing, and arrived an ...",positive,2,"virginamerica it was amazing, and arrived an h..."


## load pre-trained model

In [None]:
# Load the pre-trained ELMo module from TensorFlow Hub.
elmo_module_url = "https://tfhub.dev/google/elmo/3"
elmo = hub.load(elmo_module_url)

In [None]:
# Sanity check: embed the first five cleaned tweets before processing everything
x = df['clean_text'].head().tolist()

# Feed the batch of raw strings to the module's "default" signature and keep
# its "elmo" output
sample_batch = tf.constant(x)
sample_outputs = elmo.signatures["default"](sample_batch)
embeddings = sample_outputs["elmo"]

# Shape of the embedding tensor for the sample batch
embeddings.shape

TensorShape([5, 17, 1024])

In [None]:
# Show the first document next to its embedding matrix.
# NOTE(review): the identical trailing rows in the saved output look like
# padding positions, presumably up to the longest document in the batch; confirm.
x[0], embeddings[0]

('virginamerica what dhepburn said.',
 <tf.Tensor: shape=(17, 1024), dtype=float32, numpy=
 array([[ 0.2328333 , -0.2434732 ,  0.40921795, ...,  0.3118505 ,
          0.2960112 , -0.1535493 ],
        [ 0.60510397,  0.08181188,  0.2538871 , ...,  0.30966172,
          0.06107214,  0.18624276],
        [ 0.22881086, -0.11599298,  0.03148502, ..., -0.01174149,
          0.28047734,  0.08254117],
        ...,
        [-0.0284084 , -0.04353216,  0.04130162, ...,  0.02583168,
         -0.01429836, -0.01650422],
        [-0.0284084 , -0.04353216,  0.04130162, ...,  0.02583168,
         -0.01429836, -0.01650422],
        [-0.0284084 , -0.04353216,  0.04130162, ...,  0.02583168,
         -0.01429836, -0.01650422]], dtype=float32)>)

## feature extraction

In [None]:
# Run ELMo over every cleaned tweet in a single call and keep the "elmo" output.
all_texts = tf.constant(df['clean_text'])
elmo_outputs = elmo.signatures["default"](all_texts)
elmo_data = elmo_outputs["elmo"]

In [None]:
# Shape of the ELMo output for the whole sample.
elmo_data.shape

TensorShape([1000, 32, 1024])

In [None]:
# Flatten each document's embedding matrix into a single long feature vector,
# giving one row per document for the classifier.
n_documents = elmo_data.shape[0]
elmo_data_new = tf.reshape(elmo_data, [n_documents, -1])

In [None]:
# Hold out 20% of the documents for testing; the seed is fixed so the split is reproducible.
features = elmo_data_new.numpy()
labels = df['label'].values
elmo_train_new, elmo_test_new, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Shapes of the train and test feature matrices produced by the split.
elmo_train_new.shape, elmo_test_new.shape

((800, 32768), (200, 32768))

## save and load embedding

In [None]:
# Persist the train/test feature matrices so they can be reloaded later.
# The `with` blocks close each file even if pickle.dump raises.

# save elmo_train_new
with open("elmo_train.pickle", "wb") as pickle_out:
    pickle.dump(elmo_train_new, pickle_out)

# save elmo_test_new
with open("elmo_test.pickle", "wb") as pickle_out:
    pickle.dump(elmo_test_new, pickle_out)

In [None]:
# Reload the saved feature matrices. The `with` blocks make sure the file
# handles are closed after reading.
# NOTE: pickle.load can execute arbitrary code; only load pickle files that
# this notebook wrote itself.

# load elmo_train_new
with open("elmo_train.pickle", "rb") as pickle_in:
    elmo_train_new = pickle.load(pickle_in)

# load elmo_test_new
with open("elmo_test.pickle", "rb") as pickle_in:
    elmo_test_new = pickle.load(pickle_in)

## modeling using LR

In [None]:
# Fit a logistic-regression classifier on the flattened ELMo features.
# NOTE(review): the saved output of this cell shows the solver stopping at its
# iteration limit; consider raising max_iter or scaling the features.
clf = LogisticRegression(random_state=0)
clf.fit(elmo_train_new, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Predict class labels for the held-out test features.
preds_test = clf.predict(elmo_test_new)

## evaluate model

In [None]:
# Macro-averaged F1 on the test set (unweighted mean of the per-class F1 scores).
f1_score(y_true=y_test, y_pred=preds_test, average='macro')

0.6137936471217681

In [None]:
# Fraction of test tweets whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=preds_test)

0.695