# How to Learn of document embeddings and use them to train a model

We are using the "Sentiment and Emotion in Text" dataset from Kaggle to identify emotions in text and use them for sentiment analysis.

In [17]:
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

SEED=42

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Loading and Exploring dataset

In [2]:
try:
    from google.colab import files
    !wget -P DATAPATH https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Sentiment%20and%20Emotion%20in%20Text/train_data.csv
    !wget -P DATAPATH https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Sentiment%20and%20Emotion%20in%20Text/test_data.csv
    !ls -lah DATAPATH
    filepath = "DATAPATH/train_data.csv"
except ModuleNotFoundError:
    filepath = "Data/Sentiment and Emotion in Text/train_data.csv"

--2023-06-12 00:56:23--  https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Sentiment%20and%20Emotion%20in%20Text/train_data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2479133 (2.4M) [text/plain]
Saving to: ‘DATAPATH/train_data.csv’


2023-06-12 00:56:24 (37.8 MB/s) - ‘DATAPATH/train_data.csv’ saved [2479133/2479133]

--2023-06-12 00:56:24--  https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Sentiment%20and%20Emotion%20in%20Text/test_data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting resp

In [3]:
df = pd.read_csv(filepath)
print(df.shape)
df.head()

(30000, 2)


Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [4]:
df['sentiment'].value_counts()

worry         7433
neutral       6340
sadness       4828
happiness     2986
love          2068
surprise      1613
hate          1187
fun           1088
relief        1021
empty          659
enthusiasm     522
boredom        157
anger           98
Name: sentiment, dtype: int64

In [5]:
# filtering out sentiments everything except 3 of them
shortlist=['neutral', 'happiness', 'worry']
df_subset=df[df['sentiment'].isin(shortlist)]
print("shape", df_subset.shape)
df_subset['sentiment'].value_counts()

shape (16759, 2)


worry        7433
neutral      6340
happiness    2986
Name: sentiment, dtype: int64

# Text Processing

In [15]:
# because tweet messages have a different format and content than regular text, we consider in the text processing stage:
# - Remove the @s and urls because they do not contribute to emotion analysis
# - Use TweetTokenizer
# - remove stop words, numbers and any other special character
from nltk.tokenize import TweetTokenizer

tweeter=TweetTokenizer(strip_handles=True, preserve_case=False)
mystopwords=set(stopwords.words("english"))

# function to tokenize tweets
def preprocess_corpus(texts):
  def remove_stops_digits(tokens):
    return [token for token in tokens if token not in mystopwords and not token.isdigit()]

  return [remove_stops_digits(tweeter.tokenize(content)) for content in texts]

mydata=preprocess_corpus(df_subset['content'])
mycats=df_subset['sentiment']
print(len(mydata), len(mycats))

16759 16759


In [18]:
# splitting data for training and testing proposes
train_data, test_data, train_cats,test_cats = train_test_split(mydata, mycats, random_state=SEED)

In [28]:
# massage training data to put it in doc2vec format
train_doc2vec=[TaggedDocument((d), tags=[str(i)]) for i, d in enumerate(train_data)]
# create doc2vec model
model=Doc2Vec(vector_size=50, alpha=0.025, min_count=5, dm=1, epochs=100)
# train model
model.build_vocab(train_doc2vec)
model.train(train_doc2vec, total_examples=model.corpus_count, epochs=model.epochs)
model.save("d2v.model")
print("Model Saved")

Model Saved


In [34]:
# once the model is trained, we can use it to infer the feature representation for train and test data
model=Doc2Vec.load('d2v.model')

# infering in multiple epochs gives stability to vector values
train_vectors=[model.infer_vector(list_of_tokens, epochs=50) for list_of_tokens in train_data]
test_vectors=[model.infer_vector(list_of_tokens, epochs=50) for list_of_tokens in test_data]

# choosing LogisticRegression as classifier
from sklearn.linear_model import LogisticRegression

myclass = LogisticRegression(class_weight="balanced")
myclass.fit(train_vectors, train_cats)

preds = myclass.predict(test_vectors)
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(test_cats, preds))

print(confusion_matrix(test_cats,preds))

              precision    recall  f1-score   support

   happiness       0.33      0.52      0.41       724
     neutral       0.46      0.54      0.49      1586
       worry       0.61      0.40      0.48      1880

    accuracy                           0.47      4190
   macro avg       0.47      0.48      0.46      4190
weighted avg       0.51      0.47      0.47      4190

[[376 231 117]
 [383 850 353]
 [365 772 743]]
