In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Step 1: Data Preprocessing
# Load the dataset
data = pd.read_csv('tweet_emotions .csv')

In [6]:
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [8]:
data.shape

(40000, 3)

Pre Processing

In [9]:
# Text Cleaning
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

data['clean_text'] = data['content'].apply(clean_text)

In [11]:
# Tokenization
nltk.download('punkt')
data['tokens'] = data['clean_text'].apply(nltk.word_tokenize)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [12]:
nltk.download('stopwords')
# Removing Stopwords
stopwords = set(stopwords.words('english'))
data['tokens'] = data['tokens'].apply(lambda x: [word for word in x if word not in stopwords])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [13]:
# Stemming
stemmer = PorterStemmer()
data['tokens'] = data['tokens'].apply(lambda x: [stemmer.stem(word) for word in x])

Feature Extraction

In [14]:

# TF-IDF Vectorization
tfidf = TfidfVectorizer()
features = tfidf.fit_transform(data['tokens'].apply(lambda x: ' '.join(x))).toarray()

In [None]:
#reduce the Data Set Size
from scipy.sparse import csr_matrix

features_sparse = csr_matrix(features)

In [16]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(features_sparse, data['sentiment'], test_size=0.2, random_state=42)

# Model Building

1.Random Forest

In [17]:
# Initialize and train a Random Forest classifier
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, y_train)


In [18]:
# Make predictions on the test set
y_pred = classifier.predict(X_test)

In [19]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Classification Report:\n", report)

Accuracy: 33.21%
Classification Report:
               precision    recall  f1-score   support

       anger       0.00      0.00      0.00        19
     boredom       0.00      0.00      0.00        31
       empty       0.06      0.01      0.01       162
  enthusiasm       0.00      0.00      0.00       163
         fun       0.08      0.01      0.02       338
   happiness       0.31      0.32      0.32      1028
        hate       0.35      0.19      0.25       268
        love       0.46      0.40      0.42       762
     neutral       0.33      0.58      0.42      1740
      relief       0.19      0.02      0.03       352
     sadness       0.35      0.22      0.27      1046
    surprise       0.26      0.04      0.06       425
       worry       0.32      0.42      0.36      1666

    accuracy                           0.33      8000
   macro avg       0.21      0.17      0.17      8000
weighted avg       0.31      0.33      0.30      8000



2.Logistic Regres

In [47]:
# Train the model using LR
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [48]:
# Predict the emotions for test set
y_pred = model.predict(X_test)


In [49]:
# Evaluate the model
from sklearn.metrics import classification_report as clf_reportLR
accuracy = accuracy_score(y_test, y_pred)
report = clf_reportLR(y_test, y_pred)

print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Classification Report:\n", report)

Accuracy: 34.80%
Classification Report:
               precision    recall  f1-score   support

       anger       0.00      0.00      0.00        19
     boredom       0.00      0.00      0.00        31
       empty       0.00      0.00      0.00       162
  enthusiasm       0.00      0.00      0.00       163
         fun       0.06      0.01      0.01       338
   happiness       0.32      0.34      0.33      1028
        hate       0.50      0.14      0.22       268
        love       0.51      0.37      0.43       762
     neutral       0.33      0.60      0.43      1740
      relief       0.43      0.03      0.05       352
     sadness       0.36      0.24      0.29      1046
    surprise       0.28      0.03      0.06       425
       worry       0.34      0.47      0.39      1666

    accuracy                           0.35      8000
   macro avg       0.24      0.17      0.17      8000
weighted avg       0.33      0.35      0.31      8000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


SVM

In [41]:
from sklearn.svm import SVC

In [42]:
classifier = SVC(kernel='linear', random_state=42)
classifier.fit(X_train, y_train)

In [44]:
# Make predictions on the test set
y_pred = classifier.predict(X_test)

In [46]:
# Evaluate the model
from sklearn.metrics import classification_report as clf_report
accuracy = accuracy_score(y_test, y_pred)
report = clf_report(y_test, y_pred)

print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Classification Report:\n", report)

  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 35.08%
Classification Report:
               precision    recall  f1-score   support

       anger       0.00      0.00      0.00        19
     boredom       0.00      0.00      0.00        31
       empty       0.00      0.00      0.00       162
  enthusiasm       0.00      0.00      0.00       163
         fun       0.05      0.01      0.01       338
   happiness       0.32      0.38      0.35      1028
        hate       0.43      0.19      0.26       268
        love       0.49      0.38      0.43       762
     neutral       0.34      0.58      0.43      1740
      relief       0.54      0.04      0.07       352
     sadness       0.37      0.24      0.29      1046
    surprise       0.35      0.03      0.06       425
       worry       0.34      0.47      0.39      1666

    accuracy                           0.35      8000
   macro avg       0.25      0.18      0.18      8000
weighted avg       0.34      0.35      0.31      8000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## The SVM has More Accuracy



