In [1]:
#Detects emotions from text
#Created by S. Biswas
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import re
from sklearn.preprocessing import LabelEncoder
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC



In [2]:
df_train = pd.read_csv('train.txt', sep=';', names=['Sentence', 'Emotion'], encoding='UTF8')
#data cleaning
df_train = df_train[df_train['Sentence'] != '']
df_train = df_train.dropna()
df_train = df_train.drop_duplicates()
print(df_train['Emotion'])


0        sadness
1        sadness
2          anger
3           love
4          anger
          ...   
15995    sadness
15996    sadness
15997        joy
15998      anger
15999    sadness
Name: Emotion, Length: 15999, dtype: object


In [3]:
df_train['length'] = [len(sen) for sen in df_train['Sentence']]

In [4]:

lb = LabelEncoder()
df_train['Emotions'] = lb.fit_transform(df_train['Emotion'])
print(df_train['Emotions'])
df_train['Original_Emotion'] = lb.inverse_transform(df_train['Emotions'])
print(df_train['Original_Emotion'])

0        4
1        4
2        0
3        3
4        0
        ..
15995    4
15996    4
15997    2
15998    0
15999    4
Name: Emotions, Length: 15999, dtype: int64
0        sadness
1        sadness
2          anger
3           love
4          anger
          ...   
15995    sadness
15996    sadness
15997        joy
15998      anger
15999    sadness
Name: Original_Emotion, Length: 15999, dtype: object


In [5]:
# print(df_train['Emotion'])

In [6]:

stopwords = nltk.corpus.stopwords.words('english')

stemmer = PorterStemmer()
df_train['Cleaned_Sentence'] = df_train['Sentence'].apply(lambda x: re.sub("[^a-zA-Z]", " ", x))
df_train['Cleaned_Sentence'] = df_train['Cleaned_Sentence'].apply(lambda x: x.lower())
df_train['Final_Cleaned_Sentence'] = df_train['Cleaned_Sentence'].apply(lambda x: ' '.join(stemmer.stem(word) for word in x.split() if word not in stopwords))

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [7]:
df_train.head()

Unnamed: 0,Sentence,Emotion,length,Emotions,Original_Emotion,Cleaned_Sentence,Final_Cleaned_Sentence
0,i didnt feel Humiliated,sadness,23,4,sadness,i didnt feel humiliated,didnt feel humili
1,i can go from feeling so hopeless to so damned...,sadness,108,4,sadness,i can go from feeling so hopeless to so damned...,go feel hopeless damn hope around someon care ...
2,im grabbing a minute to post i feel greedy wrong,anger,48,0,anger,im grabbing a minute to post i feel greedy wrong,im grab minut post feel greedi wrong
3,i am ever feeling nostalgic about the fireplac...,love,92,3,love,i am ever feeling nostalgic about the fireplac...,ever feel nostalg fireplac know still properti
4,i am feeling grouchy,anger,20,0,anger,i am feeling grouchy,feel grouchi


In [8]:


x_train, x_test, y_train, y_test = train_test_split(df_train['Final_Cleaned_Sentence'], df_train['Emotion'], test_size= 0.2 , random_state=42 )


In [15]:

tfidfv = TfidfVectorizer()
x_train_tfidf = tfidfv.fit_transform(x_train)
x_test_tfidf = tfidfv.transform(x_test)


In [16]:


classifier = {
    'MultinomialNB' : MultinomialNB(),
    'SVC':SVC(),
    'RandomForestClassifier' : RandomForestClassifier(),
    'LogisticRegression' : LogisticRegression(),   
}
for name, cls in classifier.items():
    cls.fit(x_train_tfidf, y_train)
    y_pred = cls.predict(x_test_tfidf)
    acc = accuracy_score(y_test, y_pred)
    print(f"====={name}======\n")
    print(f"====={acc}======\n")
    rep = classification_report(y_test, y_pred)
    print(f"======{rep}=====")
    
    




       anger       0.95      0.32      0.48       439
        fear       0.88      0.23      0.37       375
         joy       0.58      0.98      0.73      1027
        love       1.00      0.03      0.05       303
     sadness       0.72      0.91      0.80       950
    surprise       1.00      0.02      0.04       106

    accuracy                           0.66      3200
   macro avg       0.85      0.42      0.41      3200
weighted avg       0.76      0.66      0.59      3200
=====



       anger       0.88      0.79      0.83       439
        fear       0.84      0.73      0.78       375
         joy       0.75      0.94      0.83      1027
        love       0.82      0.37      0.51       303
     sadness       0.87      0.90      0.89       950
    surprise       0.82      0.47      0.60       106

    accuracy                           0.82      3200
   macro avg       0.83      0.70      0.74      3200
weighted avg       0.82      0.82      0.80      3200
=====



     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [6]:


# Assuming stemmer and tfidfv are already defined and trained
lg = LogisticRegression()
lg.fit(x_train_tfidf, y_train)

def clean_data_for_predict(text):
    clean_text = re.sub("[^a-zA-Z]", " ", text)
    t = clean_text.lower()  # Apply lowercasing on clean_text

    # Ensure stemmer is called correctly
    t = ' '.join(stemmer.stem(w) for w in t.split())
    return t

def predict_emotion(text):
    text = clean_data_for_predict(text)
    v = tfidfv.transform([text])  # Wrap text in a list to form a 2D array
    
    label = lg.predict(v)[0]  # Get the predicted label directly
    emo = lb.inverse_transform(lg.predict(v))[0]
    return emo, label



NameError: name 'x_train_tfidf' is not defined

In [13]:
predict_emotion("I love you")

ValueError: y contains previously unseen labels: ['love']

In [14]:
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

def cleaning_data(df, col, voc_size, max_len):
    stemmer = PorterStemmer()
    corpus = []
    one_hot_code = []
    for x in df[col]:
        x = re.sub('[^a-zA-Z]', ' ', x)
        x = x.lower()
        x = x.split()
        clean_words = []
        for y in x:
            if y not in stopwords:
                clean_words.append(stemmer.stem(y))
        corpus.append(' '.join(clean_words)) 

                
    for line in corpus:       
        encoded = one_hot(line, voc_size)
        one_hot_code.append(encoded)

    one_hot_code_padded = pad_sequences(one_hot_code, maxlen=max_len, padding='pre')
    return one_hot_code_padded

2024-09-27 12:11:34.864979: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-27 12:11:34.871619: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-27 12:11:34.954602: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [15]:
from tensorflow.keras.utils import to_categorical
x_train = cleaning_data(df_train, 'Sentence', 12000, 3000)
print(x_train)
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(df_train['Emotion'])
y_train = to_categorical(y_train_encoded)
print(len(x_train))
print(len(y_train))

[[    0     0     0 ...  1892  3380  8217]
 [    0     0     0 ...  2494  1388   827]
 [    0     0     0 ...  3380  5490  4101]
 ...
 [    0     0     0 ... 11480   139 11504]
 [    0     0     0 ...  7044  9305  3795]
 [    0     0     0 ...  3380  1439  3781]]
15999
15999


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Embedding, Dropout, LSTM, Flatten
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Assuming y_train contains class labels (0-5 for 6 classes)
# One-hot encode y_train if it is not already
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(df_train['Emotion'])
y_train = to_categorical(y_train_encoded)  
model = Sequential()
model.add(Embedding(input_dim=12000, output_dim=150, input_length=300))
model.add(Dropout(0.2))
# model.add(LSTM(128, return_sequences=True))  
model.add(LSTM(64, return_sequences=False))  
# model.add(Dense(64, activation='sigmoid')) 
model.add(Dense(6, activation='softmax'))  
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
print(len(x_train))
print(len(y_train))
model.fit(x_train, y_train, epochs=5, batch_size=16, verbose=1)



15999
15999
Epoch 1/5


2024-09-27 12:11:49.367869: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 191988000 exceeds 10% of free system memory.


[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2665s[0m 3s/step - accuracy: 0.5257 - loss: 1.2177
Epoch 2/5
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2362s[0m 2s/step - accuracy: 0.9170 - loss: 0.2363
Epoch 3/5
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2195s[0m 2s/step - accuracy: 0.9457 - loss: 0.1525
Epoch 4/5
[1m 491/1000[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m18:24[0m 2s/step - accuracy: 0.9610 - loss: 0.1080