In [1]:
!pip install contractions --quiet
!pip install emoji --quiet

In [2]:
import pandas as pd
import numpy as np
import textblob
import nltk
import contractions as ct
import re
import emoji
import textblob
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [3]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nelso\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nelso\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nelso\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\nelso\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
df_anger = pd.read_csv('data/angriness.csv')
df_anger.head()

Unnamed: 0,content,intensity
0,"Sometimes I’m not angry, I’m hurt and there’s ...",angriness
1,Not available for busy people☺,angriness
2,I do not exist to impress the world. I exist t...,angriness
3,Everything is getting expensive except some pe...,angriness
4,My phone screen is brighter than my future 🙁,angriness


In [5]:
df_happy = pd.read_csv('data/happiness.csv')
df_happy.head()

Unnamed: 0,content,intensity
0,Wants to know how the hell I can remember word...,happiness
1,Love is a long sweet dream & marriage is an al...,happiness
2,The world could be amazing when you are slight...,happiness
3,My secret talent is getting tired without doin...,happiness
4,"Khatarnaak Whatsapp Status Ever… Can\’t talk, ...",happiness


In [6]:
df_sad = pd.read_csv('data/sadness.csv')
df_sad.head()

Unnamed: 0,content,intensity
0,"Never hurt people who love you a lot, because ...",sadness
1,Don’t expect me to tell you what you did wrong...,sadness
2,I preferred walking away than fighting for you...,sadness
3,"Moving forward in life isn’t the hard part, it...",sadness
4,"Never cry for anyone in your life, because tho...",sadness


In [7]:
print("Shape of anger file:",df_anger.shape)
print("Shape of happy file:",df_happy.shape)
print("Shape of sad file:",df_sad.shape)

Shape of anger file: (696, 2)
Shape of happy file: (708, 2)
Shape of sad file: (635, 2)


In [8]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [9]:
ps = nltk.porter.PorterStemmer()

In [10]:
def preprocess_text(text):
    text = str(text).lower()
    text = emoji.demojize(text)
    text = ct.fix(text)
    text = re.sub(r'[^a-zA-Z]',r' ', text)
    text = re.sub(r'nbsp',r' ',text)
    text = re.sub(' +', ' ', text)
    text = ' '.join([ps.stem(word) for word in text.split()])
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

In [11]:
df_anger['cleaned_content'] = df_anger['content'].apply(preprocess_text)
df_happy['cleaned_content'] = df_happy['content'].apply(preprocess_text)
df_sad['cleaned_content'] = df_sad['content'].apply(preprocess_text)

In [12]:
print(df_anger[['content', 'cleaned_content']].head(20))

                                              content  \
0   Sometimes I’m not angry, I’m hurt and there’s ...   
1                      Not available for busy people☺   
2   I do not exist to impress the world. I exist t...   
3   Everything is getting expensive except some pe...   
4        My phone screen is brighter than my future 🙁   
5   Anger is a feeling that makes your mouth work ...   
6   This man, is man, a man, good man, way man, to...   
7   Sometimes the best way to get someone\’s atten...   
8   People like to bring up your past, when your p...   
9               Wakeup>job>noluv>daaru>sleep>repeat..   
10  To all u Haters!!! Thanks for making me famous...   
11  Its hurts the worst when the person that made ...   
12  Life is too short don’t waste it by reading my...   
13  If I treated you the way you treat me, you wou...   
14         Girls r like potato chips…you can’t have 1   
15  Had you asked for my life i would have sacrifi...   
16  People always miss you more

In [13]:
print(df_happy[['content', 'cleaned_content']].head(20))

                                              content  \
0   Wants to know how the hell I can remember word...   
1   Love is a long sweet dream & marriage is an al...   
2   The world could be amazing when you are slight...   
3   My secret talent is getting tired without doin...   
4   Khatarnaak Whatsapp Status Ever… Can\’t talk, ...   
5   The new way of forgetting your past is deletin...   
6   204 countries, 805 Islands, 7 seas, 7+ Billion...   
7                Available…. Prabhu ichhaa tak……!!!!!   
8   Life is too short to be serious all the time. ...   
9   A really cool feature of the Nano they don’t t...   
10  You love flowers, but you cut them. You love a...   
11  All person b very careful when u step out toda...   
12  Men have feelings too. For example, we feel hu...   
13  3 horrible things in life: 1) Slow Internet. 2...   
14  My teacher today gave 45 minute speech about n...   
15  Women loves shoes bcz no matter how much & wha...   
16  Never laugh at your wife’s 

In [14]:
print(df_sad[['content', 'cleaned_content']].head(20))

                                              content  \
0   Never hurt people who love you a lot, because ...   
1   Don’t expect me to tell you what you did wrong...   
2   I preferred walking away than fighting for you...   
3   Moving forward in life isn’t the hard part, it...   
4   Never cry for anyone in your life, because tho...   
5   Seven billion people on this planet and I have...   
6   I know you won’t come back but my heart will t...   
7   When you are in a habit to talk someone daily,...   
8   Value the person when they are alive don’t cry...   
9   A magic of songs and sound, When you are happy...   
10  Behind every flirty boy there is a girl who le...   
11  I don’t get tired of loving you. I just get ti...   
12  The hardest thing to do is watch the person yo...   
13  I will never stop caring, but if you decide to...   
14  It hurts when you have someone in your heart b...   
15  The more you show your true feelings, the more...   
16  It’s amazing how someone br

In [15]:
combined_df = pd.concat([df_anger, df_happy, df_sad], ignore_index=True)

In [16]:
combined_df

Unnamed: 0,content,intensity,cleaned_content
0,"Sometimes I’m not angry, I’m hurt and there’s ...",angriness,sometim angri hurt big differ
1,Not available for busy people☺,angriness,avail busi peopl smile face
2,I do not exist to impress the world. I exist t...,angriness,exist impress world exist live life way make h...
3,Everything is getting expensive except some pe...,angriness,everyth get expens except peopl get cheaper
4,My phone screen is brighter than my future 🙁,angriness,phone screen brighter futur slightli frown face
...,...,...,...
2034,Stop crying over yesterday and start smiling f...,sadness,stop cri yesterday start smile tomorrow
2035,An Eye with Dust ‘n A Heart with Trust Always ...,sadness,eye dust n heart trust alway cri
2036,Tears come from the heart and not from the brain.,sadness,tear come heart brain
2037,"Sometimes you have to hold your head up high, ...",sadness,sometim hold head high blink away tear say goo...


In [17]:
combined_df[['cleaned_content', 'intensity']].to_csv("Cleaned_data.csv",index=False)
combined_df.isnull().sum()

content            0
intensity          0
cleaned_content    0
dtype: int64

In [18]:
df = pd.read_csv('Cleaned_data.csv')
df.head()

Unnamed: 0,cleaned_content,intensity
0,sometim angri hurt big differ,angriness
1,avail busi peopl smile face,angriness
2,exist impress world exist live life way make h...,angriness
3,everyth get expens except peopl get cheaper,angriness
4,phone screen brighter futur slightli frown face,angriness


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['cleaned_content'])
print("Shape of TF-IDF matrix:", X.shape)

Shape of TF-IDF matrix: (2039, 1751)


In [20]:
X

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 92298 stored elements and shape (2039, 1751)>

In [21]:
print("Number of unique tokens in the vocabulary:", len(vectorizer.get_feature_names_out()))
print("Shape of TF-IDF matrix:", X.shape)
print("Number of non-zero features:", X.nnz)

Number of unique tokens in the vocabulary: 1751
Shape of TF-IDF matrix: (2039, 1751)
Number of non-zero features: 92298


In [22]:
sparsity = 100 * (1 - (X.nnz / (X.shape[0] * X.shape[1])))
print(f"Sparsity: {sparsity:.2f}%")

Sparsity: 97.41%


In [23]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['intensity'])
print(f"Encoded labels: {label_encoder.classes_}")

Encoded labels: ['angriness' 'happiness' 'sadness']


In [24]:
y

array([0, 0, 0, ..., 2, 2, 2])

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42,stratify=y)
print(X_train.shape)
print(X_test.shape)

(1529, 1751)
(510, 1751)


In [27]:
pd.DataFrame(y_train).value_counts()

0
1    531
0    522
2    476
Name: count, dtype: int64

In [28]:
pd.DataFrame(y_test).value_counts()

0
1    177
0    174
2    159
Name: count, dtype: int64

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix

In [30]:
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train, y_train)

In [31]:
y_pred_lr = lr_model.predict(X_test)

In [32]:
print(f"Accuracy of Logistic Regression: {accuracy_score(y_test, y_pred_lr):.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_lr))
confusion_matrix(y_test, y_pred_lr)

Accuracy of Logistic Regression: 0.7686
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.78      0.79       174
           1       0.73      0.82      0.77       177
           2       0.78      0.70      0.74       159

    accuracy                           0.77       510
   macro avg       0.77      0.77      0.77       510
weighted avg       0.77      0.77      0.77       510



array([[135,  23,  16],
       [ 17, 145,  15],
       [ 16,  31, 112]], dtype=int64)

In [33]:
model = LogisticRegression(
    solver='liblinear',
    C=10,
    penalty='l1',
    max_iter=1000,
    random_state=42
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [34]:
print(f"Accuracy of Logistic Regression: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))
confusion_matrix(y_test, y_pred)

Accuracy of Logistic Regression: 0.7765
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.82      0.80       174
           1       0.77      0.74      0.76       177
           2       0.78      0.77      0.78       159

    accuracy                           0.78       510
   macro avg       0.78      0.78      0.78       510
weighted avg       0.78      0.78      0.78       510



array([[142,  19,  13],
       [ 24, 131,  22],
       [ 16,  20, 123]], dtype=int64)

In [35]:
svm_model = SVC(kernel='rbf', C=1.0, random_state=42)

In [36]:
svm_model.fit(X_train, y_train)

In [37]:
y_pred_svm = svm_model.predict(X_test)

In [38]:
print(f"Accuracy of SVM: {accuracy_score(y_test, y_pred_svm):.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_svm))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))

Accuracy of SVM: 0.7922
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.77      0.82       174
           1       0.69      0.88      0.78       177
           2       0.85      0.72      0.78       159

    accuracy                           0.79       510
   macro avg       0.81      0.79      0.79       510
weighted avg       0.81      0.79      0.79       510

Confusion Matrix:
 [[134  34   6]
 [  7 156  14]
 [ 10  35 114]]


In [39]:
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)

In [40]:
xgb_model.fit(X_train, y_train)

In [41]:
y_pred_xgb = xgb_model.predict(X_test)

In [42]:
print(f"Accuracy of XGBoost: {accuracy_score(y_test, y_pred_xgb):.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_xgb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))

Accuracy of XGBoost: 0.7647
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.76      0.77       174
           1       0.73      0.78      0.75       177
           2       0.80      0.75      0.77       159

    accuracy                           0.76       510
   macro avg       0.77      0.76      0.77       510
weighted avg       0.77      0.76      0.76       510

Confusion Matrix:
 [[133  30  11]
 [ 20 138  19]
 [ 18  22 119]]


In [55]:
lgbm_model = LGBMClassifier(random_state=42)

In [56]:
lgbm_model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008112 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3386
[LightGBM] [Info] Number of data points in the train set: 1529, number of used features: 531
[LightGBM] [Info] Start training from score -1.074702
[LightGBM] [Info] Start training from score -1.057607
[LightGBM] [Info] Start training from score -1.166951


In [57]:
y_pred_lgbm = lgbm_model.predict(X_test)

In [58]:
print(f"Accuracy of LightGBM: {accuracy_score(y_test, y_pred_lgbm):.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_lgbm))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lgbm))

Accuracy of LightGBM: 0.7451
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.77      0.77       174
           1       0.72      0.70      0.71       177
           2       0.74      0.77      0.75       159

    accuracy                           0.75       510
   macro avg       0.75      0.75      0.75       510
weighted avg       0.75      0.75      0.74       510

Confusion Matrix:
 [[134  27  13]
 [ 23 124  30]
 [ 15  22 122]]
