1. Loading and Preprocessing

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("C:\\Users\\barbi\\Downloads\\nlp_dataset.csv")


In [3]:
import re

In [4]:
print(data.columns)

Index(['Comment', 'Emotion'], dtype='object')


In [5]:
print(data.head())

                                             Comment Emotion
0  i seriously hate one subject to death but now ...    fear
1                 im so full of life i feel appalled   anger
2  i sit here to write i start to dig out my feel...    fear
3  ive been really angry with r and i feel like a...     joy
4  i feel suspicious if there is no one outside l...    fear


Text Cleaning

In [6]:
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  
    text = text.lower() 
    return text

In [7]:
data['cleaned_text'] = data['Comment'].apply(clean_text)


Tokenization

In [8]:
pip install nltk


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [20]:

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


In [21]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\barbi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [32]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\barbi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [28]:
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\barbi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [26]:
print(data.columns)

Index(['Comment', 'Emotion', 'cleaned_text'], dtype='object')


Removal of stopwords

In [30]:
data['tokens'] = data['cleaned_text'].apply(word_tokenize)

In [34]:
stop_words = set(stopwords.words('english'))

In [37]:
def remove_stopwords(tokens): return [word for word in tokens if word not in stop_words]


In [38]:
data['tokens'] = data['tokens'].apply(remove_stopwords)


In [39]:
print(data.columns)

Index(['Comment', 'Emotion', 'cleaned_text', 'tokens'], dtype='object')


2. Feature Extraction

In [76]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [80]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['Comment'])
y = data['Emotion']

Tfidvectorizer transforms text data into a numerical format .

    
This helps highlight unique and significant words.

In [81]:
data=pd.read_csv("C:\\Users\\barbi\\Downloads\\nlp_dataset.csv")
print(data)

                                                Comment Emotion
0     i seriously hate one subject to death but now ...    fear
1                    im so full of life i feel appalled   anger
2     i sit here to write i start to dig out my feel...    fear
3     ive been really angry with r and i feel like a...     joy
4     i feel suspicious if there is no one outside l...    fear
...                                                 ...     ...
5932                 i begun to feel distressed for you    fear
5933  i left feeling annoyed and angry thinking that...   anger
5934  i were to ever get married i d have everything...     joy
5935  i feel reluctant in applying there because i w...    fear
5936  i just wanted to apologize to you because i fe...   anger

[5937 rows x 2 columns]


In [82]:
count_vectorizer = CountVectorizer()
count_matrix = count_vectorizer.fit_transform(data['Comment'])
count_features = count_vectorizer.get_feature_names_out()

In [83]:
count_df = pd.DataFrame(count_matrix.toarray(), columns=count_features)


In [84]:
print("CountVectorizer Features:")
print(count_df)


CountVectorizer Features:
      aa  aac  aaron  ab  abandon  abandoned  abandonment  abbigail  abc  \
0      0    0      0   0        0          0            0         0    0   
1      0    0      0   0        0          0            0         0    0   
2      0    0      0   0        0          0            0         0    0   
3      0    0      0   0        0          0            0         0    0   
4      0    0      0   0        0          0            0         0    0   
...   ..  ...    ...  ..      ...        ...          ...       ...  ...   
5932   0    0      0   0        0          0            0         0    0   
5933   0    0      0   0        0          0            0         0    0   
5934   0    0      0   0        0          0            0         0    0   
5935   0    0      0   0        0          0            0         0    0   
5936   0    0      0   0        0          0            0         0    0   

      abdomen  ...  zendikar  zero  zest  zhu  zipline  zombi

In [85]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data['Comment'])
tfidf_features = tfidf_vectorizer.get_feature_names_out()


In [86]:
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_features)

In [87]:
print("\nTfidfVectorizer Features:")
print(tfidf_df)


TfidfVectorizer Features:
       aa  aac  aaron   ab  abandon  abandoned  abandonment  abbigail  abc  \
0     0.0  0.0    0.0  0.0      0.0        0.0          0.0       0.0  0.0   
1     0.0  0.0    0.0  0.0      0.0        0.0          0.0       0.0  0.0   
2     0.0  0.0    0.0  0.0      0.0        0.0          0.0       0.0  0.0   
3     0.0  0.0    0.0  0.0      0.0        0.0          0.0       0.0  0.0   
4     0.0  0.0    0.0  0.0      0.0        0.0          0.0       0.0  0.0   
...   ...  ...    ...  ...      ...        ...          ...       ...  ...   
5932  0.0  0.0    0.0  0.0      0.0        0.0          0.0       0.0  0.0   
5933  0.0  0.0    0.0  0.0      0.0        0.0          0.0       0.0  0.0   
5934  0.0  0.0    0.0  0.0      0.0        0.0          0.0       0.0  0.0   
5935  0.0  0.0    0.0  0.0      0.0        0.0          0.0       0.0  0.0   
5936  0.0  0.0    0.0  0.0      0.0        0.0          0.0       0.0  0.0   

      abdomen  ...  zendikar  zero  

3. Model Development



In [88]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

In [89]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)


In [90]:
from sklearn.svm import SVC

svm_model = SVC()
svm_model.fit(X_train, y_train)


4. Model comparison

In [91]:
from sklearn.metrics import accuracy_score, f1_score

In [92]:
nb_pred = nb_model.predict(X_test)
svm_pred = svm_model.predict(X_test)

In [93]:
nb_accuracy = accuracy_score(y_test, nb_pred)
nb_f1 = f1_score(y_test, nb_pred, average='weighted')


In [94]:
svm_accuracy = accuracy_score(y_test, svm_pred)
svm_f1 = f1_score(y_test, svm_pred, average='weighted')


In [95]:
print(f'Naive Bayes Accuracy: {nb_accuracy}, F1-score: {nb_f1}')
print(f'SVM Accuracy: {svm_accuracy}, F1-score: {svm_f1}')


Naive Bayes Accuracy: 0.9006734006734006, F1-score: 0.9006441177535423
SVM Accuracy: 0.9116161616161617, F1-score: 0.9117003051960515


Model Suitability

Naive Bayes is suitable for text classification due to its simplicity and efficiency
    
SVM is effective in high-dimensional spaces, making it ideal for text classification