In [1]:
import  pandas as pd
import re
from math import ceil
import warnings
warnings.filterwarnings("ignore")
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ankur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# read csv file
textData = pd.read_csv("Emotions_training.csv")

# showing data 
textData.head(5)

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


In [3]:
textData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16000 entries, 0 to 15999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    16000 non-null  object
 1   label   16000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 250.1+ KB


# Data Pre-processing

Text column fits to be a **string** datatype which will enable us to perform string operations if needed in future for any analysis.

In [4]:
string_colms = {'text':'string'}
textData = textData.astype(string_colms)
textData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16000 entries, 0 to 15999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    16000 non-null  string
 1   label   16000 non-null  int64 
dtypes: int64(1), string(1)
memory usage: 250.1 KB


In [5]:
textData.describe()

Unnamed: 0,label
count,16000.0
mean,1.565937
std,1.50143
min,0.0
25%,0.0
50%,1.0
75%,3.0
max,5.0


As we see maximum integer value in **label** colume is **5** so **int8** datatype will be enough.

In [6]:
#converting datatype for the int column
textData['label'] = textData['label'].astype('int8')
textData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16000 entries, 0 to 15999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    16000 non-null  string
 1   label   16000 non-null  int8  
dtypes: int8(1), string(1)
memory usage: 140.8 KB


So,now datatype is fixed.

Moving further, there are certain required pre-rpocessing asked to be done in this project. These are as mentioned below : 
- Covert to Lower Case
- Remove links
- Remove next lines (\n)
- Remove Words containing numbers
- Remove Extra spaces
- Remove Special characters

While perfoming above data-perprocessing the impact was not clearly visible as we did not know what rows are getting refactored. So,firstly let's check upon the impact of performing these on the rows of dataset we have.

In [7]:
# Find if upper case exists to be converted to lower case
rowsWithCapitalLetters = textData[textData['text'].str.islower() == False].shape[0]

# Find rows with links to be removed
rowsWithLinks = textData[textData['text'].str.contains(r'https?://[^\s]+')].shape[0]

# Find rows with newlines to be removed
rowsWithNewLine = textData[textData['text'].str.contains('\n')].shape[0]

# Find rows with special characters to be removed
def has_special_chars(text):
    for char in text:
        if not char.isalnum() and char not in [' ']: # It can also be new line but its zero as well :)
            return True
    return False

rowsWithSpChar = textData[textData['text'].apply(has_special_chars)].shape[0]

# Find rows with alphanumeric characters to be removed
def has_alnum(text):
    for char in text:
        if char.isnumeric():
            return True
    return False

rowsWithNum = textData[textData['text'].apply(has_alnum)].shape[0]

# Find rows with extra spaces to be removed
def has_extra_spaces(text):
    return '  ' in text

rowsWithExSpc = textData[textData['text'].apply(has_extra_spaces)].shape[0]

print(rowsWithCapitalLetters, 
      rowsWithLinks, 
      rowsWithNewLine, 
      rowsWithSpChar, 
      rowsWithNum, 
      rowsWithExSpc)

0 0 0 0 0 0


For the checks mentioned for the pre-processing, all the impacted rows were extracted into their respective dataframes and looked upon the count of rows impacted. Surprisingly, none of the rows had anything that would be altered even if these pre-processing steps were performed.
Therefore it is concluded that these pre-processing steps would have no change on the dataset.

However, while performing these, it was found that the text data has random words, weird songs, html tags, etc inserted to many of the sentences in text column making our data noisy. Below is one of the example :

In [8]:
# Finding occurance of random word in dataset
randomWords = textData[textData['text'].str.contains('http')] #https, href, a href http, www, etc
print(randomWords.shape)
randomWords.head(5)

(199, 2)


Unnamed: 0,text,label
125,i feel they are pretty safe on my blog img src...,1
323,i stopped feeling so exhausted a href http pro...,0
462,i feel so dazed a href http twitter,5
866,i feel unwelcome at work sometimes and think p...,0
967,i a href http feeling groggy,0


So, an attempt is being made to find and get rid of as much noise as possible by removing it.

In [9]:
# Define the substrings to remove
remove_strings = [' a href http ', ' http ', ' https ', ' www ', ' href ', ' src ', ' img ', ' s ']

# Iterate through each substring and remove it from the 'text' column in the main DataFrame
for substring in remove_strings:
    textData['text'] = textData['text'].str.replace(substring, ' ')

In [10]:
# Checking if it is removed for one as example
randomWords = textData[textData['text'].str.contains('http')] #http, href, a href http, www, etc
print(randomWords.shape)
randomWords.head(5)

(12, 2)


Unnamed: 0,text,label
1531,i definitely feel like hot stuff strutting dow...,2
2263,i do not know how to feel my hearts aching sad...,0
3170,i am feeling so festive right now and not just...,1
3231,im feeling my loving heart is all yours for th...,2
4986,im feeling determined to face facts have a gan...,1


#### Question : 
Above is not scalable way. How can we deal with such a scenerio efficiently ? 

In [11]:
# removing stop words
stop_words = set(stopwords.words('english'))
textData['text'] = textData['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
textData['text'] = textData['text'].astype('string') # as above operation changes it to object

In [12]:
# stemming
stemmer = PorterStemmer()
textData['text'] = textData['text'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))


In [13]:
#lemmatization
lemmatizer = WordNetLemmatizer()
textData['text'] = textData['text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))
textData['text'] = textData['text'].astype('string') # as above operation changes it to object

In [14]:
textData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16000 entries, 0 to 15999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    16000 non-null  string
 1   label   16000 non-null  int8  
dtypes: int8(1), string(1)
memory usage: 140.8 KB


So we have completed with data-preprocessing.As we can see that before data-preprocessing the memory usage was **250.1+ KB** and after data-preprocessing the memory usage is **140.8 KB** it clearly show that memory usage is **reduced**.

### Class Distribution

Another important thing to make sure before feeding our data into the model is the class distribution of the data. In our case where the expected class are divided into six outcomes, 0 to 5, an equal class distribution can be considered ideal.

In [15]:
textData['label'].value_counts()

label
1    5362
0    4666
3    2159
4    1937
2    1304
5     572
Name: count, dtype: int64

conclude above

### Vectorization Of Text  Data
- Vectorization is essential for text classification tasks because machine learning algorithms typically require numerical input data. 
- Text data, being categorical and unstructured, needs to be converted into a numerical format that algorithms can process effectively. 
- Vectorization transforms text into numerical features, allowing machine learning models to learn patterns and make predictions based on the text's content. 

Which ? Wh ?

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the 'text' column to generate TF-IDF features
tfidf_features = tfidf_vectorizer.fit_transform(textData['text'])

# Convert TF-IDF features to a DataFrame
tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Concatenate the TF-IDF DataFrame with the original DataFrame
textData = pd.concat([textData, tfidf_df], axis=1)

# Print the updated DataFrame
textData.sample(5)


Unnamed: 0,text,label,aa,aaaaaaand,aaaaand,aaaand,aac,aahhh,aaron,ab,...,zombi,zone,zonisamid,zoo,zoom,zq,zucchini,zum,zumba,zz
15603,im psych stop that kind good thing wont feel p...,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2691,also feel stubborn,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12961,feel pretti cranki could think much better fee...,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13756,think lot time woman perceiv problem husband a...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12732,feel like cold feel sick,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
