# TEXT CLASSIFICATION

In [2]:
#import necessary libraries
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [3]:
#load the emotion training set from 'Emotions_training.csv' into a DataFrame
df = pd.read_csv('Emotions_training.csv')
#bare last expression: display the freshly loaded frame as the cell output
df

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,1
15998,i feel like this was such a rude comment and i...,3


In [4]:
#handle missing values: drop every row whose 'text' entry is missing (NaN)
#inplace=True mutates df directly, so the call's return value is not assigned
df.dropna(subset=['text'], inplace=True)

# Lower case

In [5]:
#convert the 'text' column to lower case
#(the later "Special Characters" cell keeps only a-z and whitespace, so upper-case
# letters have to be folded before that step or they would be deleted)
df['text'] = df['text'].str.lower() 

# Remove Links

In [6]:
#strip URL-like tokens from a string with a regular expression
def remove_links(text):
    """Return `text` with every 'http...' token deleted.

    A token starts at the literal characters 'http' and extends up to (not
    including) the next whitespace character. The whitespace around the
    token is left in place.
    """
    link_pattern = re.compile(r'http\S+')
    return link_pattern.sub('', text)

#run remove_links over every entry of the 'text' column
df['text'] = df['text'].map(remove_links)

# Remove newlines (\n)

In [7]:
#replace newline characters in the 'text' column with a single space.
#Deleting them outright (replacing with '') would glue together two words that were
#separated only by a line break ("foo\nbar" -> "foobar"); any surplus spaces this
#introduces are collapsed by the later "Extra spaces" cell.
#regex=False makes '\n' a plain literal regardless of the pandas version's default
df['text'] = df['text'].str.replace('\n', ' ', regex=False)

# Words containing numbers

In [9]:
#to remove words containing digits from the 'text' column
def remove_words_with_digits(text):
    """Return `text` rebuilt from only the words that contain no digit.

    The input is coerced with str() and split on whitespace; every word in
    which the regex \\d finds a match is dropped, and the remaining words are
    joined back together with single spaces.
    """
    # raw string: '\d' inside a normal string literal is an invalid escape sequence
    return ' '.join(word for word in str(text).split() if not re.search(r'\d', word))

#apply the helper to every entry of the 'text' column
df['text'] = df['text'].apply(remove_words_with_digits)


# Extra spaces

In [10]:
#collapse every run of whitespace in the 'text' column into one space (regex substitution)
_whitespace_run = re.compile(r'\s+')
df['text'] = df['text'].apply(lambda entry: _whitespace_run.sub(' ', entry))

# Special Characters

In [11]:
#delete every character that is neither a lower-case letter a-z nor whitespace
df['text'] = df['text'].str.replace(pat=r'[^a-z\s]', repl='', regex=True)


# Removal of stop words

In [12]:
#fetch the NLTK stop-word corpus (needed by stopwords.words below)
nltk.download('stopwords')

#English stop words, kept in a set for membership tests
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return `text` without its stop words.

    The text is split on whitespace; a word is kept when its lower-cased form
    is not in the module-level `stop_words` set. The kept words are joined
    back together with single spaces.
    """
    kept_words = (token for token in text.split() if token.lower() not in stop_words)
    return ' '.join(kept_words)

#apply the function to the 'text' column
df['text'] = df['text'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to C:\Users\Moulya
[nltk_data]     R\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Stemming

In [13]:
#PorterStemmer comes from nltk.stem
from nltk.stem import PorterStemmer

#a single stemmer instance shared by every call to apply_stemming
porter_stemmer = PorterStemmer()

def apply_stemming(text):
    """Return `text` with each whitespace-separated word replaced by its Porter stem.

    The stems are joined back together with single spaces.
    """
    return ' '.join(map(porter_stemmer.stem, text.split()))

#apply the function to the 'text' column
df['text'] = df['text'].apply(apply_stemming)

# Lemmatization

In [14]:
#importing the WordNetLemmatizer class from nltk
from nltk.stem import WordNetLemmatizer

#WordNetLemmatizer looks words up in the WordNet corpus; fetch it explicitly (as the
#stop-word cell does for 'stopwords') so the notebook also runs on a machine where the
#corpus has not been installed yet, instead of failing with a LookupError
nltk.download('wordnet')

#initializing the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

def apply_lemmatization(text):
    """Return `text` with each whitespace-separated word lemmatized.

    lemmatize() is called without a part-of-speech argument, so NLTK's
    default is used for every word. The results are joined back together
    with single spaces.
    """
    # NOTE(review): the previous cell has already Porter-stemmed this column, so many
    # tokens are no longer dictionary words -- confirm that lemmatizing after stemming
    # is intended.
    return ' '.join(lemmatizer.lemmatize(word) for word in text.split())

#apply the function to the 'text' column, then display the fully preprocessed frame
df['text'] = df['text'].apply(apply_lemmatization)
df

Unnamed: 0,text,label
0,didnt feel humili,0
1,go feel hopeless damn hope around someon care ...,0
2,im grab minut post feel greedi wrong,3
3,ever feel nostalg fireplac know still properti,2
4,feel grouchi,3
...,...,...
15995,brief time beanbag said anna feel like beaten,0
15996,turn feel pathet still wait tabl sub teach degre,0
15997,feel strong good overal,1
15998,feel like rude comment im glad,3


# Feature Engineering

Convert the text corpus to a matrix of TF-IDF-weighted word counts (vectorize the text data).

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

#vectorize the 'text' column of the df produced by the preprocessing cells above.
#The CSV is deliberately NOT re-read here: loading 'Emotions_training.csv' again would
#rebind df to the raw text and silently throw away the cleaning, stop-word removal,
#stemming and lemmatization done so far.
text_column = 'text'

#initialize TfidfVectorizer
vectorizer = TfidfVectorizer()

#fit the vectorizer to the corpus and transform the corpus into a matrix of TF-IDF features
X = vectorizer.fit_transform(df[text_column])

#TF-IDF matrix as a dense array (one row per document, one column per feature).
#toarray() stores every zero explicitly, so this copy can be very large; prefer X itself
#wherever a dense array is not strictly required.
tfidf_matrix = X.toarray()

#feature names (words), in the same order as the matrix columns
feature_names = vectorizer.get_feature_names_out()

#print
print("TF-IDF Matrix:")
print(tfidf_matrix)
print("\nFeature Names:")
print(feature_names)


TF-IDF Matrix:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

Feature Names:
['aa' 'aaaaaaand' 'aaaaand' ... 'zum' 'zumba' 'zz']
