Data Preprocessing

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re

In [2]:
# Load the dataset
df = pd.read_csv('Emotions_training.csv')

In [3]:
print(df.head())

                                                text  label
0                            i didnt feel humiliated      0
1  i can go from feeling so hopeless to so damned...      0
2   im grabbing a minute to post i feel greedy wrong      3
3  i am ever feeling nostalgic about the fireplac...      2
4                               i am feeling grouchy      3


In [4]:
# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\masne\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\masne\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove links
    text = re.sub(r'http\S+', '', text)
    
    # Remove newline characters
    text = text.replace('\n', ' ')
    
    # Remove words containing numbers
    text = re.sub(r'\w*\d\w*', '', text)
    
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    
    # Remove special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenize the text
    words = nltk.word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    
    # Initialize stemmer and lemmatizer
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    
    # Stemming and lemmatization
    stemmed_words = [stemmer.stem(word) for word in words]
    lemmatized_words = [lemmatizer.lemmatize(word) for word in stemmed_words]
    
    # Join the preprocessed words back into a single string
    preprocessed_text = ' '.join(lemmatized_words)
    
    return preprocessed_text

In [6]:
# Apply preprocessing to the 'text' column 
df['text'] = df['text'].apply(preprocess_text)

In [7]:
# Display the cleaned content
print(df[['text', 'text']].head())

                                                text  \
0                                  didnt feel humili   
1  go feel hopeless damn hope around someon care ...   
2               im grab minut post feel greedi wrong   
3     ever feel nostalg fireplac know still properti   
4                                       feel grouchi   

                                                text  
0                                  didnt feel humili  
1  go feel hopeless damn hope around someon care ...  
2               im grab minut post feel greedi wrong  
3     ever feel nostalg fireplac know still properti  
4                                       feel grouchi  


Feature Engineering

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [10]:
# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the preprocessed text data
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'])

# Convert the TF-IDF matrix to a DataFrame (optional)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Concatenate the TF-IDF matrix with the original DataFrame
df_with_tfidf = pd.concat([df[['text']], tfidf_df], axis=1)




In [11]:
# use df_with_tfidf for modeling
print(df_with_tfidf.head())

                                                text   aa  aaaaaaand  aaaaand  \
0                                  didnt feel humili  0.0        0.0      0.0   
1  go feel hopeless damn hope around someon care ...  0.0        0.0      0.0   
2               im grab minut post feel greedi wrong  0.0        0.0      0.0   
3     ever feel nostalg fireplac know still properti  0.0        0.0      0.0   
4                                       feel grouchi  0.0        0.0      0.0   

   aaaand  aac  aahhh  aaron   ab  abandon  ...  zombi  zone  zonisamid  zoo  \
0     0.0  0.0    0.0    0.0  0.0      0.0  ...    0.0   0.0        0.0  0.0   
1     0.0  0.0    0.0    0.0  0.0      0.0  ...    0.0   0.0        0.0  0.0   
2     0.0  0.0    0.0    0.0  0.0      0.0  ...    0.0   0.0        0.0  0.0   
3     0.0  0.0    0.0    0.0  0.0      0.0  ...    0.0   0.0        0.0  0.0   
4     0.0  0.0    0.0    0.0  0.0      0.0  ...    0.0   0.0        0.0  0.0   

   zoom   zq  zucchini  zum  zum