In [26]:
import pandas as pd

# Load the dataset (path is relative to the notebook's working directory)
file_path = 'data/isear/isear-train.csv'
data = pd.read_csv(file_path)

# `DataFrame.info()` prints its summary and returns None, so call it for its
# side effect instead of storing or displaying its return value.
data.info()

# Keep the preview as the cell's last expression so it renders as a table
# rather than as plain text inside a tuple.
data_head = data.head()
data_head

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5341 entries, 0 to 5340
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Emotion Label  5340 non-null   object
 1   Text           5334 non-null   object
 2   Unnamed: 2     7 non-null      object
 3   Unnamed: 3     5 non-null      object
 4   Unnamed: 4     2 non-null      object
 5   Unnamed: 5     1 non-null      object
 6   Unnamed: 6     1 non-null      object
dtypes: object(7)
memory usage: 292.2+ KB


(None,
   Emotion Label                                               Text Unnamed: 2  \
 0           joy  When I understood that I was admitted to the U...        NaN   
 1          fear  I broke a window of a neighbouring house and I...        NaN   
 2           joy                         Got a big fish in fishing.        NaN   
 3          fear  Whenever I am alone in a dark room, walk alone...        NaN   
 4         shame  I bought a possible answer to a homework probl...        NaN   
 
   Unnamed: 3 Unnamed: 4 Unnamed: 5 Unnamed: 6  
 0        NaN        NaN        NaN        NaN  
 1        NaN        NaN        NaN        NaN  
 2        NaN        NaN        NaN        NaN  
 3        NaN        NaN        NaN        NaN  
 4        NaN        NaN        NaN        NaN  )

In [27]:
# Keep only the two columns of interest and drop rows where either one is missing.
# `.copy()` makes `cleaned_data` an independent frame, so columns assigned to it in
# later cells modify this frame only and cannot raise SettingWithCopyWarning.
# NOTE(review): the `Unnamed: N` columns hold a handful of non-null values; presumably
# these are text fragments split off by unquoted commas in the CSV — confirm that
# discarding them is acceptable.
cleaned_data = data[['Emotion Label', 'Text']].dropna().copy()

# `info()` prints and returns None, so call it on its own line; end the cell with
# `head()` so the preview is rendered as the cell's output.
cleaned_data.info()
cleaned_data.head()


<class 'pandas.core.frame.DataFrame'>
Index: 5334 entries, 0 to 5340
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Emotion Label  5334 non-null   object
 1   Text           5334 non-null   object
dtypes: object(2)
memory usage: 125.0+ KB


(  Emotion Label                                               Text
 0           joy  When I understood that I was admitted to the U...
 1          fear  I broke a window of a neighbouring house and I...
 2           joy                         Got a big fish in fishing.
 3          fear  Whenever I am alone in a dark room, walk alone...
 4         shame  I bought a possible answer to a homework probl...,
 None)

It seems there are some entries in the "Emotion Label" column with incorrect formatting or additional text. I'll correct these entries and provide a cleaned-up distribution of emotions.

In [28]:
# Some labels carry extra comma-separated text: keep only the part before the
# first comma and trim surrounding whitespace.
cleaned_data['Emotion Label'] = (
    cleaned_data['Emotion Label']
    .str.split(',')
    .str[0]
    .str.strip()
)

# Count how many entries each normalized label has
cleaned_emotion_distribution = cleaned_data['Emotion Label'].value_counts()

# Show the per-emotion counts as the cell output
cleaned_emotion_distribution


Emotion Label
joy        778
guilt      769
sadness    762
shame      758
disgust    758
anger      758
fear       751
Name: count, dtype: int64

The emotions are quite evenly distributed across the dataset: each of the seven labels has between 751 and 778 entries. This distribution can be visualized in a bar chart if you'd like.

The second insight is an analysis of the average length, in characters, of the text entries for each emotion.

In [29]:
# Character count of each text entry. `.str.len()` is the vectorized pandas string
# accessor; it gives the same lengths as `.apply(len)` for strings and yields NaN
# instead of raising if a value is missing.
cleaned_data['Text Length'] = cleaned_data['Text'].str.len()

# Mean text length per emotion label
average_text_length_by_emotion = cleaned_data.groupby('Emotion Label')['Text Length'].mean()

# Display the average text length for each emotion
average_text_length_by_emotion


Emotion Label
anger      127.112137
disgust    110.885224
fear       117.944075
guilt      118.661899
joy         98.410026
sadness    103.771654
shame      111.841689
Name: Text Length, dtype: float64

Next, we analyze the most common words used across the dataset and within each emotion category to identify key themes and expressions. The cell below starts with the dataset as a whole.

In [31]:
import re
from collections import Counter

def simple_count_vectorizer(texts, top_n=20, stopwords=None):
    """Count word frequencies across `texts` and return the most frequent words.

    Each text is lower-cased and split into runs of word characters; tokens that
    appear in `stopwords` are ignored.

    Parameters
    ----------
    texts : iterable
        Documents to count. Entries that are not `str` (for example None or NaN
        left over from missing values) are skipped.
    top_n : int or None, default 20
        How many of the most frequent words to return. None returns every word.
    stopwords : iterable of str, optional
        Words to exclude. Matching is case-insensitive. Defaults to no stopwords.

    Returns
    -------
    dict
        Mapping of word to count, ordered from most to least frequent.
    """
    # Tokens are lower-cased below, so lower-case the stopwords as well; building
    # a set once also gives constant-time lookups when a list is passed in.
    excluded = {word.lower() for word in stopwords} if stopwords else set()

    word_count = Counter()
    for text in texts:
        # Missing values would otherwise fail on `.lower()`.
        if not isinstance(text, str):
            continue
        tokens = re.findall(r'\b\w+\b', text.lower())
        word_count.update(token for token in tokens if token not in excluded)

    return dict(word_count.most_common(top_n))

# Example usage: count words over every text entry in the cleaned dataset
texts = cleaned_data['Text'].tolist()

# Minimal stopword list; extend it if more filler words should be ignored
stopwords = {'the', 'and', 'to', 'of', 'a', 'in', 'that', 'is', 'on', 'for'}

top_words = simple_count_vectorizer(
    texts,
    top_n=20,
    stopwords=stopwords,
)

print(top_words)


{'i': 7625, 'my': 3255, 'was': 3159, 'when': 2556, 'had': 1805, 'me': 1593, 'not': 1152, 'with': 1059, 'it': 1034, 'at': 1024, 'he': 694, 'friend': 686, 'very': 680, 'felt': 628, 'an': 542, 'she': 527, 'her': 517, 'we': 491, 'about': 486, 'one': 480}
