In [6]:
import pandas as pd
import numpy as np
import re
import string
import warnings
warnings.filterwarnings('ignore')

# Download NLTK resources first
# These calls run before the nltk.corpus / nltk.tokenize / nltk.stem imports
# that follow, so the data is requested before anything tries to use it.
# NOTE(review): quiet=True presumably silences the downloader's console
# output - confirm against the NLTK downloader documentation.
import nltk
nltk.download('stopwords', quiet=True)  # presumably the corpus behind stopwords.words('english')
nltk.download('punkt', quiet=True)      # presumably tokenizer data for word_tokenize - TODO confirm
nltk.download('wordnet', quiet=True)    # presumably the lexicon used by WordNetLemmatizer
nltk.download('punkt_tab', quiet=True)  # presumably the newer punkt data format - TODO confirm which NLTK versions need it

# Now import NLTK components
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

print("âœ“ All libraries imported successfully!")

âœ“ All libraries imported successfully!


In [7]:
# Read 'dataset_clean.csv' into the working frame used by the rest of the notebook.
df = pd.read_csv('dataset_clean.csv')

# Summarise what was loaded: the frame's shape, the first five column names,
# and how many further columns follow them.
print(
    "Dataset Info:",
    f"Shape: {df.shape}",
    f"Columns: {list(df.columns[:5])}... (+ {df.shape[1]-5} more)",
    "\nFirst few rows:",
    sep="\n",
)
df.head()

Dataset Info:
Shape: (70000, 29)
Columns: ['text', 'admiration', 'amusement', 'anger', 'annoyance']... (+ 24 more)

First few rows:


Unnamed: 0,text,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldnâ€™t be a grouping category I...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


## Text Preprocessing Functions

In [12]:
# Code-point ranges that remove_emoji treats as emoji.
_EMOJI_RANGES = (
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags
    "\U00002700-\U000027BF"  # dingbats
    "\U0001F900-\U0001F9FF"  # supplemental symbols
    "\U0001FA70-\U0001FAFF"  # more emoji
    "\U00002600-\U000026FF"  # misc symbols
)

# A run of emoji, where each emoji may be followed by variation selectors
# (U+FE0F) and/or zero-width joiners (U+200D). Those invisible code points
# only exist to modify or glue emoji together, so they are removed with the
# emoji they follow instead of being left behind as stray characters.
# Compiled once at definition time rather than on every call.
_EMOJI_PATTERN = re.compile(
    "(?:[" + _EMOJI_RANGES + "][\uFE0F\u200D]*)+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """
    Strip emoji characters from a string.

    Every character inside the ranges listed in ``_EMOJI_RANGES`` is deleted,
    together with any variation selector (U+FE0F) or zero-width joiner
    (U+200D) that directly follows it. Selectors and joiners that are not
    attached to a removed emoji are left untouched.

    Args:
        text (str): Input text.

    Returns:
        str: ``text`` with the matched emoji sequences removed. Nothing is
        inserted in their place, so surrounding whitespace is preserved as-is.
    """
    return _EMOJI_PATTERN.sub('', text)

# Text emoticons recognised by remove_emoticon. Matching is case-insensitive
# because clean_text lowercases its input before calling this function.
_EMOTICON_PATTERN = re.compile(
    r"(?:"
    r":\s?[)(]"    # :)  :(  (optionally with one space after the colon)
    r"|:'\)"       # :')
    r"|<3"         # <3
    r"|;\)"        # ;)
    r"|:-[)(]"     # :-)  :-(
    # Emoticons that end in a letter must not run straight into another word
    # character. Without this guard ":d" / ":p" / ":v" would be cut out of
    # ordinary text such as "note: do this" or "key:value".
    r"|(?::\s?D|:-[DP]|:[Pv])(?!\w)"
    r")",
    flags=re.IGNORECASE,
)


def remove_emoticon(text):
    """
    Remove common ASCII emoticons from a string.

    Recognised forms: ``:)``, ``:(``, ``:D`` (each optionally written with one
    whitespace character after the colon), ``:')``, ``<3``, ``;)``, ``:-)``,
    ``:-(``, ``:-D``, ``:P``, ``:-P`` and ``:v``, matched case-insensitively.
    The letter-terminated forms are only removed when the letter is not
    followed by another word character, so the beginning of a normal word
    after a colon is left intact.

    Args:
        text (str): Input text.

    Returns:
        str: ``text`` with the matched emoticons deleted (no replacement
        character is inserted).
    """
    return _EMOTICON_PATTERN.sub('', text)

# Curly quotes / apostrophes mapped to their ASCII forms so that they are
# stripped by the same punctuation pass as their straight counterparts
# (string.punctuation only contains ASCII characters).
_SMART_QUOTES = str.maketrans({
    '\u2018': "'",
    '\u2019': "'",
    '\u201c': '"',
    '\u201d': '"',
})

# Deletion table for every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """
    Normalise a raw comment into lowercase, punctuation-free text.

    Steps, in order: lowercase; remove emoji; remove emoticons; remove URLs,
    e-mail addresses and @mentions; drop ``#`` signs (the hashtag word is
    kept); remove HTML tags; remove digits; remove punctuation (curly quotes
    are first folded to ASCII so they are removed too); collapse whitespace.

    Args:
        text (str): Raw input text. Must already be a string.

    Returns:
        str: The cleaned text, with words separated by single spaces and no
        leading or trailing whitespace.
    """
    # Convert to lowercase
    text = text.lower()

    # Remove emojis
    text = remove_emoji(text)

    # Remove emoticons (before HTML-tag removal so "<3" is not seen as a tag start)
    text = remove_emoticon(text)

    # Remove URLs. The scheme / "www." prefix must start at a word boundary and
    # be complete; a bare "www\S+" also matched inside words ("awwww" -> "a").
    text = re.sub(r'\b(?:https?://|www\.)\S+', '', text)

    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)

    # Remove mentions (@username)
    text = re.sub(r'@\w+', '', text)

    # Remove hashtags (keep the text, remove #)
    text = text.replace('#', '')

    # Remove HTML tags. A tag must open with a letter, "/" or "!" right after
    # "<"; the looser "<.*?>" also deleted ordinary prose sitting between an
    # unrelated "<" and ">" (e.g. "5 < 10 but 20 > 3").
    text = re.sub(r'</?[a-z!][^<>]*>', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove punctuation (curly quotes are normalised to ASCII first)
    text = text.translate(_SMART_QUOTES).translate(_PUNCTUATION_TABLE)

    # Remove extra whitespace
    text = ' '.join(text.split())

    return text

# Sanity check: print a noisy sample next to its cleaned form.
sample_text = "OMG! Check this out: https://example.com @user #awesome <3 Best thing ever ðŸ”¥!!!"
print(f"Original: {sample_text}")
print(f"Cleaned: {clean_text(sample_text)}")

Original: OMG! Check this out: https://example.com @user #awesome <3 Best thing ever ðŸ”¥!!!
Cleaned: omg check this out awesome best thing ever


In [13]:
# Lazily-filled cache of the English stopword set. Building it inside
# remove_stopwords meant the list was fetched from the corpus and converted
# to a set again for every single row the function was applied to.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the English stopword set, loading it from the corpus on first use only."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def remove_stopwords(text):
    """
    Remove English stopwords from text.

    The text is split with ``word_tokenize`` and every token that appears in
    NLTK's English stopword list is dropped. The comparison is an exact
    match, so the input is expected to be lowercased already
    (NOTE(review): the NLTK list appears to be all lowercase - confirm).

    Args:
        text (str): Input text.

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    stop_words = _english_stopwords()
    return ' '.join(
        token for token in word_tokenize(text) if token not in stop_words
    )

# Sanity check: show a sentence before and after stopword removal.
sample = "this is a sample sentence with some stopwords"
print(f"Original: {sample}")
print(f"Without stopwords: {remove_stopwords(sample)}")

Original: this is a sample sentence with some stopwords
Without stopwords: sample sentence stopwords


In [14]:
def lemmatize_text(text):
    """
    Lemmatize every token of ``text`` with NLTK's WordNet lemmatizer.

    The text is split with ``word_tokenize`` and each token is passed to
    ``WordNetLemmatizer.lemmatize`` without a part-of-speech argument, so the
    lemmatizer's default applies.
    NOTE(review): that default appears to treat tokens as nouns - the sample
    run in this notebook leaves "running" and "ran" unchanged - confirm.

    Args:
        text (str): Input text.

    Returns:
        str: The lemmatized tokens joined by single spaces.
    """
    wordnet = WordNetLemmatizer()
    return ' '.join(map(wordnet.lemmatize, word_tokenize(text)))

# Sanity check: show a handful of inflected words before and after lemmatization.
sample = "running runs ran better best"
print(f"Original: {sample}")
print(f"Lemmatized: {lemmatize_text(sample)}")

Original: running runs ran better best
Lemmatized: running run ran better best


In [15]:
def preprocess_text(text):
    """
    Run the full preprocessing pipeline on a single value.

    Missing values (as detected by ``pd.isna``) and empty strings yield an
    empty string. Anything else is converted with ``str`` and then passed
    through ``clean_text``, ``remove_stopwords`` and ``lemmatize_text``, in
    that order.

    Args:
        text: A single cell value, typically a string.

    Returns:
        str: The preprocessed text, or ``''`` for missing/empty input.
    """
    # Guard clause: nothing to process for missing or empty input.
    if pd.isna(text) or text == '':
        return ''

    processed = str(text)
    # Each stage consumes the previous stage's output.
    for stage in (clean_text, remove_stopwords, lemmatize_text):
        processed = stage(processed)
    return processed

# Sanity check: run one noisy sample through the complete pipeline.
sample = "Hey @user! Check out this AMAZING article: https://example.com #ML #AI ðŸ”¥"
print(f"Original: {sample}")
print(f"Preprocessed: {preprocess_text(sample)}")

Original: Hey @user! Check out this AMAZING article: https://example.com #ML #AI ðŸ”¥
Preprocessed: hey check amazing article ml ai


## Apply Preprocessing to Dataset

In [None]:
# Preview the first three raw texts (truncated to 150 characters) with their full lengths.
banner = "=" * 80
print(banner, "BEFORE PREPROCESSING", banner, sep="\n")
for i in range(3):
    raw_text = df['text'].iloc[i]
    print(f"\n{i+1}. {raw_text[:150]}...")
    print(f"   Length: {len(raw_text)} characters")

BEFORE PREPROCESSING

1. That game hurt....
   Length: 15 characters

2.  >sexuality shouldnâ€™t be a grouping category It makes you different from othet ppl so imo it fits the definition of "grouping" ...
   Length: 127 characters

3. You do right, if you don't care then fuck 'em!...
   Length: 46 characters


In [17]:
# Run the full pipeline over every row of the 'text' column, showing a progress bar.
print(
    "Preprocessing text data...",
    "This may take a few minutes for large datasets...\n",
    sep="\n",
)

from tqdm import tqdm
tqdm.pandas()  # makes `progress_apply` (used below) available on pandas objects

raw_text_column = df['text']
df['text_processed'] = raw_text_column.progress_apply(preprocess_text)

print("\nâœ“ Preprocessing completed!")

Preprocessing text data...
This may take a few minutes for large datasets...



100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 70000/70000 [00:29<00:00, 2376.68it/s]


âœ“ Preprocessing completed!





In [None]:
# Compare the first three rows before and after preprocessing (previews cut at 100 characters).
banner = "=" * 80
print(banner, "AFTER PREPROCESSING", banner, sep="\n")
for i in range(3):
    original_text = df['text'].iloc[i]
    processed_text = df['text_processed'].iloc[i]
    print(f"\n{i+1}. Original: {original_text[:100]}...")
    print(f"   Processed: {processed_text[:100]}...")
    print(f"   Length: {len(original_text)} â†’ {len(processed_text)} characters")

## Data Quality Check

In [None]:
# Flag rows whose processed text is blank once surrounding whitespace is stripped.
empty_texts = df['text_processed'].str.strip() == ''
empty_count = empty_texts.sum()
total_rows = len(df)
valid_count = total_rows - empty_count

banner = "=" * 80
print(banner, "DATA QUALITY CHECK", banner, sep="\n")
print(f"Total rows: {total_rows:,}")
print(f"Empty after preprocessing: {empty_count:,} ({empty_count/total_rows*100:.2f}%)")
print(f"Valid texts: {valid_count:,} ({valid_count/total_rows*100:.2f}%)")

# Character-length statistics of the processed text; the lengths are also
# stored on the frame as a new 'text_processed_length' column.
print("\nText length statistics (after preprocessing):")
df['text_processed_length'] = df['text_processed'].str.len()
processed_lengths = df['text_processed_length']
print(f"  Mean: {processed_lengths.mean():.0f} characters")
print(f"  Median: {processed_lengths.median():.0f} characters")
print(f"  Min: {processed_lengths.min()}")
print(f"  Max: {processed_lengths.max()}")

In [None]:
# Keep only rows whose processed text is non-blank; work on an independent copy.
has_content = df['text_processed'].str.strip() != ''
df_final = df[has_content].copy()

rows_before, rows_after = len(df), len(df_final)
print(f"\nRows before filtering: {rows_before:,}")
print(f"Rows after filtering: {rows_after:,}")
print(f"Rows removed: {rows_before - rows_after:,}")

## Save Preprocessed Data

In [None]:
# Label columns carried into the final dataset, in output order.
emotion_cols = [
    'admiration', 'amusement', 'anger', 'annoyance',
    'approval', 'caring', 'confusion', 'curiosity',
    'desire', 'disappointment', 'disapproval', 'disgust',
    'embarrassment', 'excitement', 'fear', 'gratitude',
    'grief', 'joy', 'love', 'nervousness',
    'optimism', 'pride', 'realization', 'relief',
    'remorse', 'sadness', 'surprise', 'neutral',
]

# Select the processed text plus the label columns, then expose the processed
# text under the plain name 'text'. Both steps return new frames, so df_final
# is left unmodified.
df_preprocessed = (
    df_final[['text_processed'] + emotion_cols]
    .rename(columns={'text_processed': 'text'})
)

print(
    "Final dataset structure:",
    f"Shape: {df_preprocessed.shape}",
    f"Columns: {list(df_preprocessed.columns)}",
    "\nFirst few rows:",
    sep="\n",
)
df_preprocessed.head()

In [None]:
# Write the final frame to 'dataset_preprocessed.csv' without the index column.
df_preprocessed.to_csv('dataset_preprocessed.csv', index=False)

banner = "=" * 80
print(banner, "âœ“ PREPROCESSING COMPLETE!", banner, sep="\n")
print("Preprocessed dataset saved to: dataset_preprocessed.csv")
n_samples, n_columns = df_preprocessed.shape
print(f"Total samples: {n_samples:,}")
print(f"Features: {n_columns} (1 text + 28 emotion labels)")
print("\nDataset is ready for model training!")

## Summary Statistics

In [None]:
# Mean character length before vs. after preprocessing, and the relative drop in percent.
original_avg_length = df_final['text'].str.len().mean()
processed_avg_length = df_preprocessed['text'].str.len().mean()
length_drop = original_avg_length - processed_avg_length
reduction = (length_drop / original_avg_length) * 100

banner = "=" * 80
print(banner, "PREPROCESSING IMPACT", banner, sep="\n")
print(f"Original average text length: {original_avg_length:.0f} characters")
print(f"Processed average text length: {processed_avg_length:.0f} characters")
print(f"Reduction: {reduction:.1f}%")

# Per-emotion column sums over the final dataset.
print("\nEmotion label distribution:")
emotion_totals = df_preprocessed[emotion_cols].sum()
total_labels = emotion_totals.sum()
n_comments = len(df_preprocessed)
print(f"Total emotion labels: {total_labels:,}")
print(f"Average labels per comment: {total_labels / n_comments:.2f}")

# Five largest column sums, each with its share of all rows.
print("\nTop 5 most common emotions:")
top_emotions = emotion_totals.sort_values(ascending=False).head(5)
for i, (emotion, count) in enumerate(top_emotions.items(), 1):
    print(f"{i}. {emotion:15s}: {count:6,} ({count/n_comments*100:5.2f}%)")