# Data Preprocessing & Text Cleaning
## Social Media Sentiment Analysis

This notebook performs comprehensive text preprocessing including cleaning, normalization, lemmatization, and stemming.

## 1. Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
import re
import string
import warnings
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

warnings.filterwarnings('ignore')
tqdm.pandas()

print('Libraries imported successfully!')

Libraries imported successfully!


## 2. Download NLTK Resources

In [2]:
nltk_resources = ['stopwords', 'punkt', 'wordnet', 'omw-1.4', 'punkt_tab']

for resource in nltk_resources:
    try:
        nltk.download(resource, quiet=True)
    except:
        print(f'Could not download {resource}')

print('NLTK resources downloaded!')

NLTK resources downloaded!


## 3. Load the Dataset

In [3]:
column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']
df = pd.read_csv('dataset/training.1600000.processed.noemoticon.csv', 
                 encoding='latin-1', 
                 names=column_names)

print(f'Dataset loaded: {df.shape[0]:,} rows × {df.shape[1]} columns')
df.head()

Dataset loaded: 1,600,000 rows × 6 columns


Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


## 4. Initial Data Preparation

In [4]:
df_clean = df.copy()

df_clean['sentiment'] = df_clean['target'].map({0: 0, 4: 1})
df_clean['sentiment_label'] = df_clean['sentiment'].map({0: 'Negative', 1: 'Positive'})

print('Sentiment labels created:')
print(df_clean['sentiment_label'].value_counts())
print()
print('Sample data:')
df_clean[['text', 'sentiment', 'sentiment_label']].head()

Sentiment labels created:
sentiment_label
Negative    800000
Positive    800000
Name: count, dtype: int64

Sample data:


Unnamed: 0,text,sentiment,sentiment_label
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0,Negative
1,is upset that he can't update his Facebook by ...,0,Negative
2,@Kenichan I dived many times for the ball. Man...,0,Negative
3,my whole body feels itchy and like its on fire,0,Negative
4,"@nationwideclass no, it's not behaving at all....",0,Negative


## 5. Text Preprocessing Functions

Comprehensive preprocessing functions for text cleaning.

In [5]:
def extract_features(text):
    """Extract features from text before cleaning (URLs, mentions, hashtags)"""
    features = {}
    
    features['url_count'] = len(re.findall(r'http\S+|www\S+', text))
    features['mention_count'] = len(re.findall(r'@\w+', text))
    features['hashtag_count'] = len(re.findall(r'#\w+', text))
    
    hashtags = re.findall(r'#(\w+)', text)
    features['hashtags'] = ','.join(hashtags) if hashtags else ''
    
    return features

def clean_text(text):
    """Comprehensive text cleaning function"""
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

def remove_stopwords(text):
    """Remove common English stopwords"""
    stop_words = set(stopwords.words('english'))
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

def lemmatize_text(text):
    """Lemmatize words to their root form"""
    lemmatizer = WordNetLemmatizer()
    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

def stem_text(text):
    """Stem words using Porter Stemmer"""
    stemmer = PorterStemmer()
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words]
    return ' '.join(stemmed_words)

print('Preprocessing functions created!')

Preprocessing functions created!


## 6. Feature Extraction

Extract features like URL count, mentions, hashtags before cleaning the text.

In [6]:
print('Extracting features from tweets...')

features_df = df_clean['text'].progress_apply(extract_features).apply(pd.Series)
df_clean = pd.concat([df_clean, features_df], axis=1)

print('\nFeatures extracted!')
print(f"\nFeature Statistics:")
print(f"Tweets with URLs: {(df_clean['url_count'] > 0).sum():,}")
print(f"Tweets with mentions: {(df_clean['mention_count'] > 0).sum():,}")
print(f"Tweets with hashtags: {(df_clean['hashtag_count'] > 0).sum():,}")

df_clean[['text', 'url_count', 'mention_count', 'hashtag_count', 'hashtags']].head(10)

Extracting features from tweets...


100%|██████████| 1600000/1600000 [00:05<00:00, 283240.37it/s]




Features extracted!

Feature Statistics:
Tweets with URLs: 81,117
Tweets with mentions: 738,493
Tweets with hashtags: 35,847


Unnamed: 0,text,url_count,mention_count,hashtag_count,hashtags
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",2,1,0,
1,is upset that he can't update his Facebook by ...,0,0,0,
2,@Kenichan I dived many times for the ball. Man...,0,1,0,
3,my whole body feels itchy and like its on fire,0,0,0,
4,"@nationwideclass no, it's not behaving at all....",0,1,0,
5,@Kwesidei not the whole crew,0,1,0,
6,Need a hug,0,0,0,
7,@LOLTrish hey long time no see! Yes.. Rains a...,0,1,0,
8,@Tatiana_K nope they didn't have it,0,1,0,
9,@twittera que me muera ?,0,1,0,


## 7. Text Cleaning Pipeline

Apply comprehensive text cleaning to all tweets.

In [7]:
print('Cleaning text data...')
print('This may take a few minutes...')

df_clean['text_original'] = df_clean['text']

df_clean['text_cleaned'] = df_clean['text'].progress_apply(clean_text)

print('\nText cleaning completed!')
print('\nComparison (Before vs After):')
comparison = df_clean[['text_original', 'text_cleaned']].head(10)
for idx, row in comparison.iterrows():
    print(f"\n{idx + 1}. ORIGINAL: {row['text_original']}")
    print(f"   CLEANED:  {row['text_cleaned']}")

Cleaning text data...
This may take a few minutes...


100%|██████████| 1600000/1600000 [00:56<00:00, 28510.88it/s]




Text cleaning completed!

Comparison (Before vs After):

1. ORIGINAL: @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D
   CLEANED:  a thats a bummer you shoulda got david carr of third day to do it d

2. ORIGINAL: is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!
   CLEANED:  is upset that he cant update his facebook by texting it and might cry as a result school today also blah

3. ORIGINAL: @Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds
   CLEANED:  i dived many times for the ball managed to save the rest go out of bounds

4. ORIGINAL: my whole body feels itchy and like its on fire 
   CLEANED:  my whole body feels itchy and like its on fire

5. ORIGINAL: @nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there. 
   CLEANED:  no its not behaving at all im mad wh

## 8. Remove Empty Tweets

Remove any tweets that became empty after cleaning.

In [8]:
empty_tweets = (df_clean['text_cleaned'].str.strip() == '') | (df_clean['text_cleaned'].isna())
print(f'Empty tweets found: {empty_tweets.sum():,}')

df_clean = df_clean[~empty_tweets].reset_index(drop=True)

print(f'Dataset after removing empty tweets: {len(df_clean):,} rows')

Empty tweets found: 3,219
Dataset after removing empty tweets: 1,596,781 rows
Dataset after removing empty tweets: 1,596,781 rows


## 9. Remove Stopwords

Remove common English stopwords that don't add much meaning.

In [9]:
print('Removing stopwords...')

df_clean['text_no_stopwords'] = df_clean['text_cleaned'].progress_apply(remove_stopwords)

print('\nStopwords removed!')
print('\nComparison (With Stopwords vs Without):')
comparison = df_clean[['text_cleaned', 'text_no_stopwords']].head(10)
for idx, row in comparison.iterrows():
    print(f"\n{idx + 1}. WITH:    {row['text_cleaned']}")
    print(f"   WITHOUT: {row['text_no_stopwords']}")

Removing stopwords...


100%|██████████| 1596781/1596781 [03:39<00:00, 7285.05it/s]


Stopwords removed!

Comparison (With Stopwords vs Without):

1. WITH:    a thats a bummer you shoulda got david carr of third day to do it d
   WITHOUT: thats bummer shoulda got david carr third day

2. WITH:    is upset that he cant update his facebook by texting it and might cry as a result school today also blah
   WITHOUT: upset cant update facebook texting might cry result school today also blah

3. WITH:    i dived many times for the ball managed to save the rest go out of bounds
   WITHOUT: dived many times ball managed save rest go bounds

4. WITH:    my whole body feels itchy and like its on fire
   WITHOUT: whole body feels itchy like fire

5. WITH:    no its not behaving at all im mad why am i here because i cant see you all over there
   WITHOUT: behaving im mad cant see

6. WITH:    not the whole crew
   WITHOUT: whole crew

7. WITH:    need a hug
   WITHOUT: need hug

8. WITH:    hey long time no see yes rains a bit only a bit lol im fine thanks hows you
   WITHOUT: hey 




## 10. Lemmatization

Convert words to their base/dictionary form.

In [10]:
print('Lemmatizing text...')

df_clean['text_lemmatized'] = df_clean['text_no_stopwords'].progress_apply(lemmatize_text)

print('\nLemmatization completed!')
print('\nSample lemmatized tweets:')
for idx, text in enumerate(df_clean['text_lemmatized'].head(5), 1):
    print(f"{idx}. {text}")

Lemmatizing text...


100%|██████████| 1596781/1596781 [00:40<00:00, 39365.81it/s]


Lemmatization completed!

Sample lemmatized tweets:
1. thats bummer shoulda got david carr third day
2. upset cant update facebook texting might cry result school today also blah
3. dived many time ball managed save rest go bound
4. whole body feel itchy like fire
5. behaving im mad cant see





## 11. Stemming

Apply stemming as an alternative preprocessing approach.

In [None]:
print('Stemming text...')

df_clean['text_stemmed'] = df_clean['text_no_stopwords'].progress_apply(stem_text)

print('\nStemming completed!')
print('\nComparison (Lemmatization vs Stemming):')
comparison = df_clean[['text_lemmatized', 'text_stemmed']].head(10)
for idx, row in comparison.iterrows():
    print(f"\n{idx + 1}. LEMMA: {row['text_lemmatized']}")
    print(f"   STEM:  {row['text_stemmed']}")

Stemming text...


 16%|█▌        | 247760/1596781 [00:23<02:05, 10741.31it/s]

## 12. Text Length Analysis

Compare text lengths before and after preprocessing.

In [None]:
df_clean['original_length'] = df_clean['text_original'].str.len()
df_clean['cleaned_length'] = df_clean['text_lemmatized'].str.len()
df_clean['original_word_count'] = df_clean['text_original'].str.split().str.len()
df_clean['cleaned_word_count'] = df_clean['text_lemmatized'].str.split().str.len()

print('Text Length Comparison:')
print('\nOriginal Text:')
print(f"  Avg character length: {df_clean['original_length'].mean():.2f}")
print(f"  Avg word count: {df_clean['original_word_count'].mean():.2f}")
print('\nCleaned Text (Lemmatized):')
print(f"  Avg character length: {df_clean['cleaned_length'].mean():.2f}")
print(f"  Avg word count: {df_clean['cleaned_word_count'].mean():.2f}")
print(f"\nReduction: {((df_clean['original_length'].mean() - df_clean['cleaned_length'].mean()) / df_clean['original_length'].mean() * 100):.1f}% in character length")

In [None]:
# Visualize length comparison
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Original character length
axes[0, 0].hist(df_clean['original_length'], bins=50, color='skyblue', edgecolor='black', alpha=0.7)
axes[0, 0].set_title('Original Character Length', fontsize=14, fontweight='bold')
axes[0, 0].set_xlabel('Character Count')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].axvline(df_clean['original_length'].mean(), color='red', linestyle='--', label=f"Mean: {df_clean['original_length'].mean():.1f}")
axes[0, 0].legend()
axes[0, 0].grid(alpha=0.3)

# Cleaned character length
axes[0, 1].hist(df_clean['cleaned_length'], bins=50, color='lightgreen', edgecolor='black', alpha=0.7)
axes[0, 1].set_title('Cleaned Character Length', fontsize=14, fontweight='bold')
axes[0, 1].set_xlabel('Character Count')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].axvline(df_clean['cleaned_length'].mean(), color='red', linestyle='--', label=f"Mean: {df_clean['cleaned_length'].mean():.1f}")
axes[0, 1].legend()
axes[0, 1].grid(alpha=0.3)

# Original word count
axes[1, 0].hist(df_clean['original_word_count'], bins=50, color='lightcoral', edgecolor='black', alpha=0.7)
axes[1, 0].set_title('Original Word Count', fontsize=14, fontweight='bold')
axes[1, 0].set_xlabel('Word Count')
axes[1, 0].set_ylabel('Frequency')
axes[1, 0].axvline(df_clean['original_word_count'].mean(), color='red', linestyle='--', label=f"Mean: {df_clean['original_word_count'].mean():.1f}")
axes[1, 0].legend()
axes[1, 0].grid(alpha=0.3)

# Cleaned word count
axes[1, 1].hist(df_clean['cleaned_word_count'], bins=50, color='plum', edgecolor='black', alpha=0.7)
axes[1, 1].set_title('Cleaned Word Count', fontsize=14, fontweight='bold')
axes[1, 1].set_xlabel('Word Count')
axes[1, 1].set_ylabel('Frequency')
axes[1, 1].axvline(df_clean['cleaned_word_count'].mean(), color='red', linestyle='--', label=f"Mean: {df_clean['cleaned_word_count'].mean():.1f}")
axes[1, 1].legend()
axes[1, 1].grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 13. Remove Duplicate Tweets

Remove duplicate tweets based on cleaned text.

In [None]:
duplicates_before = df_clean.duplicated(subset=['text_lemmatized']).sum()
print(f'Duplicate tweets found: {duplicates_before:,}')

df_clean = df_clean.drop_duplicates(subset=['text_lemmatized'], keep='first').reset_index(drop=True)

print(f'Dataset after removing duplicates: {len(df_clean):,} rows')

## 14. Dataset Structure Overview

In [None]:
print('Dataset Columns:')
print('  - text_original: Original tweets')
print('  - text_cleaned: Lowercased, URLs/mentions/numbers removed')
print('  - text_no_stopwords: Cleaned + stopwords removed')
print('  - text_lemmatized: Final processed (recommended for modeling)')
print('  - text_stemmed: Alternative stemmed version')
print('\nDataset is ready for sentiment analysis modeling!')

In [None]:
print('\nSample of Final Dataset:')
df_clean[['text_original', 'text_lemmatized', 'sentiment_label']].head(10)

## 15. Save Preprocessed Data

Save the cleaned dataset for future use.

In [None]:
columns_to_save = [
    'text_original',
    'text_cleaned', 
    'text_lemmatized',
    'text_stemmed',
    'sentiment',
    'sentiment_label',
    'url_count',
    'mention_count',
    'hashtag_count',
    'hashtags',
    'user'
]

df_save = df_clean[columns_to_save].copy()

output_file = 'dataset/cleaned_tweets.csv'
df_save.to_csv(output_file, index=False, encoding='utf-8')

print(f'Preprocessed data saved to: {output_file}')
print(f'Saved {len(df_save):,} tweets')
print(f'File size: {round(df_save.memory_usage(deep=True).sum() / 1024**2, 2)} MB')

## 16. Preprocessing Summary Report

In [None]:
print('='*80)
print('DATA PREPROCESSING SUMMARY REPORT')
print('='*80)
print('\nDATA STATISTICS:')
print(f'  - Original dataset: 1,600,000 tweets')
print(f'  - After removing empty tweets: {len(df_clean):,} tweets')
print(f'  - Final dataset: {len(df_save):,} tweets')
print(f'  - Data retention: {(len(df_save)/1600000*100):.2f}%')

print('\nPREPROCESSING STEPS COMPLETED:')
print('  - Text lowercase conversion')
print('  - URL removal')
print('  - Mention (@user) removal')
print('  - Hashtag symbol removal (text kept)')
print('  - Number removal')
print('  - Punctuation removal')
print('  - Extra whitespace removal')
print('  - Stopwords removal')
print('  - Lemmatization')
print('  - Stemming (alternative)')
print('  - Duplicate removal')

print('\nFEATURE ENGINEERING:')
print(f'  - URL count: {(df_clean["url_count"] > 0).sum():,} tweets with URLs')
print(f'  - Mention count: {(df_clean["mention_count"] > 0).sum():,} tweets with mentions')
print(f'  - Hashtag count: {(df_clean["hashtag_count"] > 0).sum():,} tweets with hashtags')

print('\nSENTIMENT DISTRIBUTION:')
for label, count in df_clean['sentiment_label'].value_counts().items():
    pct = (count / len(df_clean)) * 100
    print(f'  - {label}: {count:,} ({pct:.2f}%)')

print('\nTEXT LENGTH ANALYSIS:')
print(f'  - Avg original length: {df_clean["original_length"].mean():.2f} chars')
print(f'  - Avg cleaned length: {df_clean["cleaned_length"].mean():.2f} chars')
print(f'  - Avg original words: {df_clean["original_word_count"].mean():.2f}')
print(f'  - Avg cleaned words: {df_clean["cleaned_word_count"].mean():.2f}')

print('\nOUTPUT FILES:')
print(f'  - Cleaned dataset: dataset/cleaned_tweets.csv')

print('\nPreprocessing complete! Ready for sentiment analysis.')
print('='*80)