# Text Preprocessing for NLP

## Dataset:
**Twitter Sentiment Analysis (Sentiment140)**
- https://www.kaggle.com/datasets/kazanova/sentiment140

This dataset contains 1.6 million tweets with sentiment labels.

In [1]:
# Install required packages (run once if needed)
# !pip install nltk spacy
# !python -m spacy download en_core_web_sm

In [2]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /Users/prince/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/prince/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/prince/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/prince/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Load the Dataset

In [3]:
# Load the Kaggle Sentiment140 dataset
data_path = "training.1600000.processed.noemoticon.csv"

# The dataset has no header, so we specify column names
columns = ['target', 'ids', 'date', 'flag', 'user', 'text']

df = pd.read_csv(data_path, encoding='latin-1', names=columns)
print(f"Dataset shape: {df.shape}")

Dataset shape: (1600000, 6)


In [4]:
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [5]:
# Use a smaller sample for faster processing
df = df.sample(n=1000, random_state=42).reset_index(drop=True)
print(f"Sample size: {df.shape}")

Sample size: (1000, 6)


In [6]:
# Convert target: 0 -> negative, 4 -> positive
df['sentiment'] = df['target'].map({0: 'negative', 4: 'positive'})
df[['text', 'sentiment']].head(10)

Unnamed: 0,text,sentiment
0,@chrishasboobs AHHH I HOPE YOUR OK!!!,negative
1,"@misstoriblack cool , i have no tweet apps fo...",negative
2,@TiannaChaos i know just family drama. its la...,negative
3,School email won't open and I have geography ...,negative
4,upper airways problem,negative
5,Going to miss Pastor's sermon on Faith...,negative
6,on lunch....dj should come eat with me,positive
7,@piginthepoke oh why are you feeling like that?,negative
8,gahh noo!peyton needs to live!this is horrible,negative
9,@mrstessyman thank you glad you like it! There...,positive


---
# 1. Lowercase Conversion

In [7]:
print("Before:", df['text'][0])
df['text_clean'] = df['text'].str.lower()
print("After:", df['text_clean'][0])

Before: @chrishasboobs AHHH I HOPE YOUR OK!!! 
After: @chrishasboobs ahhh i hope your ok!!! 


---
# 2. Remove URLs

In [8]:
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub('', text)

# Test
test_text = "Check this https://example.com for more"
print("Before:", test_text)
print("After:", remove_urls(test_text))

df['text_clean'] = df['text_clean'].apply(remove_urls)

Before: Check this https://example.com for more
After: Check this  for more


---
# 3. Remove HTML Tags

In [9]:
def remove_html_tags(text):
    html_pattern = re.compile(r'<.*?>')
    return html_pattern.sub('', text)

# Test
test_text = "This is <b>bold</b> and <br/> new line"
print("Before:", test_text)
print("After:", remove_html_tags(test_text))

df['text_clean'] = df['text_clean'].apply(remove_html_tags)

Before: This is <b>bold</b> and <br/> new line
After: This is bold and  new line


---
# 4. Remove Twitter Elements (@mentions, #hashtags, RT)

In [10]:
def remove_twitter_elements(text):
    text = re.sub(r'@\w+', '', text)  # Remove @mentions
    text = re.sub(r'#', '', text)      # Remove # but keep word
    text = re.sub(r'\brt\b', '', text) # Remove RT
    return text

# Test
test_text = "RT @john: I love #MachineLearning @everyone"
print("Before:", test_text)
print("After:", remove_twitter_elements(test_text))

df['text_clean'] = df['text_clean'].apply(remove_twitter_elements)

Before: RT @john: I love #MachineLearning @everyone
After: RT : I love MachineLearning 


---
# 5. Remove Punctuation

In [11]:
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

# Test
test_text = "Hello! How are you? I'm great..."
print("Before:", test_text)
print("After:", remove_punctuation(test_text))

df['text_clean'] = df['text_clean'].apply(remove_punctuation)

Before: Hello! How are you? I'm great...
After: Hello How are you Im great


---
# 6. Chat Word Conversion

In [12]:
chat_words = {
    'u': 'you', 'ur': 'your', 'r': 'are', 'y': 'why',
    'pls': 'please', 'plz': 'please', 'thx': 'thanks',
    'ty': 'thank you', 'bc': 'because', 'b4': 'before',
    'gr8': 'great', 'l8r': 'later', 'w8': 'wait',
    'omg': 'oh my god', 'lol': 'laugh out loud',
    'brb': 'be right back', 'btw': 'by the way',
    'idk': 'i do not know', 'tbh': 'to be honest',
    'gonna': 'going to', 'wanna': 'want to',
    'dont': 'do not', 'cant': 'cannot', 'wont': 'will not'
}

def convert_chat_words(text):
    words = text.split()
    return ' '.join([chat_words.get(w, w) for w in words])

# Test
test_text = "u r gr8 lol thx btw"
print("Before:", test_text)
print("After:", convert_chat_words(test_text))

df['text_clean'] = df['text_clean'].apply(convert_chat_words)

Before: u r gr8 lol thx btw
After: you are great laugh out loud thanks by the way


---
# 7. Remove Emojis

In [13]:
def remove_emojis(text):
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        u"\U00002702-\U000027B0"
        "]+", flags=re.UNICODE)
    return emoji_pattern.sub('', text)

df['text_clean'] = df['text_clean'].apply(remove_emojis)

---
# 8. Remove Stopwords

In [14]:
stop_words = set(stopwords.words('english'))
print(f"Number of stopwords: {len(stop_words)}")

def remove_stopwords(text):
    words = text.split()
    return ' '.join([w for w in words if w not in stop_words])

# Test
test_text = "this is a sample sentence with stopwords"
print("Before:", test_text)
print("After:", remove_stopwords(test_text))

df['text_clean'] = df['text_clean'].apply(remove_stopwords)

Number of stopwords: 198
Before: this is a sample sentence with stopwords
After: sample sentence stopwords


---
# 9. Tokenization

In [15]:
text = "Hello world how are you"

# Method 1: split()
print("split():", text.split())

# Method 2: Regex
print("regex:", re.findall(r'\b\w+\b', text))

# Method 3: NLTK
print("NLTK:", word_tokenize(text))

split(): ['Hello', 'world', 'how', 'are', 'you']
regex: ['Hello', 'world', 'how', 'are', 'you']
NLTK: ['Hello', 'world', 'how', 'are', 'you']


In [23]:
# Method 4: spaCy (best for production)
# Run these two lines first (only once) to install spaCy:
!pip install spacy -q
!python -m spacy download en_core_web_sm -q


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m26.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m26.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2mâœ” Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [24]:
# Now use spaCy
import spacy
nlp = spacy.load('en_core_web_sm')

doc = nlp(text)
print("spaCy:", [token.text for token in doc])

spaCy: ['Hello', 'world', 'how', 'are', 'you']


---
# 10. Stemming
Reduces words to root form (fast, but may produce non-words)

In [17]:
stemmer = PorterStemmer()

words = ['running', 'runs', 'easily', 'happily']
for word in words:
    print(f"{word} -> {stemmer.stem(word)}")

running -> run
runs -> run
easily -> easili
happily -> happili


---
# 11. Lemmatization
Reduces words to dictionary form (slower, but more accurate)

In [18]:
lemmatizer = WordNetLemmatizer()

words = ['running', 'runs', 'easily', 'cats', 'better']
for word in words:
    print(f"{word} -> {lemmatizer.lemmatize(word)}")

running -> running
runs -> run
easily -> easily
cats -> cat
better -> better


In [19]:
def apply_lemmatization(text):
    words = text.split()
    return ' '.join([lemmatizer.lemmatize(w) for w in words])

df['text_clean'] = df['text_clean'].apply(apply_lemmatization)

---
# Final Results

In [20]:
# Compare original vs cleaned
for i in range(5):
    print(f"\n--- Tweet {i+1} ---")
    print(f"Original: {df['text'][i]}")
    print(f"Cleaned:  {df['text_clean'][i]}")
    print(f"Sentiment: {df['sentiment'][i]}")


--- Tweet 1 ---
Original: @chrishasboobs AHHH I HOPE YOUR OK!!! 
Cleaned:  ahhh hope ok
Sentiment: negative

--- Tweet 2 ---
Original: @misstoriblack cool , i have no tweet apps  for my razr 2
Cleaned:  cool tweet apps razr 2
Sentiment: negative

--- Tweet 3 ---
Original: @TiannaChaos i know  just family drama. its lame.hey next time u hang out with kim n u guys like have a sleepover or whatever, ill call u
Cleaned:  know family drama lamehey next time hang kim n guy like sleepover whatever ill call
Sentiment: negative

--- Tweet 4 ---
Original: School email won't open  and I have geography stuff on there to revise! *Stupid School* :'(
Cleaned:  school email open geography stuff revise stupid school
Sentiment: negative

--- Tweet 5 ---
Original: upper airways problem 
Cleaned:  upper airway problem
Sentiment: negative


In [21]:
df[['text', 'text_clean', 'sentiment']].head(10)

Unnamed: 0,text,text_clean,sentiment
0,@chrishasboobs AHHH I HOPE YOUR OK!!!,ahhh hope ok,negative
1,"@misstoriblack cool , i have no tweet apps fo...",cool tweet apps razr 2,negative
2,@TiannaChaos i know just family drama. its la...,know family drama lamehey next time hang kim n...,negative
3,School email won't open and I have geography ...,school email open geography stuff revise stupi...,negative
4,upper airways problem,upper airway problem,negative
5,Going to miss Pastor's sermon on Faith...,going miss pastor sermon faith,negative
6,on lunch....dj should come eat with me,lunchdj come eat,positive
7,@piginthepoke oh why are you feeling like that?,oh feeling like,negative
8,gahh noo!peyton needs to live!this is horrible,gahh noopeyton need livethis horrible,negative
9,@mrstessyman thank you glad you like it! There...,thank glad like product review bit site enjoy ...,positive


---
# Complete Pipeline Function

In [22]:
def preprocess_text(text):
    """Complete text preprocessing pipeline"""
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = text.split()
    words = [chat_words.get(w, w) for w in words]
    words = [w for w in words if w not in stop_words]
    words = [lemmatizer.lemmatize(w) for w in words]
    return ' '.join(words)

# Test
test = "@john OMG!! Check https://test.com #AI is gr8!!!"
print("Original:", test)
print("Cleaned:", preprocess_text(test))

Original: @john OMG!! Check https://test.com #AI is gr8!!!
Cleaned: oh my god check ai great
