In [1]:
# Import necessary libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used below: the stop-word list, the tokenizer
# models, and WordNet for lemmatization (re-running is harmless)
for nltk_package in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_package)

# Read the processed Sentiment140 CSV into a DataFrame
df = pd.read_csv('../data/processed/processed_sentiment140.csv')

# Preview the top rows of the loaded data
print(df.head())

# Function to clean text
def clean_text(text):
    """Normalize a raw tweet into lowercase text of letters and whitespace.

    The steps run in this order:

    1. Remove URLs (any run of non-whitespace starting with ``http``).
    2. Remove @mentions.
    3. Remove #hashtags (the whole tag, including its word).
    4. Remove every character that is not an ASCII letter or whitespace.
    5. Lowercase the result.

    Whitespace is not collapsed, so removed tokens can leave runs of spaces.

    Args:
        text: The raw tweet text.

    Returns:
        The cleaned string. Non-string input (for example the float NaN or
        None that stands in for a missing cell) yields an empty string.
    """
    # re.sub raises TypeError on non-strings; a single missing value would
    # otherwise abort a whole Series.apply(clean_text) call.
    if not isinstance(text, str):
        return ''

    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags
    text = re.sub(r'#\w+', '', text)
    # Remove special characters and numbers
    text = re.sub(r'[^A-Za-z\s]', '', text)
    # Convert to lowercase
    return text.lower()

# Limit the dataset size (adjust as needed).
# Sampling is done *before* the expensive cleaning/tokenizing/lemmatizing
# passes: which rows are drawn depends only on the row count and random_state,
# so the result matches sampling afterwards while only the kept rows are
# processed. Capping n at len(df) avoids the ValueError that df.sample raises
# when asked for more rows than exist.
sample_size = min(100000, len(df))
df = df.sample(n=sample_size, random_state=42)

# Apply the cleaning function to the text column
df['cleaned_text'] = df['text'].apply(clean_text)

# Tokenize the cleaned text
df['tokens'] = df['cleaned_text'].apply(word_tokenize)

# Remove stop words (a set gives O(1) membership checks per token)
stop_words = set(stopwords.words('english'))
df['tokens'] = df['tokens'].apply(lambda tokens: [word for word in tokens if word not in stop_words])

# Lemmatize the tokens
lemmatizer = WordNetLemmatizer()
df['tokens'] = df['tokens'].apply(lambda tokens: [lemmatizer.lemmatize(word) for word in tokens])

# Join tokens into a single space-separated string so the column
# round-trips through CSV as plain text
df['tokens'] = df['tokens'].apply(' '.join)

# Save the cleaned and tokenized data
output_path = '../data/processed/cleaned_tokenized_sentiment140.csv'
df.to_csv(output_path, index=False)
print(f"Cleaned and tokenized data saved to {output_path}")

# Display basic information about the processed dataset.
# info() writes its summary itself and returns None, so it is called directly
# rather than wrapped in print(), which would emit a stray "None".
df.info()

# Display the first few rows of the processed dataset
print(df.head())

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/reetmitra/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/reetmitra/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/reetmitra/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


   target         ids                          date      flag  \
0       0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY   
1       0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   
2       0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   
3       0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
4       0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   

              user                                               text  
0  _TheSpecialOne_  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1    scotthamilton  is upset that he can't update his Facebook by ...  
2         mattycus  @Kenichan I dived many times for the ball. Man...  
3          ElleCTF    my whole body feels itchy and like its on fire   
4           Karoli  @nationwideclass no, it's not behaving at all....  
Cleaned and tokenized data saved to ../data/processed/cleaned_tokenized_sentiment140.csv
<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 541200 to 429504
Data colum