In [15]:

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.stem import WordNetLemmatizer


In [2]:
import pandas as pd

# Path from notebooks folder to data/raw folder
csv_path = "../data/raw/training.1600000.processed.noemoticon.csv"

try:
    dataset = pd.read_csv(csv_path, encoding='latin-1')
    print("✓ Dataset loaded successfully!")
    print(f"Shape: {dataset.shape}")
    
    # Add proper column names
    column_names = ['sentiment', 'id', 'date', 'query', 'user', 'text']
    dataset.columns = column_names
    print(dataset.head())
    
except Exception as e:
    print(f"Error: {e}")

✓ Dataset loaded successfully!
Shape: (1599999, 6)
   sentiment          id                          date     query  \
0          0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   
1          0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   
2          0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
3          0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
4          0  1467811372  Mon Apr 06 22:20:00 PDT 2009  NO_QUERY   

            user                                               text  
0  scotthamilton  is upset that he can't update his Facebook by ...  
1       mattycus  @Kenichan I dived many times for the ball. Man...  
2        ElleCTF    my whole body feels itchy and like its on fire   
3         Karoli  @nationwideclass no, it's not behaving at all....  
4       joy_wolf                      @Kwesidei not the whole crew   


In [3]:
dataset.head()

Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [4]:
dataset.shape

(1599999, 6)

In [5]:
dataset.isnull().sum()

sentiment    0
id           0
date         0
query        0
user         0
text         0
dtype: int64

In [6]:
dataset.drop_duplicates(inplace=True)

In [7]:
dataset.shape

(1599999, 6)

In [8]:
dataset.replace({'sentiment': {4:1}},inplace=True)

In [9]:
dataset['sentiment'].value_counts()

sentiment
1    800000
0    799999
Name: count, dtype: int64

In [10]:
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download required NLTK data (run this once)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)

def preprocess(text):
    """
    Preprocesses text by:
    1. Converting to lowercase
    2. Removing non-alphabetic characters
    3. Tokenizing into words
    4. Removing stopwords
    5. Lemmatizing words
    6. Joining back into a string
    """
    try:
        # Handle missing/null values
        if pd.isna(text) or text is None:
            return ""
        
        # Convert to string if not already
        text = str(text)
        
        # Initialize lemmatizer and get stopwords
        word_net = WordNetLemmatizer()
        stop_words = set(stopwords.words('english'))
        
        # Convert to lowercase
        text = text.lower()
        
        # Remove non-alphabetic characters (keep only letters and spaces)
        text = re.sub(r'[^a-zA-Z\s]', ' ', text)
        
        # Split into words
        words = text.split()
        
        # Remove stopwords and lemmatize
        processed_words = [
            word_net.lemmatize(word) 
            for word in words 
            if word not in stop_words and len(word) > 2  # Also remove very short words
        ]
        
        # Join back into string
        processed_text = ' '.join(processed_words)
        
        return processed_text
        
    except Exception as e:
        print(f"Error processing text: {text[:50]}... Error: {e}")
        return ""

# Now apply the preprocessing
dataset['preprocessed_text'] = dataset['text'].apply(preprocess)

# Check the results
print("Original vs Preprocessed text (first 5 rows):")
print(dataset[['text', 'preprocessed_text']].head())

Original vs Preprocessed text (first 5 rows):
                                                text  \
0  is upset that he can't update his Facebook by ...   
1  @Kenichan I dived many times for the ball. Man...   
2    my whole body feels itchy and like its on fire    
3  @nationwideclass no, it's not behaving at all....   
4                      @Kwesidei not the whole crew    

                                   preprocessed_text  
0  upset update facebook texting might cry result...  
1  kenichan dived many time ball managed save res...  
2                    whole body feel itchy like fire  
3                   nationwideclass behaving mad see  
4                                kwesidei whole crew  


In [14]:
import os

# Create the directory if it doesn't exist
os.makedirs("notebooks/preprocess", exist_ok=True)

# Now save the file
dataset.to_csv("notebooks/preprocess/imdb_clean.csv", index=False)
print("✅ Preprocessed data saved!")

✅ Preprocessed data saved!
