<a href="https://colab.research.google.com/github/suganyaGaurav/LLM_llm-finetuning-eval-deploy/blob/main/LLM_data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LLM_data_preprocessing.ipynb
---

##  Goal
- Load raw IMDB dataset (50K reviews)
- Clean text (strip HTML tags, remove punctuation and digits, convert to lowercase)
- Encode sentiment labels
- Split into train/test
- Save cleaned CSVs for future use


In [4]:
# ==============================
# Step 1: Import Libraries
# ==============================

import pandas as pd
import re
from sklearn.model_selection import train_test_split

In [5]:
# ==============================
# Step 2: Load Dataset
# ==============================
# 'IMDB Dataset.csv' has been uploaded to the Google Colab /content directory; load it into a DataFrame

# Read the raw reviews CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='/content/IMDB Dataset.csv')

# Preview the first five rows (head() defaults to n=5)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
# Check the dataset's shape and per-column summary statistics

print(df.shape)
# describe must be *called*; the bare attribute only evaluates to the bound
# method object and never computes the summary.
df.describe()

(50000, 2)


In [7]:
# ==============================
# Step 3: Clean Text Function
# ==============================

def cleantext(text):
  """Normalize one raw review string.

  Steps, in order:
    1. Replace each HTML tag with a space.
    2. Lowercase the text.
    3. Drop every character that is not a-z or whitespace
       (punctuation, digits, symbols).
    4. Collapse each run of whitespace into a single space.
    5. Strip leading and trailing whitespace.

  Args:
    text: Raw review text (str).

  Returns:
    The cleaned review as a str.
  """
  # Tags become a space (not ""), so "great.<br /><br />Movie" stays two words
  # instead of fusing into "greatmovie" once punctuation is removed.
  text = re.sub(r"<.*?>", " ", text)
  text = text.lower()
  text = re.sub(r"[^a-z\s]", "", text)  # remove punctuation and numbers
  # Collapse whitespace runs left behind by the removals above.
  text = re.sub(r"\s+", " ", text)
  return text.strip()

# Clean the review text for pre-processing
df['review'] = df['review'].apply(cleantext)

# Encode the positive & negative labels as (1, 0).
# Skipped when the column already holds only 0/1, so re-running this cell does
# not turn the encoded labels into NaN (Series.map yields NaN for any value
# missing from the mapping).
label_map = {'negative': 0, 'positive': 1}
if not df['sentiment'].isin(list(label_map.values())).all():
  encoded = df['sentiment'].map(label_map)
  unmapped = df.loc[encoded.isna(), 'sentiment'].unique()
  if len(unmapped) > 0:
    # Fail loudly instead of carrying NaN labels into the train/test split.
    raise ValueError(f"Unexpected sentiment labels: {unmapped.tolist()}")
  df['sentiment'] = encoded.astype('int64')

print("Class Distribution \n", df['sentiment'].value_counts())
print("Review Text after pre-processing \n", df.head(5))

Class Distribution 
 sentiment
1    25000
0    25000
Name: count, dtype: int64
Review Text after pre-processing 
                                               review  sentiment
0  one of the other reviewers has mentioned that ...          1
1  a wonderful little production the filming tech...          1
2  i thought this was a wonderful way to spend ti...          1
3  basically theres a family where a little boy j...          0
4  petter matteis love in the time of money is a ...          1


In [8]:
# Show the first four rows of the cleaned, label-encoded frame
df.head(n=4)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production the filming tech...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically theres a family where a little boy j...,0


In [9]:
# ==============================
# Step 4: Train-Test Split
# ==============================
# 80/20 split, stratified on the label so both partitions keep the same
# class ratio; the fixed random_state makes the split repeatable.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["review"].to_numpy(),
    df["sentiment"].to_numpy(),
    test_size=0.2,
    random_state=42,
    stratify=df["sentiment"].to_numpy(),
)

print("Train size:", len(train_texts))
print("Test size:", len(test_texts))

Train size: 40000
Test size: 10000


In [10]:
# ==============================
# Step 5: Save Cleaned Data
# ==============================
# Rebuild two-column frames (review text, integer label) from the split arrays
train_df = pd.DataFrame(dict(review=train_texts, sentiment=train_labels))
test_df = pd.DataFrame(dict(review=test_texts, sentiment=test_labels))

# Write both partitions to the working directory, without the index column
train_df.to_csv(path_or_buf="imdb_train_clean.csv", index=False)
test_df.to_csv(path_or_buf="imdb_test_clean.csv", index=False)

print("Files saved: imdb_train_clean.csv, imdb_test_clean.csv")


Files saved: imdb_train_clean.csv, imdb_test_clean.csv
