#### Import Libraries

In [1]:
import os
import pandas as pd
import numpy as np
from datasets import load_dataset
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources this notebook relies on (tokenizer models, the
# stop-word list and WordNet). 'punkt_tab' is requested alongside 'punkt'
# because recent NLTK releases load the Punkt tokenizer from 'punkt_tab';
# without it word_tokenize raises LookupError on a fresh environment.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\16476\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\16476\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\16476\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

#### Load Dataset

In [2]:
# The access token is read from the environment so it never appears in the notebook
hf_token = os.getenv("llm_project_token")

# Load the dataset, passing the token along
ds = load_dataset("stanfordnlp/imdb", token=hf_token)

# Build one pandas DataFrame per split for easier manipulation
train_df, test_df, unsupervised_df = (
    pd.DataFrame(ds[split]) for split in ('train', 'test', 'unsupervised')
)

#### Check Missing Values and Drop Duplicates

In [3]:
# Report the per-column null counts of every split
for heading, frame in (
    ("Missing values in train set:", train_df),
    ("Missing values in test set:", test_df),
    ("Missing values in unsupervised set:", unsupervised_df),
):
    print(heading, frame.isnull().sum())

# Remove fully duplicated rows from each split, modifying the frames in place
for frame in (train_df, test_df, unsupervised_df):
    frame.drop_duplicates(inplace=True)

Missing values in train set: text     0
label    0
dtype: int64
Missing values in test set: text     0
label    0
dtype: int64
Missing values in unsupervised set: text     0
label    0
dtype: int64


#### Perform Text Cleaning

In [4]:
# Shared helpers for clean_text: one lemmatizer instance and a set of
# English stop words (a set gives constant-time membership tests)
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

def clean_text(text):
    """Normalize a raw review string into space-joined, lemmatized tokens.

    Processing steps, in order:
      1. Replace HTML tags with a space.
      2. Drop every character that is not an ASCII letter or whitespace.
      3. Tokenize with NLTK's ``word_tokenize``.
      4. Discard English stop words (compared case-insensitively).
      5. Lemmatize the remaining tokens with the module-level ``lemmatizer``.

    Tokens keep their original casing; only the stop-word check lowercases.

    Args:
        text: Raw text to clean.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    # Substitute a space (not the empty string) for each tag: otherwise text
    # such as "great.<br /><br />The" collapses to "great.The" and, after the
    # punctuation is stripped below, becomes the bogus token "greatThe".
    # Extra whitespace is harmless because tokenizing and re-joining
    # normalizes it.
    text = re.sub(r'<.*?>', ' ', text)
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize
    words = word_tokenize(text)
    # Remove stopwords and lemmatize
    words = [lemmatizer.lemmatize(word) for word in words if word.lower() not in stop_words]
    return ' '.join(words)

# Add a 'clean_text' column to every split by running clean_text on 'text'
for frame in (train_df, test_df, unsupervised_df):
    frame['clean_text'] = frame['text'].apply(clean_text)

#### Save and Compress CSV

In [6]:
import gzip
import shutil

In [7]:
def save_and_compress(df, filename):
    """Write ``df`` to ``<filename>.csv.gz`` as a gzip-compressed CSV.

    The frame is streamed straight into the gzip file by pandas, so no
    intermediate ``<filename>.csv`` is created. This means an existing file
    with that name is never overwritten or deleted, and no temporary file is
    left behind if writing fails part-way.

    Args:
        df: DataFrame to save; the index is not written.
        filename: Output path without extension; ``.csv.gz`` is appended.

    Returns:
        None.
    """
    compressed_csv = filename + '.csv.gz'

    # Compression is stated explicitly rather than inferred from the suffix
    df.to_csv(compressed_csv, index=False, compression='gzip')

# Save and compress each cleaned split under its own base name
for frame, basename in (
    (train_df, 'cleaned_train'),
    (test_df, 'cleaned_test'),
    (unsupervised_df, 'cleaned_unsupervised'),
):
    save_and_compress(frame, basename)