# In [None]:
import pandas as pd
from datetime import datetime
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from data_loader import load_raw_data # Import from the data_loader script

# --- Initialization and Downloads ---

def setup_nltk():
    """Make sure the NLTK data used for text cleaning is installed, then
    initialise the shared lemmatizer and stopword set.

    Each required resource is looked up locally first and only downloaded
    (quietly) when the lookup fails.

    Side effects:
        Sets the module-level globals ``lemmatizer`` (a ``WordNetLemmatizer``)
        and ``stop_words`` (a ``set`` of English stopwords), which
        ``clean_text`` reads. May download NLTK data over the network.
    """
    global lemmatizer, stop_words

    # (path understood by nltk.data.find, package id understood by nltk.download)
    required_resources = (
        ('corpora/stopwords', 'stopwords'),
        ('corpora/wordnet', 'wordnet'),
        ('tokenizers/punkt', 'punkt'),
        # Newer NLTK releases load the tokenizer tables from 'punkt_tab'
        # instead of the pickled 'punkt' model; requesting both keeps
        # nltk.word_tokenize working regardless of the installed version.
        ('tokenizers/punkt_tab', 'punkt_tab'),
    )

    for resource_path, package_id in required_resources:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            # nltk.data.find reports a missing resource with LookupError.
            nltk.download(package_id, quiet=True)

    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

# --- Core NLP Cleaning Function ---
def clean_text(text):
    """Normalise one raw review into a string of space-separated lemmas.

    The text is lowercased, every character that is not ``a``-``z`` or
    whitespace is deleted, the remainder is tokenized, stopwords are dropped,
    and each surviving token is lemmatized.

    Relies on the module-level ``stop_words`` and ``lemmatizer`` objects that
    ``setup_nltk()`` creates, so that function must have run first.

    Args:
        text: The raw review text.

    Returns:
        The cleaned text, or ``""`` when ``text`` is not a ``str``.
    """
    # Anything that is not a string (NaN, None, numbers, ...) has no text.
    if not isinstance(text, str):
        return ""

    # Lowercase first so the character class below only needs a-z.
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())

    # Tokenize, skip stopwords, and lemmatize what remains in a single pass.
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(letters_only)
        if token not in stop_words
    ]

    return ' '.join(lemmas)


# --- Data Filtering and Merging ---
def extract_main_category(categories):
    """Return the top-level category from a product's ``categories`` value.

    The expected shape is a list of category paths, each path itself a list
    ordered from broadest to most specific, e.g.
    ``[['Books', 'Fiction'], ['Books', 'Thrillers']]`` -> ``'Books'``.
    A flat path such as ``['Books', 'Fiction']`` is also accepted and yields
    its first entry.

    Args:
        categories: The raw ``categories`` cell. May be missing (``None`` or
            NaN), empty, or otherwise malformed.

    Returns:
        The first element of the first category path, or ``'Unknown'`` when
        no category can be extracted.
    """
    # Missing cells arrive from pandas as NaN (a truthy float), so a plain
    # truthiness check is not enough; indexing is attempted and any failure
    # is treated as "no category".
    try:
        first_path = categories[0]
    except (TypeError, IndexError, KeyError):
        return 'Unknown'

    # Flat list of names: the first entry already is the top-level category.
    # Indexing it again would return only its first character.
    if isinstance(first_path, str):
        return first_path or 'Unknown'

    try:
        return first_path[0]
    except (TypeError, IndexError, KeyError):
        return 'Unknown'

def preprocess_and_merge_data(review_df, metadata_df, start_date=datetime(2010, 1, 1)):
    """Filter reviews by date, attach each product's main category, and clean
    the review text.

    Neither input DataFrame is modified.

    Args:
        review_df: Reviews with at least the columns ``unixReviewTime``
            (epoch seconds), ``asin``, ``reviewText`` and ``overall``.
        metadata_df: Product metadata with at least the columns ``asin`` and
            ``categories``.
        start_date: Earliest review timestamp to keep (inclusive). Should be
            timezone-naive, matching the timestamps produced from
            ``unixReviewTime``. Defaults to 1 January 2010.

    Returns:
        A new DataFrame with the columns ``reviewText``, ``rating`` (renamed
        from ``overall``), ``year``, ``main_category`` and ``cleaned_review``.
        Rows with a missing review text, rating, or category (including
        reviews whose ``asin`` has no metadata row) are dropped.
    """
    # 1. Filter reviews by date. The timestamps are kept in a separate Series
    #    so that no helper column is written into the caller's DataFrame.
    review_dates = pd.to_datetime(review_df['unixReviewTime'], unit='s')
    is_recent = review_dates >= start_date
    filtered_df = review_df.loc[is_recent].copy()
    filtered_df['year'] = review_dates.loc[is_recent].dt.year

    # 2. Reduce metadata to one (asin, main_category) row per product.
    #    Duplicate asins would otherwise duplicate every matching review in
    #    the left merge below.
    category_df = metadata_df[['asin', 'categories']].drop_duplicates(subset='asin').copy()
    category_df['main_category'] = category_df['categories'].apply(extract_main_category)
    category_df = category_df.drop(columns=['categories'])

    # 3. Attach the category to each review; reviews without metadata get
    #    NaN here and are removed in step 4.
    merged_df = pd.merge(
        filtered_df,
        category_df,
        on='asin',
        how='left'
    )

    # 4. Select final columns and drop rows with missing values.
    final_df = merged_df[['reviewText', 'overall', 'year', 'main_category']].rename(
        columns={'overall': 'rating'}
    )
    final_df = final_df.dropna(subset=['reviewText', 'main_category', 'rating'])

    # 5. Apply text cleaning. setup_nltk() must run first because clean_text
    #    depends on the globals it initialises.
    setup_nltk()
    print("Starting text cleaning...")
    final_df['cleaned_review'] = final_df['reviewText'].apply(clean_text)
    print("Cleaning complete.")

    return final_df

if __name__ == '__main__':
    # Script entry point: load the raw inputs through the data_loader helper
    # (unpacked here as a reviews object and a metadata object), run the full
    # preprocessing pipeline on them, and report the shape of the result.

    review_data, metadata_data = load_raw_data()
    final_df = preprocess_and_merge_data(review_data, metadata_data)
    print(f"Final Clean Dataset Shape: {final_df.shape}")