# In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from data_preprocessing import preprocess_and_merge_data # For demonstration

# --- Target Mapping Function ---
def map_rating_to_sentiment(rating):
    """Translate a star rating into a sentiment label.

    A rating of 4 or more yields 'Positive', a rating equal to 3 yields
    'Neutral', and any other value yields 'Negative'.
    """
    if rating >= 4:
        return 'Positive'
    # Only an exact 3 is neutral; everything else falls through to negative.
    return 'Neutral' if rating == 3 else 'Negative'

def perform_feature_engineering(final_df, *, max_features=10000, min_df=5):
    """Vectorize the cleaned reviews with TF-IDF and build both target series.

    Args:
        final_df: DataFrame providing the 'cleaned_review', 'rating' and
            'main_category' columns.
        max_features: Vocabulary size cap handed to ``TfidfVectorizer``.
            Defaults to 10000.
        min_df: Minimum document frequency handed to ``TfidfVectorizer``.
            Defaults to 5; lower it for small corpora.

    Returns:
        A tuple ``(X, y_sentiment, y_category, vectorizer)`` where ``X`` is
        the fitted TF-IDF feature matrix, ``y_sentiment`` is the 'rating'
        column mapped through ``map_rating_to_sentiment``, ``y_category`` is
        the 'main_category' column, and ``vectorizer`` is the fitted
        ``TfidfVectorizer``.
    """
    # 1. Initialize and fit the TF-IDF vectorizer.
    vectorizer = TfidfVectorizer(max_features=max_features, min_df=min_df)

    # Missing reviews (NaN/None) are not usable text documents; vectorize them
    # as empty strings so a single blank row cannot abort the whole run.
    # fillna returns a new Series, so final_df itself is left untouched.
    reviews = final_df['cleaned_review'].fillna('')

    print("Starting TF-IDF Vectorization...")
    X = vectorizer.fit_transform(reviews)
    print("TF-IDF Vectorization complete.")
    print(f"Shape of Feature Matrix (X): {X.shape}")

    # 2. Prepare the target variables.
    y_sentiment = final_df['rating'].apply(map_rating_to_sentiment)
    y_category = final_df['main_category']  # For the MLP model

    return X, y_sentiment, y_category, vectorizer

def split_data(X, y_sentiment, y_category):
    """Create two stratified train/test splits of the same feature matrix.

    The first split is stratified on ``y_sentiment`` (sentiment
    classification); the second is stratified on ``y_category`` (category
    prediction). Both hold out 20% of the rows and use ``random_state=42``.
    The two splits are computed independently of each other.

    Returns:
        An 8-tuple ``(X_train, X_test, y_train, y_test, X_train_pred,
        X_test_pred, y_train_cat, y_test_cat)``: the four sentiment-split
        pieces followed by the four category-split pieces.
    """
    # Settings shared by both splits.
    shared_options = {'test_size': 0.2, 'random_state': 42}

    # Sentiment classification split (Logistic Regression, RF, SVM).
    sentiment_parts = train_test_split(
        X, y_sentiment, stratify=y_sentiment, **shared_options
    )

    # Category prediction split (supplies X_train_pred / X_test_pred).
    category_parts = train_test_split(
        X, y_category, stratify=y_category, **shared_options
    )

    # Only the sentiment training partition's row count is reported.
    print(f"Training set size: {sentiment_parts[0].shape[0]}")
    return (*sentiment_parts, *category_parts)

if __name__ == '__main__':
    # Script entry point. No script-mode behaviour is defined yet; the guard
    # only ensures that importing this module has no side effects.
    pass