# In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os

# Preprocessing pipeline: read the real and synthetic user-profile CSVs,
# merge them, split the result into train/test sets, standardize the feature
# columns, and write each split (features + rating) back out as a CSV file.

# File locations; relative paths resolve against the current working directory.
raw_data_path = '../data/raw/real_user_profiles.csv'
synthetic_data_path = '../data/processed/synthetic_user_profiles.csv'
processed_train_data_path = '../data/processed/train_user_profiles.csv'
processed_test_data_path = '../data/processed/test_user_profiles.csv'


def _preview(title, data):
    """Print *title* followed by the first rows of *data* (via ``head()``)."""
    print(title)
    print(data.head())


# Make sure the output directories are present before anything is written.
for _output_path in (processed_train_data_path, processed_test_data_path):
    os.makedirs(os.path.dirname(_output_path), exist_ok=True)

# Real user profiles.
print("Loading raw data...")
raw_data = pd.read_csv(raw_data_path)
_preview("Raw data preview:", raw_data)

# Synthetic profiles are not generated here; they are read from an existing CSV.
print("Loading synthetic data...")
synthetic_data = pd.read_csv(synthetic_data_path)
_preview("Synthetic data preview:", synthetic_data)

# Stack both sources row-wise and renumber the index from zero.
print("Combining raw and synthetic data...")
combined_data = pd.concat([raw_data, synthetic_data], ignore_index=True)
_preview("Combined data preview:", combined_data)

# Every column except the two id columns and the rating is used as a feature;
# the rating column is the prediction target.
print("Defining features and target...")
_non_feature_columns = ['user_id', 'item_id', 'rating']
features = combined_data.drop(columns=_non_feature_columns)
target = combined_data['rating']
_preview("Features preview:", features)
_preview("Target preview:", target)

# 80/20 train/test split with a fixed seed so the split is reproducible.
print("Splitting data into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Report the size of each piece of the split.
print(f"Training features shape: {X_train.shape}")
print(f"Testing features shape: {X_test.shape}")
print(f"Training target shape: {y_train.shape}")
print(f"Testing target shape: {y_test.shape}")

# The scaler is fitted on the training split only and then applied unchanged
# to the test split, so no test-set statistics influence the scaling.
print("Standardizing features...")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Append the target as the last column of each scaled feature matrix.
print("Combining standardized features with target...")
train_data = np.column_stack((X_train_scaled, y_train.values))
test_data = np.column_stack((X_test_scaled, y_test.values))

# Re-attach column names: the original feature names followed by 'rating'.
print("Converting to DataFrame...")
_output_columns = [*features.columns, 'rating']
train_df = pd.DataFrame(train_data, columns=_output_columns)
test_df = pd.DataFrame(test_data, columns=_output_columns)

# Write both splits to disk without the DataFrame index.
print(f"Saving processed training data to {processed_train_data_path}...")
train_df.to_csv(processed_train_data_path, index=False)
print(f"Saving processed testing data to {processed_test_data_path}...")
test_df.to_csv(processed_test_data_path, index=False)

print("Data preprocessing complete.")
