In [14]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression
import joblib
import warnings
import os

# Suppress every warning category for the rest of the run.
warnings.filterwarnings('ignore')

# === Load test data ===
# Read the raw test split into a DataFrame and report its (rows, columns) shape.
test_data = pd.read_csv("/Users/rahulsharma/Developer/zelestra_challenge-2/dataset/test.csv")
print(f"Original test data shape: {test_data.shape}")

# === Step 1: Convert types ===
# Force these columns to a numeric dtype; any entry that cannot be parsed
# as a number becomes NaN (errors='coerce') and is imputed in later steps.
for column in ('humidity', 'wind_speed', 'pressure'):
    test_data[column] = pd.to_numeric(test_data[column], errors='coerce')

# === Step 2: Handle missing ===
# panel_age: fill only the missing entries with the median panel_age of the
# row's string_id group; observed values are left exactly as loaded.
test_data['panel_age'] = test_data['panel_age'].fillna(
    test_data.groupby('string_id')['panel_age'].transform('median')
)

# maintenance_count: fill gaps from the most specific statistic available,
# falling back step by step:
#   1. median of the row's (string_id, error_code) group
#   2. median of the row's string_id group
#   3. median of the whole column
# All three medians are computed from the values as loaded, before any
# filling, so an earlier fallback never influences a later one.
maintenance_raw = test_data['maintenance_count']
pair_median_per_row = test_data.groupby(['string_id', 'error_code'])['maintenance_count'].transform('median')
string_median_per_row = test_data.groupby('string_id')['maintenance_count'].transform('median')
overall_median = maintenance_raw.median()

# Assign the result back to the column instead of calling
# fillna(inplace=True) on a column selection: the in-place form acts on an
# intermediate object and is not guaranteed to update the DataFrame.
test_data['maintenance_count'] = (
    maintenance_raw
    .fillna(pair_median_per_row)
    .fillna(string_median_per_row)
    .fillna(overall_median)
)

# soiling_ratio: rows with a similar maintenance_count share a fill value.
# Split maintenance_count into 5 equal-width bins and compute, for every
# row, the median soiling_ratio of the bin that row was assigned to.
maintenance_bins = pd.cut(test_data['maintenance_count'], bins=5, include_lowest=True)
bin_soiling_median_per_row = (
    test_data.groupby(maintenance_bins, observed=False)['soiling_ratio'].transform('median')
)

# Use the bin assignment produced by pd.cut directly rather than re-deriving
# membership by comparing each value against the interval edges.
test_data['soiling_ratio'] = test_data['soiling_ratio'].fillna(bin_soiling_median_per_row)

# Anything still missing (no bin, or a bin without any observed soiling_ratio)
# gets the column median, taken after the bin-level fill above.
# Assigned back rather than fillna(inplace=True) on a column selection, which
# is not guaranteed to update the DataFrame.
test_data['soiling_ratio'] = test_data['soiling_ratio'].fillna(test_data['soiling_ratio'].median())

# === Step 3: Outliers + power ===
# Readings strictly above the per-column ceiling are blanked out so that
# they are re-estimated by the imputer below.
outlier_ceilings = {'voltage': 150, 'current': 10}
for column, ceiling in outlier_ceilings.items():
    test_data.loc[test_data[column] > ceiling, column] = np.nan

# Fill every gap in the two electrical columns (blanked above or missing in
# the input) with a 5-nearest-neighbour imputer fitted on those two columns.
electrical_columns = list(outlier_ceilings)
if test_data[electrical_columns].isna().to_numpy().any():
    imputer = KNNImputer(n_neighbors=5)
    test_data[electrical_columns] = imputer.fit_transform(test_data[electrical_columns])

# Derived feature: element-wise product of voltage and current.
test_data['power'] = test_data['voltage'] * test_data['current']

# === Step 4: Module temperature prediction ===
# Estimate missing module_temperature values from temperature with a
# one-feature linear regression fitted on the rows where both are present.
module_temp_missing = test_data['module_temperature'].isna()
if module_temp_missing.any():
    ambient_known = test_data['temperature'].notna()
    training_rows = ambient_known & ~module_temp_missing

    # Skip the fit entirely unless more than 10 complete rows are available.
    if training_rows.sum() > 10:
        temp_model = LinearRegression()
        temp_model.fit(
            test_data.loc[training_rows, ['temperature']].to_numpy(),
            test_data.loc[training_rows, 'module_temperature'],
        )

        # Only rows with a known temperature can be predicted; the rest stay
        # NaN here and are handled by the KNN imputation step.
        rows_to_predict = module_temp_missing & ambient_known
        if rows_to_predict.any():
            predicted = temp_model.predict(
                test_data.loc[rows_to_predict, ['temperature']].to_numpy()
            )
            test_data.loc[rows_to_predict, 'module_temperature'] = predicted

# === Step 5: KNN imputation for other numerical columns ===
numerical_features = ['temperature', 'irradiance', 'module_temperature', 'humidity',
                      'cloud_coverage', 'wind_speed', 'pressure']

# Helper columns used as neighbours when imputing each target column.
# Defined once here instead of being rebuilt on every loop iteration.
related_features_by_target = {
    'temperature': ['humidity', 'module_temperature', 'irradiance', 'cloud_coverage'],
    'irradiance': ['temperature', 'module_temperature', 'humidity', 'cloud_coverage'],
    'humidity': ['temperature', 'module_temperature', 'cloud_coverage', 'wind_speed'],
    'cloud_coverage': ['temperature', 'humidity', 'irradiance', 'module_temperature'],
    'wind_speed': ['temperature', 'humidity', 'pressure', 'cloud_coverage'],
    'pressure': ['temperature', 'humidity', 'wind_speed', 'cloud_coverage'],
}
# Targets without an entry above (module_temperature) use this pair.
default_related_features = ['temperature', 'humidity']

for feature in numerical_features:
    # Columns that are already complete are left untouched.
    if not test_data[feature].isna().any():
        continue

    # Treat exact zeros as missing and clamp to the accepted range.
    # NOTE(review): this cleaning only runs when the column already contains
    # NaN; a column with zeros or out-of-range values but no NaN is skipped.
    # Confirm this matches the preprocessing applied to the training data.
    if feature == 'irradiance':
        test_data.loc[test_data[feature] == 0, feature] = np.nan
        test_data[feature] = test_data[feature].clip(0, 1500)
    elif feature == 'humidity':
        test_data.loc[test_data[feature] == 0, feature] = np.nan
        test_data[feature] = test_data[feature].clip(0, 100)

    related_features = related_features_by_target.get(feature, default_related_features)
    # The target goes last so its imputed values are the last output column.
    use_features = [f for f in related_features if f in test_data.columns] + [feature]
    knn_data = test_data[use_features].apply(pd.to_numeric, errors='coerce')

    # KNNImputer drops columns that have no observed value at fit time, so
    # with an all-missing target the last output column would belong to a
    # different feature. Only take the KNN path when the target has data.
    if len(use_features) > 1 and knn_data[feature].notna().any():
        imputer = KNNImputer(n_neighbors=5)
        imputed_values = imputer.fit_transform(knn_data)
        test_data[feature] = imputed_values[:, -1]
    else:
        # Assigned back rather than fillna(inplace=True) on a column
        # selection, which is not guaranteed to update the DataFrame.
        test_data[feature] = test_data[feature].fillna(test_data[feature].median())

# === Step 6: Handle categoricals ===
# Give missing categorical entries an explicit 'Unknown' label.
for column in ('error_code', 'installation_type'):
    test_data[column] = test_data[column].fillna('Unknown')

# === Step 7: New feature ===
# adjusted_irradiance is the element-wise product of irradiance and soiling_ratio.
test_data['adjusted_irradiance'] = test_data['irradiance'].mul(test_data['soiling_ratio'])

# === Step 8: Encode categoricals ===
# Fixed label -> integer code tables for the three categorical columns.
string_id_map = {'A1': 0, 'B2': 1, 'C3': 2, 'D4': 3}
error_code_map = {'E00': 0, 'E01': 1, 'E02': 2, 'Unknown': 3}
installation_type_map = {'dual-axis': 0, 'fixed': 1, 'tracking': 2, 'Unknown': 3}

# (source column, lookup table, code used for labels absent from the table)
encoding_plan = [
    ('string_id', string_id_map, 0),
    ('error_code', error_code_map, 3),
    ('installation_type', installation_type_map, 3),
]

# Add one '<column>_encoded' column per categorical column.
for source_column, lookup, fallback_code in encoding_plan:
    encoded = test_data[source_column].map(lookup)
    test_data[f'{source_column}_encoded'] = encoded.fillna(fallback_code)

# The raw label columns are no longer needed once encoded.
test_data = test_data.drop(columns=[source_column for source_column, _, _ in encoding_plan])

# === Step 9: Scale using saved scaler ===
# NOTE: joblib.load unpickles the file, so only load artifacts you trust.
scaler = joblib.load('/Users/rahulsharma/Developer/zelestra_challenge-2/processed_dataset/scaler.joblib')

# Every column except the row identifier is a model input. Select the
# columns in the order recorded on the scaler (feature_names_in_); a column
# missing from the test frame raises KeyError here.
feature_order = scaler.feature_names_in_
X_test = test_data.drop(columns='id').loc[:, feature_order]

X_test_scaled = scaler.transform(X_test)

# Combine and Save
# Wrap the scaled matrix in a DataFrame with the scaler's column names and
# put the row identifier back as the first column (matched by position).
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=scaler.feature_names_in_)
X_test_scaled_df.insert(0, 'id', test_data['id'].to_numpy())

# Write the result under ./processed_dataset (relative to the working
# directory), creating the folder if it does not exist yet.
output_dir = 'processed_dataset'
output_path = 'processed_dataset/test_data_processed.csv'
os.makedirs(output_dir, exist_ok=True)
X_test_scaled_df.to_csv(output_path, index=False)

# Short run summary: output shape and total count of remaining NaN cells.
summary_lines = (
    "✅ Test data preprocessing complete!",
    f"Final shape: {X_test_scaled_df.shape}",
    f"Missing values: {X_test_scaled_df.isna().sum().sum()}",
    "Saved as 'processed_dataset/test_data_processed.csv'",
)
for line in summary_lines:
    print(line)


Original test data shape: (12000, 16)
✅ Test data preprocessing complete!
Final shape: (12000, 18)
Missing values: 0
Saved as 'processed_dataset/test_data_processed.csv'
