In [4]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import joblib
import os
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides library deprecation notices.
warnings.filterwarnings('ignore')

# === LOAD TRAINING DATA ===
TRAIN_CSV_PATH = "/Users/rahulsharma/Developer/zelestra_challenge-2/dataset/train.csv"
train_data = pd.read_csv(TRAIN_CSV_PATH)

print(f"Original training data shape: {train_data.shape}")

# === STEP 1: Convert data types ===
# errors='coerce' turns any entry that cannot be parsed as a number into NaN,
# so these three columns are numeric (with gaps) from here on.
for sensor_column in ('humidity', 'wind_speed', 'pressure'):
    train_data[sensor_column] = pd.to_numeric(train_data[sensor_column], errors='coerce')

# === STEP 2: Handle missing values ===
# 2.1 Panel Age
def _fill_with_own_median(values):
    """Return *values* with its NaNs replaced by the median of *values*."""
    return values.fillna(values.median())


# Every string_id group is imputed independently, using that group's median.
panel_age_by_string = train_data.groupby('string_id')['panel_age']
train_data['panel_age'] = panel_age_by_string.transform(_fill_with_own_median)

# 2.2 Maintenance Count
# Missing counts are filled from the most specific statistic available:
#   1. median of the row's (string_id, error_code) group,
#   2. median of the row's string_id group,
#   3. median of the whole column.
# All three statistics are computed up front, from the observed values only,
# before any imputed value is written back.
error_string_median = train_data.groupby(['string_id', 'error_code'])['maintenance_count'].median()
string_id_median = train_data.groupby('string_id')['maintenance_count'].median()
overall_median = train_data['maintenance_count'].median()

# Align each lookup table with the rows of train_data. A row whose key is
# missing (e.g. NaN error_code) or absent from the table gets NaN here and
# therefore falls through to the next tier.
pair_median_per_row = train_data[['string_id', 'error_code']].join(
    error_string_median.rename('pair_median'),
    on=['string_id', 'error_code'],
)['pair_median']
string_median_per_row = train_data['string_id'].map(string_id_median)

# Assign the result back to the column: calling fillna(inplace=True) on a
# column selection is chained assignment and does not update the frame when
# pandas Copy-on-Write is active.
train_data['maintenance_count'] = (
    train_data['maintenance_count']
    .fillna(pair_median_per_row)
    .fillna(string_median_per_row)
    .fillna(overall_median)
)

# 2.3 Soiling Ratio
# Missing ratios take the median soiling_ratio of the rows that share the
# same maintenance_count bin (five equal-width bins); anything still missing
# afterwards takes the column median.
maintenance_bins = pd.cut(train_data['maintenance_count'], bins=5, include_lowest=True)
maintenance_soiling_median = train_data.groupby(maintenance_bins, observed=False)['soiling_ratio'].median()

# With observed=False the grouped result holds one entry per bin category, in
# category order, so each row's categorical code indexes it directly. This
# reuses the bin assignment pd.cut already made instead of re-testing every
# row against every interval. Code -1 marks rows that belong to no bin.
bin_codes = maintenance_bins.cat.codes.to_numpy()
median_by_bin = maintenance_soiling_median.to_numpy()
bin_median_per_row = pd.Series(
    np.where(bin_codes >= 0, median_by_bin[bin_codes], np.nan),
    index=train_data.index,
)

# Assign back rather than fillna(inplace=True) on a column selection, which
# is chained assignment and is not applied under pandas Copy-on-Write.
train_data['soiling_ratio'] = train_data['soiling_ratio'].fillna(bin_median_per_row)
# The fallback median is taken after the bin-based fill.
train_data['soiling_ratio'] = train_data['soiling_ratio'].fillna(train_data['soiling_ratio'].median())

# === STEP 3: Clean outliers and generate power ===
# Readings above the per-column ceiling are blanked out so that they are
# re-estimated by the imputer below instead of being kept.
for column, ceiling in (('voltage', 150), ('current', 10)):
    above_ceiling = train_data[column] > ceiling
    train_data.loc[above_ceiling, column] = np.nan

# Impute voltage and current jointly from their 5 nearest neighbours; skipped
# entirely when neither column has a gap.
electrical_columns = ['voltage', 'current']
if train_data[electrical_columns].isna().to_numpy().any():
    imputer = KNNImputer(n_neighbors=5)
    train_data[electrical_columns] = imputer.fit_transform(train_data[electrical_columns])

# Derived feature: product of the (now complete) current and voltage columns.
train_data['power'] = train_data['current'] * train_data['voltage']

# === STEP 4: Handle other numerical features ===
# Where module_temperature is missing but temperature is present, estimate it
# with a one-variable linear regression fitted on rows that have both values.
module_temp_missing = train_data['module_temperature'].isna()
if module_temp_missing.any():
    ambient_known = train_data['temperature'].notna()
    fit_rows = ambient_known & ~module_temp_missing
    fill_rows = ambient_known & module_temp_missing

    # The regression is only attempted with more than 10 complete pairs.
    if fit_rows.sum() > 10:
        lr = LinearRegression()
        lr.fit(
            train_data.loc[fit_rows, ['temperature']].to_numpy(),
            train_data.loc[fit_rows, 'module_temperature'],
        )

        if fill_rows.any():
            train_data.loc[fill_rows, 'module_temperature'] = lr.predict(
                train_data.loc[fill_rows, ['temperature']].to_numpy()
            )

numerical_features = ['temperature', 'irradiance', 'module_temperature', 'humidity',
                      'cloud_coverage', 'wind_speed', 'pressure']

# (lower, upper) clipping range for the features that are range-cleaned
# before imputation.
valid_ranges = {
    'irradiance': (0, 1500),
    'humidity': (0, 100),
}

# Helper columns handed to the KNN imputer together with each target feature.
related_features_by_target = {
    'temperature': ['humidity', 'module_temperature', 'irradiance', 'cloud_coverage'],
    'irradiance': ['temperature', 'module_temperature', 'humidity', 'cloud_coverage'],
    'humidity': ['temperature', 'module_temperature', 'cloud_coverage', 'wind_speed'],
    'cloud_coverage': ['temperature', 'humidity', 'irradiance', 'module_temperature'],
    'wind_speed': ['temperature', 'humidity', 'pressure', 'cloud_coverage'],
    'pressure': ['temperature', 'humidity', 'wind_speed', 'cloud_coverage'],
}
# Used for any target without an entry above (e.g. module_temperature).
default_related_features = ['temperature', 'humidity']

for feature in numerical_features:
    # Columns without gaps are left completely untouched.
    if not train_data[feature].isna().any():
        continue

    if feature in valid_ranges:
        # Exact zeros are treated as missing, then the column is clipped.
        # NOTE(review): this cleaning only runs for a column that already
        # contains NaN - confirm that gating is intended.
        lower, upper = valid_ranges[feature]
        train_data.loc[train_data[feature] == 0, feature] = np.nan
        train_data[feature] = train_data[feature].clip(lower=lower, upper=upper)

    related_features = related_features_by_target.get(feature, default_related_features)
    # The target goes last so that it is the final column of the imputer output.
    use_features = [f for f in related_features if f in train_data.columns] + [feature]

    if len(use_features) > 1:
        knn_data = train_data[use_features].apply(pd.to_numeric, errors='coerce')

        if not knn_data[feature].notna().any():
            # KNNImputer drops columns that have no observed value at fit
            # time, so the last output column would belong to a different
            # feature. Leave the column as it is instead of overwriting it.
            continue

        imputer = KNNImputer(n_neighbors=5)
        imputed_values = imputer.fit_transform(knn_data)
        train_data[feature] = imputed_values[:, -1]
    else:
        # No helper column available: fall back to the column median. Assign
        # back rather than fillna(inplace=True) on a column selection, which
        # is chained assignment and is not applied under Copy-on-Write.
        train_data[feature] = train_data[feature].fillna(train_data[feature].median())

# === STEP 5: Handle categoricals ===
# Missing categories become an explicit 'Unknown' level.
for categorical_column in ('error_code', 'installation_type'):
    train_data[categorical_column] = train_data[categorical_column].fillna('Unknown')

# === STEP 6: Generate additional features ===
# Irradiance scaled by the soiling ratio.
train_data['adjusted_irradiance'] = train_data['irradiance'].mul(train_data['soiling_ratio'])

# === STEP 7: Encode categoricals ===
string_id_map = {'A1': 0, 'B2': 1, 'C3': 2, 'D4': 3}
error_code_map = {'E00': 0, 'E01': 1, 'E02': 2, 'Unknown': 3}
installation_type_map = {'dual-axis': 0, 'fixed': 1, 'tracking': 2, 'Unknown': 3}

# (source column, code table, code used for values missing from the table)
# NOTE(review): an unmapped string_id falls back to 0, the same code as 'A1'.
encoding_plan = [
    ('string_id', string_id_map, 0),
    ('error_code', error_code_map, 3),
    ('installation_type', installation_type_map, 3),
]

for source_column, code_table, fallback_code in encoding_plan:
    encoded = train_data[source_column].map(code_table)
    train_data[f'{source_column}_encoded'] = encoded.fillna(fallback_code)

# The raw text columns are dropped once their encoded counterparts exist.
train_data = train_data.drop(columns=[source_column for source_column, _, _ in encoding_plan])

# === STEP 8: Separate label and scale ===
# 'efficiency' is the label column; it is written out below under the name
# 'target'. 'id' is kept out of the feature matrix and re-attached unscaled.
label = train_data['efficiency']
X = train_data.drop(['id', 'efficiency'], axis=1)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Create the output directory once, before anything is written into it.
output_dir = 'processed_dataset'
os.makedirs(output_dir, exist_ok=True)

# Persist the fitted scaler next to the processed data.
joblib.dump(scaler, f'{output_dir}/scaler.joblib')

# Combine and save: id first, scaled features in their original column
# order, label last.
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
X_scaled_df.insert(0, 'id', train_data['id'].values)
X_scaled_df['target'] = label.values

processed_csv_path = f'{output_dir}/train_data_processed.csv'
X_scaled_df.to_csv(processed_csv_path, index=False)

print("✅ Train data preprocessing complete!")
print(f"Final shape: {X_scaled_df.shape}")
print(f"Missing values: {X_scaled_df.isna().sum().sum()}")
print(f"Saved as '{processed_csv_path}'")


Original training data shape: (20000, 17)
✅ Train data preprocessing complete!
Final shape: (20000, 19)
Missing values: 0
Saved as 'processed_dataset/train_data_processed.csv'
