# Wind Power Prediction - Complete Model Training & Export

### Full Pipeline: Data Loading → Feature Engineering → Model Training → .pkl Export

**Project:** Wind Power Prediction (Regression)

**Task:** Predict hourly wind power output from meteorological features

**Data Location:** `/content/drive/MyDrive/Suchitra/`

**Output Files:** `model.pkl`, `scaler.pkl`, `Predicted_Power_Output.csv`

## Step 0: Mount Google Drive

In [None]:
# Attach Google Drive to the Colab filesystem so the CSV inputs and the
# exported artifacts under /content/drive can be read and written.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)
print('Google Drive mounted successfully')

## Step 1: Install & Import Libraries

In [None]:
# Install the third-party packages this notebook imports. The leading `!`
# runs the line as a shell command from the notebook cell, and `-q` keeps
# pip's output quiet.
!pip install -q scikit-learn pandas numpy joblib
print('Libraries installed')

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import warnings
# Silence every warning category for the remainder of the session.
# NOTE(review): this also hides library warnings that could flag real
# problems (e.g. deprecations); consider narrowing the filter.
warnings.simplefilter('ignore')
print('All libraries imported')

## Step 2: Load Data

In [None]:
# Locations of the raw training and test CSV files on the mounted Drive.
train_path = '/content/drive/MyDrive/Suchitra/Train.csv'
test_path = '/content/drive/MyDrive/Suchitra/Test.csv'

# Read both files with pandas' default CSV settings.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path)
)

# Quick sanity check: frame dimensions plus the first training rows.
print(f'Train Shape: {train_df.shape}')
print(f'Test Shape: {test_df.shape}')
print('\nTrain Data:', train_df.head(), sep='\n')

## Step 3: Data Cleaning

In [None]:
# Remove the 'Unnamed: 0' column from BOTH frames when it is present.
# Dropping it from the training frame only would leave the test frame with
# an extra column that the fitted scaler/model has never seen.
train_df = train_df.drop(columns=['Unnamed: 0'], errors='ignore')
test_df = test_df.drop(columns=['Unnamed: 0'], errors='ignore')

# Impute numeric gaps with the TRAINING means in both frames so the test
# data goes through exactly the same preprocessing the model was trained
# with. Keys of `train_means` that are not test columns (e.g. the target)
# are ignored by fillna. Numeric columns that exist only in the test frame
# fall back to that frame's own means.
train_means = train_df.mean(numeric_only=True)
train_df = train_df.fillna(train_means)
test_df = test_df.fillna(train_means).fillna(test_df.mean(numeric_only=True))

# Remaining missing values (non-numeric columns are not imputed here).
print(f'Missing in train: {train_df.isnull().sum().sum()}')
print(f'Missing in test: {test_df.isnull().sum().sum()}')

## Step 4: Encode Categorical Features

In [None]:
# Label-encode every text column of the training frame except 'Time',
# which is parsed into datetime features in Step 5 instead.
cat_cols = train_df.select_dtypes(include=['object']).columns
encoded_cols = [col for col in cat_cols if col != 'Time']

# One fitted encoder per column, kept so the mapping can be reused or
# inspected later (a single shared encoder only remembers the last column).
label_encoders = {}

for col in encoded_cols:
    label_enc = LabelEncoder()
    train_values = train_df[col].astype(str)

    if col in test_df.columns:
        # Fit on the union of train and test values so that transforming
        # the test frame never hits an unseen label.
        test_values = test_df[col].astype(str)
        label_enc.fit(pd.concat([train_values, test_values], axis=0))
        test_df[col] = label_enc.transform(test_values)
    else:
        # Column absent from the test frame: encode the training data only.
        label_enc.fit(train_values)

    train_df[col] = label_enc.transform(train_values)
    label_encoders[col] = label_enc

# Report the number of columns actually encoded ('Time' is excluded).
print(f'Encoded {len(encoded_cols)} categorical columns')

## Step 5: Feature Engineering

In [None]:
# Build the calendar features on BOTH frames. The scaler and model are
# fitted on these columns, so the test frame must carry them as well or
# the later transform/predict calls fail on mismatched features.
# NOTE(review): assumes Test.csv has a 'Time' column in the same
# '%d-%m-%Y %H:%M' format as Train.csv - confirm.
for frame in (train_df, test_df):
    frame['Time'] = pd.to_datetime(frame['Time'], format='%d-%m-%Y %H:%M')

    frame['hour'] = frame['Time'].dt.hour
    frame['day'] = frame['Time'].dt.day
    frame['month'] = frame['Time'].dt.month
    frame['dayofweek'] = frame['Time'].dt.dayofweek
    # pandas numbers Monday as 0, so 5 and 6 are Saturday and Sunday.
    frame['is_weekend'] = frame['dayofweek'].isin([5, 6]).astype(int)

print('Temporal features created')

In [None]:
# Encode each wind-direction column as a sine/cosine pair (the values are
# converted from degrees to radians first). Applied to BOTH frames so the
# test features match the columns the scaler and model are fitted on.
for frame in (train_df, test_df):
    for col in ['WD_10m', 'WD_100m']:
        radians = np.deg2rad(frame[col])
        frame[col + '_sin'] = np.sin(radians)
        frame[col + '_cos'] = np.cos(radians)

print('Circular encoding for wind direction created')

In [None]:
# Add squared and cubed wind-speed terms. Applied to BOTH frames so the
# test features match the columns the scaler and model are fitted on.
for frame in (train_df, test_df):
    for col in ['WS_10m', 'WS_100m']:
        frame[col + '_sq'] = frame[col] ** 2
        frame[col + '_cu'] = frame[col] ** 3

print('Polynomial wind speed features created')

In [None]:
# Pairwise interaction features, built on BOTH frames so the test features
# match the columns the scaler and model are fitted on.
for frame in (train_df, test_df):
    frame['temp_humidity'] = frame['Temp_2m'] * frame['RelHum_2m']
    frame['temp_dew_diff'] = frame['Temp_2m'] - frame['DP_2m']
    frame['wind_shear'] = frame['WS_100m'] - frame['WS_10m']

# Count model inputs only: both the target ('Power') and the raw timestamp
# ('Time') are excluded before training, so neither is a feature.
n_features = train_df.drop(columns=['Power', 'Time'], errors='ignore').shape[1]

print('Interaction features created')
print(f'Total features: {n_features}')

## Step 6: Prepare Features and Target

In [None]:
# The target ('Power') and the raw timestamp ('Time') are not model inputs.
X_train = train_df.drop(columns=['Power', 'Time'])
y_train = train_df['Power']

# The test matrix must contain exactly the training features, in the same
# order. Fail early with an explicit message if any are missing instead of
# letting the scaler raise a less specific error later.
missing_features = [col for col in X_train.columns if col not in test_df.columns]
if missing_features:
    raise ValueError(
        f'Test data is missing feature columns: {missing_features}'
    )

# Selecting by the training columns also discards any extra test columns
# (such as 'Time') and fixes the column order.
X_test = test_df[X_train.columns].copy()

print(f'X_train shape: {X_train.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape}')

## Step 7: Scale Features

In [None]:
# Learn per-feature standardisation statistics from the training matrix,
# then apply the same fitted transform to the train and test matrices.
# NOTE(review): the scaler is fitted on all training rows, including the
# ones held out for validation in Step 8.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print('Features scaled successfully')

## Step 8: Train-Validation Split

In [None]:
# Hold out 20% of the scaled training rows for validation; the fixed seed
# keeps the split reproducible between runs.
split_parts = train_test_split(
    X_train_scaled,
    y_train,
    test_size=0.2,
    random_state=42,
)
X_train_split, X_val_split, y_train_split, y_val_split = split_parts

print(f'Train set: {X_train_split.shape[0]} samples')
print(f'Validation set: {X_val_split.shape[0]} samples')

## Step 9: Train Random Forest Model

In [None]:
print('Training Random Forest model...\n')

# Forest configuration: 300 unpruned trees, seeded for reproducibility,
# built on all available cores with progress logging enabled.
rf_params = {
    'n_estimators': 300,
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'random_state': 42,
    'n_jobs': -1,
    'verbose': 1,
}

# fit() returns the estimator itself, so construction and training chain.
model = RandomForestRegressor(**rf_params).fit(X_train_split, y_train_split)
print('\nModel training completed')

## Step 10: Model Validation

In [None]:
# Score the model on the held-out validation rows.
y_val_pred = model.predict(X_val_split)

mae, mse, r2 = (
    metric(y_val_split, y_val_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
# RMSE is the square root of the mean squared error.
rmse = np.sqrt(mse)

banner = '='*50
print('MODEL PERFORMANCE METRICS')
print(banner)
print(f'MAE: {mae:.6f}')
print(f'RMSE: {rmse:.6f}')
print(f'R2 Score: {r2:.6f}')
print(banner)

## Step 11: Feature Importance

In [None]:
# Pair each training column with the forest's importance score and rank
# the features from most to least important.
importance_table = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_,
})
feature_importance = importance_table.sort_values('importance', ascending=False)

top_features = feature_importance.head(10)
print('Top 10 Important Features:')
print(top_features.to_string(index=False))

## Step 12: Generate Test Predictions

In [None]:
# Predict power for every row of the scaled test matrix.
y_test_pred = model.predict(X_test_scaled)

# Preview the first predictions, then summarise their range and mean.
print('First 10 predictions:', y_test_pred[:10], sep='\n')
print(f'\nPrediction Min: {y_test_pred.min():.6f}')
print(f'Prediction Max: {y_test_pred.max():.6f}')
print(f'Prediction Mean: {y_test_pred.mean():.6f}')

## Step 13: Save Predictions CSV

In [None]:
output_path = '/content/drive/MyDrive/Suchitra/Predicted_Power_Output.csv'

# IDs are 1-based row numbers in test-set order.
row_ids = range(1, len(y_test_pred) + 1)
output_df = pd.DataFrame({'ID': row_ids, 'Predicted_Power': y_test_pred})

# Write without the DataFrame index so the file has only the two columns.
output_df.to_csv(output_path, index=False)

print(f'Predictions saved to: {output_path}')
print(output_df.head())

## Step 14: Serialize Model and Scaler (.pkl)

In [None]:
import os

# Destination files for the fitted estimator and the fitted scaler.
model_path = '/content/drive/MyDrive/Suchitra/model.pkl'
scaler_path = '/content/drive/MyDrive/Suchitra/scaler.pkl'

# Persist both objects with joblib.
joblib.dump(model, model_path)
print(f'Model saved: {model_path}')

joblib.dump(scaler, scaler_path)
print(f'Scaler saved: {scaler_path}')

# Report on-disk sizes: the model in mebibytes, the scaler in kibibytes.
bytes_per_kb = 1024
bytes_per_mb = bytes_per_kb * bytes_per_kb
model_size = os.stat(model_path).st_size / bytes_per_mb
scaler_size = os.stat(scaler_path).st_size / bytes_per_kb

print(f'\nModel size: {model_size:.2f} MB')
print(f'Scaler size: {scaler_size:.2f} KB')

## Step 15: Verify Model Loading

In [None]:
# Reload both artifacts from disk and check that they reproduce the
# in-memory results, so a broken export is caught here rather than at
# deployment time.
loaded_model = joblib.load(model_path)
loaded_scaler = joblib.load(scaler_path)

print('Model loaded successfully')

# Push the first raw test row through the RELOADED scaler and model, i.e.
# the same path a deployed consumer of the two .pkl files would follow.
sample_test = loaded_scaler.transform(X_test.iloc[:1])
sample_pred = loaded_model.predict(sample_test)

print(f'Test prediction: {sample_pred[0]:.6f}')
print(f'Original: {y_test_pred[0]:.6f}')

# Only report success when the round trip matches the original objects.
if not np.allclose(sample_test, X_test_scaled[:1], equal_nan=True):
    raise RuntimeError(
        'Reloaded scaler does not reproduce the original scaled features'
    )
if not np.isclose(sample_pred[0], y_test_pred[0]):
    raise RuntimeError(
        'Reloaded model prediction differs from the original prediction'
    )

print(f'\nModel working correctly!')

## Step 16: Project Summary

In [None]:
# Final recap of dataset sizes, model configuration, validation metrics
# and the artifacts written by the earlier steps.
print('\n' + '='*60)
print('WIND POWER PREDICTION - COMPLETE')
print('='*60)

print('\nPROJECT SUMMARY:')
print(f'Training Samples: {X_train_split.shape[0]:,}')
print(f'Validation Samples: {X_val_split.shape[0]:,}')
print(f'Test Samples: {X_test.shape[0]:,}')
print(f'Total Features: {X_train.shape[1]}')

print('\nMODEL DETAILS:')
# Read the tree count from the fitted model so the summary cannot drift
# from the configuration actually used for training.
print(f'Algorithm: Random Forest ({model.n_estimators} trees)')

print('\nPERFORMANCE METRICS:')
print(f'MAE: {mae:.6f}')
print(f'RMSE: {rmse:.6f}')
print(f'R2 Score: {r2:.6f}')

print('\nFILES SAVED:')
print(f'Model: {model_path}')
print(f'Scaler: {scaler_path}')
print(f'Predictions: {output_path}')

print('\n' + '='*60)
print('STATUS: READY FOR DEPLOYMENT')
print('='*60)