# Imports

In [1]:
import time
import gc
gc.enable()

import warnings
warnings.filterwarnings('ignore')

from scipy.stats import mode
import numpy as np
import pandas as pd
pd.set_option("precision", 4)

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style="darkgrid")

SEED = 2311

In [2]:
%%capture
!pip install scikit-learn==1.0.2
!pip install scikit-learn-intelex

In [3]:
#-----scikit-learn imports-----#
from sklearnex import patch_sklearn
patch_sklearn()

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [4]:
%%capture
!pip install xgboost==1.5.0

In [5]:
from xgboost import XGBRegressor

# Data summary

[Data source and description](https://machinehack.com/hackathon/wipro_sustainability_machine_learning_challenge/data)

In [6]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [7]:
DATA_DIR = '/content/drive/MyDrive/Colab/MH_Wipro_Sustainaility_MLC/data/'

train = pd.read_csv(DATA_DIR + 'train.csv')

test = pd.read_csv(DATA_DIR + 'test.csv')

In [8]:
train.head()

Unnamed: 0,Year,Month,Day,Hour,Minute,Clearsky DHI,Clearsky DNI,Clearsky GHI,Cloud Type,Dew Point,Temperature,Pressure,Relative Humidity,Solar Zenith Angle,Precipitable Water,Wind Direction,Wind Speed,Fill Flag
0,2009,1,1,0,0,0,0,0,0,0.0,5.0,1010,75.34,106.15,0.499,346.1,3.1,0
1,2009,1,1,0,30,0,0,0,0,1.0,5.0,1010,80.81,112.28,0.49,346.1,3.1,0
2,2009,1,1,1,0,0,0,0,4,0.0,5.0,1010,78.27,118.5,0.482,347.9,3.2,0
3,2009,1,1,1,30,0,0,0,4,0.0,4.0,1010,78.27,124.78,0.478,347.9,3.1,0
4,2009,1,1,2,0,0,0,0,4,0.0,4.0,1010,76.45,131.12,0.475,350.0,3.0,0


In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175296 entries, 0 to 175295
Data columns (total 18 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Year                175296 non-null  int64  
 1   Month               175296 non-null  int64  
 2   Day                 175296 non-null  int64  
 3   Hour                175296 non-null  int64  
 4   Minute              175296 non-null  int64  
 5   Clearsky DHI        175296 non-null  int64  
 6   Clearsky DNI        175296 non-null  int64  
 7   Clearsky GHI        175296 non-null  int64  
 8   Cloud Type          175296 non-null  int64  
 9   Dew Point           175296 non-null  float64
 10  Temperature         175296 non-null  float64
 11  Pressure            175296 non-null  int64  
 12  Relative Humidity   175296 non-null  float64
 13  Solar Zenith Angle  175296 non-null  float64
 14  Precipitable Water  175296 non-null  float64
 15  Wind Direction      175296 non-nul

In [10]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17520 entries, 0 to 17519
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Year                17520 non-null  int64  
 1   Month               17520 non-null  int64  
 2   Day                 17520 non-null  int64  
 3   Hour                17520 non-null  int64  
 4   Minute              17520 non-null  int64  
 5   Cloud Type          17520 non-null  int64  
 6   Dew Point           17520 non-null  float64
 7   Temperature         17520 non-null  float64
 8   Pressure            17520 non-null  int64  
 9   Relative Humidity   17520 non-null  float64
 10  Solar Zenith Angle  17520 non-null  float64
 11  Precipitable Water  17520 non-null  float64
 12  Wind Direction      17520 non-null  int64  
 13  Wind Speed          17520 non-null  float64
 14  Fill Flag           17520 non-null  int64  
 15  Clearsky DHI        0 non-null      float64
 16  Clea

In [11]:
TASK = 'reg' #clf->classification, reg->regression

TARGET = ['Clearsky DHI', 'Clearsky DNI', 'Clearsky GHI']

features = [f for f in train.columns if f not in TARGET]

cat_features = ['Cloud Type', 'Fill Flag']
num_features = [f for f in features if f not in cat_features]

# Baseline

In [12]:
train[cat_features] = train[cat_features].astype('category')
test[cat_features] = test[cat_features].astype('category')

In [13]:
xtrain, xval, ytrain, yval = train_test_split(
    train[features], 
    train[TARGET],
    test_size=0.2, 
    shuffle=True, 
    random_state=SEED
)

In [43]:
baseline = XGBRegressor(
    n_estimators=25000,
    learning_rate=0.3,
    objective='reg:squarederror',
    eval_metric='rmse',
    booster='gbtree',
    tree_method='gpu_hist',
    enable_categorical=True,
    random_state=SEED
)

Clearsky DHI (TARGET[0])

In [44]:
baseline.fit(
    xtrain, ytrain[0],
    eval_set=[(xval, yval[0])],
    early_stopping_rounds=200,
    verbose=2000,
)

[0]	validation_0-rmse:59.30350
[2000]	validation_0-rmse:7.28230
[4000]	validation_0-rmse:7.00427
[6000]	validation_0-rmse:6.92911
[8000]	validation_0-rmse:6.90646
[10000]	validation_0-rmse:6.89510
[12000]	validation_0-rmse:6.88968
[14000]	validation_0-rmse:6.88649
[16000]	validation_0-rmse:6.88482
[18000]	validation_0-rmse:6.88376
[20000]	validation_0-rmse:6.88337
[21259]	validation_0-rmse:6.88312


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=True,
             eval_metric='rmse', gamma=0, gpu_id=0, importance_type=None,
             interaction_constraints='', learning_rate=0.3, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=25000, n_jobs=2,
             num_parallel_tree=1, predictor='auto', random_state=2311,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='gpu_hist', validate_parameters=1, verbosity=None)

In [45]:
baseline.best_score, baseline.best_iteration

(6.883105, 21059)

In [46]:
pred0 = baseline.predict(test[features], iteration_range=(0, baseline.best_iteration))

Clearsky DNI (TARGET[1])

In [47]:
baseline.fit(
    xtrain, ytrain[1],
    eval_set=[(xval, yval[1])],
    early_stopping_rounds=200,
    verbose=2000,
)

[0]	validation_0-rmse:348.96506
[2000]	validation_0-rmse:21.28405
[4000]	validation_0-rmse:20.49628
[6000]	validation_0-rmse:20.29828
[8000]	validation_0-rmse:20.21743
[10000]	validation_0-rmse:20.18126
[12000]	validation_0-rmse:20.16437
[14000]	validation_0-rmse:20.15569
[16000]	validation_0-rmse:20.15108
[17543]	validation_0-rmse:20.14881


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=True,
             eval_metric='rmse', gamma=0, gpu_id=0, importance_type=None,
             interaction_constraints='', learning_rate=0.3, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=25000, n_jobs=2,
             num_parallel_tree=1, predictor='auto', random_state=2311,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='gpu_hist', validate_parameters=1, verbosity=None)

In [48]:
baseline.best_score, baseline.best_iteration

(20.148712, 17343)

In [49]:
pred1 = baseline.predict(test[features], iteration_range=(0, baseline.best_iteration))

Clearsky GHI (TARGET[2])

In [50]:
baseline.fit(
    xtrain, ytrain[2],
    eval_set=[(xval, yval[2])],
    early_stopping_rounds=200,
    verbose=2000,
)

[0]	validation_0-rmse:290.20300
[2000]	validation_0-rmse:3.85514
[4000]	validation_0-rmse:3.74012
[6000]	validation_0-rmse:3.71322
[8000]	validation_0-rmse:3.70136
[10000]	validation_0-rmse:3.69632
[12000]	validation_0-rmse:3.69397
[13677]	validation_0-rmse:3.69312


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=True,
             eval_metric='rmse', gamma=0, gpu_id=0, importance_type=None,
             interaction_constraints='', learning_rate=0.3, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=25000, n_jobs=2,
             num_parallel_tree=1, predictor='auto', random_state=2311,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='gpu_hist', validate_parameters=1, verbosity=None)

In [51]:
baseline.best_score, baseline.best_iteration

(3.69309, 13477)

In [52]:
pred2 = baseline.predict(test[features], iteration_range=(0, baseline.best_iteration))

# Feature Engineering

### Feature importances

### Feature selection

### Feature extraction / Dimensionality reduction

# Modelling

### Hyperparameter tuning

In [None]:
def objective(trial, xtrain, xval, ytrain, yval, params):
    

In [None]:
study = optuna.create_study(
    direction='-----', 
    sampler=TPESampler(), 
    study_name='-----'
)

In [None]:
xtrain, xval, ytrain, yval = train_test_split(
    train[features], 
    train[TARGET],
    test_size='-----', 
    shuffle=True, 
    stratify=train[TARGET],
    random_state=SEED
)

### Creating folds

In [None]:
N_SPLITS = 5

def create_folds(df):
    if TASK == 'clf':
        skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
        for fold, (_, val_idx) in enumerate(skf.split(X=df, y=df[TARGET])):
            df.loc[val_idx, 'fold'] = fold
    else:
        kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED):
        for fold, (_, val_idx) in enumerate(kf.split(X=df)):
            df.loc[val_idx, 'fold'] = fold

    return df

### Cross-validation + Inference

In [None]:
def fit_model(model_name, params, xtrain, xval, features):
    if model_name == '-----':
        pass
    elif model_name == '-----':
        pass
    else:
        print('Invalid model name!')
        return

    return model

In [None]:
def evaluate_model(model_name, params, train, test, features):
    oof_preds = {}  #out-of-fold predictions
    test_preds = []
    cv_scores = []

    fold_col = '-----'   #5folds, 10folds, etc.
    for fold in range(N_SPLITS):
        xtrain = train[train[fold_col] != fold].reset_index(drop=True)
        ytrain = xtrain[TARGET]

        xval = train[train[fold_col] == fold].reset_index(drop=True)
        yval = xval[TARGET]
        val_idx = xval.index.to_list()

        fold_start = time.time()

        model = fit_model(model_name, params, xtrain, xval, features)

        val_preds = model.predict(xval[features])
        oof_preds.update(dict(zip(val_idx, val_preds)))

        score = '-----'
        cv_scores.append(score)

        fold_end = time.time()

        print(f'Fold #{fold}: Accuracy = {acc:.5f} \
        [Time: {fold_end - fold_start:.2f}s]')
        
        test_preds.append(model.predict(test[features]))
        
    cv_end = time.time()

    print(f'Average score = {np.mean(scores):.5f} \
    with std. dev. = {np.std(scores):.5f}')
    print(f'[Total time: {cv_end - cv_start:.2f}s]')

    oof_preds = pd.DataFrame.from_dict(oof_preds, orient='index').reset_index()
    # test_preds = mode(np.column_stack(test_preds), axis=1).mode #clf
    # test_preds = np.mean(np.column_stack(test_preds), axis=1)   #reg

    return oof_preds, test_preds

In [None]:
model_names = {
    '-----'
}

In [None]:
oof_preds, test_preds = evaluate_model('-----'))

# Submissions

In [53]:
baseline_sub = pd.DataFrame({
    TARGET[0]: pred0,
    TARGET[1]: pred1,
    TARGET[2]: pred2
})

baseline_sub.to_csv('baseline_sub.csv', index=False)

In [54]:
!head baseline_sub.csv

Clearsky DHI,Clearsky DNI,Clearsky GHI
2.4057481,-22.166214,-2.8244681
1.1805557,-7.995976,-1.901115
1.5906572,-0.6560051,-1.7534056
1.1690676,-5.8586645,-1.6653191
-0.45344254,-12.291299,-0.7154342
-0.8583965,-14.278685,-0.9539284
0.26357195,-17.218288,-0.4249243
-2.2760062,-11.26508,-0.374541
1.5500247,-14.039718,-1.0398345


Postprocessing

In [55]:
baseline_sub_postprocessed = baseline_sub.applymap(lambda x: max(0.0, x))

In [56]:
baseline_sub_postprocessed.to_csv('baseline_sub_postprocessed.csv', index=False)
!head baseline_sub_postprocessed.csv

Clearsky DHI,Clearsky DNI,Clearsky GHI
2.405748128890991,0.0,0.0
1.1805557012557983,0.0,0.0
1.5906572341918945,0.0,0.0
1.169067621231079,0.0,0.0
0.0,0.0,0.0
0.0,0.0,0.0
0.26357194781303406,0.0,0.0
0.0,0.0,0.0
1.5500247478485107,0.0,0.0


Time to submit!