# **<span style="color:#F7B2B0;">Goal / Problem Statement</span>**

The WiDS Datathon 2023 is a prediction problem involving forecasting sub-seasonal temperatures.

The task is to predict the arithmetic mean of the maximum and minimum temperature over the next 14 days (a two-week period), for each location and start date within the United States.

**Target**: contest-tmp2m-14d__tmp2m: the arithmetic mean of the max and min observed temperature over the next 14 days for each location and start date, computed as (measured max temperature + measured min temperature) / 2

# **<span style="color:#F7B2B0;">Data Description</span>**
The dataset consists of weather and climate information for a number of US locations, for a number of start dates for the two-week observation, as well as the forecasted temperature and precipitation from a number of weather forecast models. Each row in the data corresponds to a single location and a single start date for the two-week period.

# **<span style="color:#F7B2B0;">Model Evaluation Metric</span>**

The evaluation metric for this competition is Root Mean Squared Error (RMSE).

In [None]:
# pip install catboost

In [None]:
import warnings
warnings.filterwarnings('ignore')

### Date & Time
from datetime import datetime

### Data Processing
import numpy as np
import pandas as pd

### Data Stats
from scipy import stats
import statsmodels.formula.api as sm

### Visualization
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 18})
plt.style.use('ggplot')
import seaborn as sns

### Data Transformation
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA

### ML Model Building
from sklearn.model_selection import train_test_split

### ML Models/Algorithms
from sklearn.svm import SVC

### ML Model Evaluation
from sklearn import metrics
from sklearn.metrics import mean_squared_error, confusion_matrix

np.random.seed(123)
import joblib

# OLS / statsmodels.
# NOTE: `sm` was bound to statsmodels.formula.api earlier in this cell; the import
# below rebinds `sm` to statsmodels.api, and the formula API stays reachable as `smf`.
import statsmodels.api as sm
import statsmodels.formula.api as smf

import missingno as mno
from sklearn.cluster import DBSCAN
from sklearn.cluster import OPTICS

# import zscore for scaling the data
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer, RobustScaler

# from sklearn.metrics import silhouette_score
# from sklearn.cluster import KMeans

# pre-processing methods
from sklearn.preprocessing import PolynomialFeatures

from sklearn.compose import TransformedTargetRegressor

# the regression models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
# from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

# cross-validation methods
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV

from sklearn.pipeline import Pipeline

# feature-selection methods
from sklearn.feature_selection import SelectFromModel

# bootstrap sampling
from sklearn.utils import resample

print('Libraries Imported Succesfully!')

Libraries Imported Succesfully!


In [None]:
# Load the prepared feature matrix and its target from CSV.
# The paths are absolute and presumably point at a Google Drive mounted in Colab
# (/content/drive) -- adjust them when running elsewhere.
features = pd.read_csv("/content/drive/MyDrive/1_WiDS-Datathon-Kaggle_Feb2023/input/features_70.csv")
target = pd.read_csv("/content/drive/MyDrive/1_WiDS-Datathon-Kaggle_Feb2023/input/target_70.csv")
print('Features & Target Files Read Succesfully!')

Features & Target Files Read Succesfully!


In [None]:
# Sanity check: (rows, columns) of the features and number of rows in the target.
features.shape, len(target)

((375734, 115), 375734)

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical across runs.
train_features, val_features, train_target, val_target = train_test_split(
    features,
    target,
    test_size=0.20,
    random_state=1,
)

# Displayed as the cell output: shapes of both feature splits and lengths of both targets.
(train_features.shape, val_features.shape, len(train_target), len(val_target))

((300587, 115), (75147, 115), 300587, 75147)

### Model Definition

In [None]:
def train_validate_model(model, name, train_features, val_features, train_target, val_target, index, round_predictions=False):
  """Fit ``model`` on the training split and report its validation metrics.

  The estimator is wrapped in a single-step ``Pipeline``, fitted on the
  training data and used to predict the validation data. R2 and RMSE are
  computed on the raw (continuous) predictions so that the reported error
  describes the same values that ``predict`` returns later on.

  Parameters
  ----------
  model : estimator
      Unfitted regressor exposing ``fit`` / ``predict`` / ``score``.
  name : str
      Label written to the 'Model Name' column of the result row.
  train_features, val_features : array-like
      Feature matrices of the training and validation splits.
  train_target, val_target : array-like
      Targets aligned row-by-row with the matching feature matrix.
  index : hashable
      Row label of the single-row metrics frame.
  round_predictions : bool, default False
      If True, round the validation predictions to the nearest integer
      (``np.rint``) before scoring and returning them. This reproduces the
      earlier behaviour of this function; it is off by default because
      rounding a continuous prediction only adds error to the RMSE.

  Returns
  -------
  resultsdf : pandas.DataFrame
      One row with 'Model Name', 'R Squared Error', 'RMSE Error',
      'Train Accuracy' and 'Validation Accuracy'. The two "Accuracy" columns
      are ``pipeline.score(...) * 100`` (R2 for scikit-learn regressors), not
      a classification accuracy. All metrics are rounded to 2 decimals.
  pipeline : Pipeline
      The fitted pipeline.
  predictions : numpy.ndarray
      Validation predictions (rounded only when ``round_predictions`` is True).
  """
  pipeline = Pipeline([('model', model)])
  pipeline.fit(train_features, train_target)  # fit on the training split only

  predictions = pipeline.predict(val_features)
  if round_predictions:
    # Legacy behaviour, kept opt-in for comparison with older results.
    predictions = np.rint(predictions)

  # Validation metrics on the predictions that are actually returned.
  r2 = metrics.r2_score(val_target, predictions)
  rmse = np.sqrt(metrics.mean_squared_error(val_target, predictions))

  # Estimator's own score on both splits; the gap between them hints at over-fitting.
  train_acc_score = pipeline.score(train_features, train_target)
  val_acc_score = pipeline.score(val_features, val_target)

  resultsdf = pd.DataFrame({'Model Name': name, 'R Squared Error':round(r2,2), 'RMSE Error':round(rmse,2), 'Train Accuracy':round(train_acc_score*100,2),
                           'Validation Accuracy':round(val_acc_score*100,2)}, index=[index])

  print('Model Metrics:\n', resultsdf)
  print('**************************************************')

  return resultsdf, pipeline, predictions


### Define ML Regression Models

In [None]:
# Starting row label for the results table, and the (initially empty) table itself.
index = 1
resultsDf = pd.DataFrame()

# Candidate regressors as [display name, estimator] pairs. Only uncommented
# entries are trained by the loop in the next cell. The commented
# "R Squared Error ..." lines appear to be scores noted by the author from
# earlier runs of the entry directly above them.
models=[
    # ['SVR', SVR(kernel='rbf')],
    # ['SVR', SVR(kernel='linear')],

    # ['LinearRegression',LinearRegression()],
    # ['Ridge',Ridge(random_state = 1)],
    # ['Lasso',Lasso(random_state = 1)],

    # ['KNeighborsRegressor',KNeighborsRegressor(n_neighbors = 3)],

    # ['DecisionTreeRegressor', DecisionTreeRegressor(random_state=1)],
    # R Squared Error  RMSE Error  Train Accuracy  Validation Accuracy
    # 1   0.99              0.92           100.0   99.23

    ['RandomForestRegressor', RandomForestRegressor(random_state=1)],
    # R Squared Error  RMSE Error  Train Accuracy Validation Accuracy
    # 1.0             0.56           99.97                   99.77

    # ['CatBoostRegressor', CatBoostRegressor(random_state=1, verbose=False)],
    # ['BaggingRegressor', BaggingRegressor(random_state=1)],

    # ['ExtraTreesRegressor', ExtraTreesRegressor(random_state=1)],
    # ['AdaBoostRegressor', AdaBoostRegressor(random_state=1)],
    # ['GradientBoostingRegressor', GradientBoostingRegressor(random_state=1)],

    # ['XGBRegressor', XGBRegressor()]
    ]

In [None]:
# Train and validate every configured model, collecting one metrics row per model.
# Row labels and the results table are rebuilt from scratch on each execution, so
# re-running this cell neither appends duplicate rows nor keeps incrementing the label.
result_frames = []
for index, (name, regressor) in enumerate(models, start=1):
  # `model` keeps the fitted pipeline of the most recent iteration; later cells
  # save it and use it for the test predictions.
  tmp_resultsDf, model, predictions = train_validate_model(regressor, name, train_features, val_features, train_target, val_target, index)
  result_frames.append(tmp_resultsDf)

# One concat at the end instead of growing the frame inside the loop.
resultsDf = pd.concat(result_frames) if result_frames else pd.DataFrame()

# Experiment log (scores noted from earlier runs):
# R Squared Error  RMSE Error  Train Accuracy  Validation Accuracy
# 0.98	            1.27	          99.78	       98.43 - PCA 50 - 6.379 with Corr 0.6
# 0.96	            1.88	          99.5	       96.46 - PCA 10 - 11.61 with Corr 0.6

# All Terminated to choose Optimal PCs - 28 now
# 0.97              1.59            99.65        97.49 - PCA - 28 (Optimal n) with Corr 0.6
# 0.97              1.56            99.66        97.58 - PCA - 26 (Optimal n) with Corr 0.5

# 0.97              1.56            99.66        97.58   - PCA - 26 (Optimal n) with Corr 0.7
# 1.0               0.51            99.97        99.81  - with Corr 0.7; No PCA


Model Metrics:
               Model Name  R Squared Error  RMSE Error  Train Accuracy  \
1  RandomForestRegressor              1.0        0.51           99.97   

   Validation Accuracy  
1                99.81  
**************************************************


In [None]:
# Display the metrics table collected by the training loop.
resultsDf

Unnamed: 0,Model Name,R Squared Error,RMSE Error,Train Accuracy,Validation Accuracy
1,RandomForestRegressor,1.0,0.51,99.97,99.81


In [None]:
# Persist `model` (the pipeline fitted in the training loop) to disk with joblib.
model_FileName = '/content/drive/MyDrive/1_WiDS-Datathon-Kaggle_Feb2023/input/RF_Model_New.pkl'
joblib.dump(model,  model_FileName)
print('Region RF Model pickle file saved successfully!')

Region RF Model pickle file saved successfully!


### Test Predictions

In [None]:
# Load the raw competition test set and report its (rows, columns) shape.
test = pd.read_csv('/content/drive/MyDrive/1_WiDS-Datathon-Kaggle_Feb2023/input/test_data.csv')
# print("Test No of Rows \t === ", test.shape[0])
# print("Test No of Columns \t === ", test.shape[1])
#
print('Test Dimension === ', test.shape)
# print('Test Columns === ', test.columns.to_list())

Test Dimension ===  (31354, 245)


In [None]:
# Re-read the training feature file to recover the column list used for training;
# `features_train.columns` is what the test frame is aligned to further down.
# NOTE(review): this is the same file already loaded as `features` near the top of
# the notebook, so the second read only costs I/O time.
features_train = pd.read_csv("/content/drive/MyDrive/1_WiDS-Datathon-Kaggle_Feb2023/input/features_70.csv")
print('Train Dimension: ', features_train.shape)

train_cols = features_train.columns.to_list()
print('Train Columns: ', len(train_cols))

Train Dimension:  (375734, 115)
Train Columns:  115


In [None]:
# test.info()

# Quick inspection of the raw test frame. The dtype listing must run before the
# date conversion below so that 'startdate' is still reported as an object column.
object_columns = test.select_dtypes(include=['object']).columns
print(object_columns)

print('Test Dimension === ', test.shape)

# Parse the start date so min/max are chronological rather than string comparisons.
test['startdate'] = pd.to_datetime(test['startdate'])
first_start = test['startdate'].min()
last_start = test['startdate'].max()
print('Min/Max - Start Date', first_start, ' & ', last_start)

# Total number of missing cells across the whole frame.
total_missing = test.isna().sum().sum()
print('NANs === ', total_missing) #

# Optional follow-up kept from the original exploration (drop rows with NaNs):
# test_1 = test.dropna()
# print('Test Dimension After Dropping NANs Rows: ', test_1.shape)
# test_1.head(1)

Index(['startdate', 'climateregions__climateregion'], dtype='object')
Test Dimension ===  (31354, 245)
Min/Max - Start Date 2022-11-01 00:00:00  &  2022-12-31 00:00:00
NANs ===  0


In [None]:
# Remove the date column from the test frame.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer raises KeyError because the
# column is already gone.
test = test.drop(columns=['startdate'], errors='ignore')
print(test.shape)

# Remaining non-numeric columns that still need encoding.
print(test.select_dtypes(include=['object']).columns)

(31354, 244)
Index(['climateregions__climateregion'], dtype='object')


In [None]:
# Load the encoder saved for the climate-region column (presumably a fitted
# LabelEncoder, judging by the file name -- confirm against the notebook that wrote it).
# NOTE: joblib.load unpickles arbitrary Python objects; only load files you created yourself.
leRegionEnc_FileName = r'/content/drive/MyDrive/1_WiDS-Datathon-Kaggle_Feb2023/input/leRegionEncoder.pkl'
le = joblib.load(leRegionEnc_FileName)
print('Region Label Encoder pickle file loaded successfully!')

# Encode only while the column still holds raw labels. Without this guard a
# re-run would feed already-encoded numbers back into `le.transform` and fail.
if not pd.api.types.is_numeric_dtype(test['climateregions__climateregion']):
  test['climateregions__climateregion'] = le.transform(test['climateregions__climateregion'])
print(test.shape)

Region Label Encoder pickle file loaded successfully!
(31354, 244)


In [None]:
# Align the test frame to the exact columns (and column order) of the training features.
# reindex() creates any training column that the test frame lacks and fills it with 0,
# which would silently feed constant features to the model -- so report such columns.
# print() is used on purpose: warnings are globally silenced at the top of this notebook.
missing_in_test = [col for col in features_train.columns if col not in test.columns]
if missing_in_test:
  print('WARNING: training features absent from test, filled with 0:', missing_in_test)

test = test.reindex(columns=features_train.columns, fill_value=0)
print(test.shape)

(31354, 115)


In [None]:
# Start from the sample submission so the output keeps its expected columns.
RF_Predictions_Shakti_2 = pd.read_csv('/content/drive/MyDrive/1_WiDS-Datathon-Kaggle_Feb2023/input/sample_solution.csv')
print('Before Predictions: ', RF_Predictions_Shakti_2.shape)

# Earlier PCA variant, kept for reference:
# RF_Predictions_Shakti_2['contest-tmp2m-14d__tmp2m'] = rfModel.predict(df_features_pca_test.values)

# Predict on the DataFrame itself rather than `test.values`: the model was fitted on a
# DataFrame, so passing the named columns lets scikit-learn check that the test features
# match the training features instead of trusting positional order.
RF_Predictions_Shakti_2['contest-tmp2m-14d__tmp2m'] = model.predict(test)

RF_Predictions_Shakti_2.to_csv('/content/drive/MyDrive/1_WiDS-Datathon-Kaggle_Feb2023/input/submission-0.7Corr.csv',index = False)

print('After Predictions: ',RF_Predictions_Shakti_2.shape)

Before Predictions:  (31354, 2)
After Predictions:  (31354, 2)
