In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Load dataset
data = pd.read_csv('/Users/divyasandiman/Downloads/temperature.csv')


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7752 entries, 0 to 7751
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   station           7750 non-null   float64
 1   Date              7750 non-null   object 
 2   Present_Tmax      7682 non-null   float64
 3   Present_Tmin      7682 non-null   float64
 4   LDAPS_RHmin       7677 non-null   float64
 5   LDAPS_RHmax       7677 non-null   float64
 6   LDAPS_Tmax_lapse  7677 non-null   float64
 7   LDAPS_Tmin_lapse  7677 non-null   float64
 8   LDAPS_WS          7677 non-null   float64
 9   LDAPS_LH          7677 non-null   float64
 10  LDAPS_CC1         7677 non-null   float64
 11  LDAPS_CC2         7677 non-null   float64
 12  LDAPS_CC3         7677 non-null   float64
 13  LDAPS_CC4         7677 non-null   float64
 14  LDAPS_PPT1        7677 non-null   float64
 15  LDAPS_PPT2        7677 non-null   float64
 16  LDAPS_PPT3        7677 non-null   float64


In [6]:
data.describe()

Unnamed: 0,station,Present_Tmax,Present_Tmin,LDAPS_RHmin,LDAPS_RHmax,LDAPS_Tmax_lapse,LDAPS_Tmin_lapse,LDAPS_WS,LDAPS_LH,LDAPS_CC1,...,LDAPS_PPT2,LDAPS_PPT3,LDAPS_PPT4,lat,lon,DEM,Slope,Solar radiation,Next_Tmax,Next_Tmin
count,7750.0,7682.0,7682.0,7677.0,7677.0,7677.0,7677.0,7677.0,7677.0,7677.0,...,7677.0,7677.0,7677.0,7752.0,7752.0,7752.0,7752.0,7752.0,7725.0,7725.0
mean,13.0,29.768211,23.225059,56.759372,88.374804,29.613447,23.512589,7.097875,62.505019,0.368774,...,0.485003,0.2782,0.269407,37.544722,126.991397,61.867972,1.257048,5341.502803,30.274887,22.93222
std,7.211568,2.969999,2.413961,14.668111,7.192004,2.947191,2.345347,2.183836,33.730589,0.262458,...,1.762807,1.161809,1.206214,0.050352,0.079435,54.27978,1.370444,429.158867,3.12801,2.487613
min,1.0,20.0,11.3,19.794666,58.936283,17.624954,14.272646,2.88258,-13.603212,0.0,...,0.0,0.0,0.0,37.4562,126.826,12.37,0.098475,4329.520508,17.4,11.3
25%,7.0,27.8,21.7,45.963543,84.222862,27.673499,22.089739,5.678705,37.266753,0.146654,...,0.0,0.0,0.0,37.5102,126.937,28.7,0.2713,4999.018555,28.2,21.3
50%,13.0,29.9,23.4,55.039024,89.79348,29.703426,23.760199,6.54747,56.865482,0.315697,...,0.0,0.0,0.0,37.5507,126.995,45.716,0.618,5436.345215,30.5,23.1
75%,19.0,32.0,24.9,67.190056,93.743629,31.71045,25.152909,8.032276,84.223616,0.575489,...,0.018364,0.007896,4.1e-05,37.5776,127.042,59.8324,1.7678,5728.316406,32.6,24.6
max,25.0,37.6,29.9,98.524734,100.000153,38.542255,29.619342,21.857621,213.414006,0.967277,...,21.621661,15.841235,16.655469,37.645,127.135,212.335,5.17823,5992.895996,38.9,29.8


In [7]:
data.isnull().sum()

station              2
Date                 2
Present_Tmax        70
Present_Tmin        70
LDAPS_RHmin         75
LDAPS_RHmax         75
LDAPS_Tmax_lapse    75
LDAPS_Tmin_lapse    75
LDAPS_WS            75
LDAPS_LH            75
LDAPS_CC1           75
LDAPS_CC2           75
LDAPS_CC3           75
LDAPS_CC4           75
LDAPS_PPT1          75
LDAPS_PPT2          75
LDAPS_PPT3          75
LDAPS_PPT4          75
lat                  0
lon                  0
DEM                  0
Slope                0
Solar radiation      0
Next_Tmax           27
Next_Tmin           27
dtype: int64

In [8]:
data = data.dropna(subset=['Next_Tmax', 'Next_Tmin'])

In [9]:
data['Date'] = pd.to_datetime(data['Date'])

  data['Date'] = pd.to_datetime(data['Date'])


In [10]:
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
data['Day'] = data['Date'].dt.day

In [11]:
data = data.drop(columns=['Date'])

In [12]:
X = data.drop(columns=['Next_Tmax', 'Next_Tmin'])
y_tmax = data['Next_Tmax']
y_tmin = data['Next_Tmin']

In [33]:
numerical_features = X.columns

In [42]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [43]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num',numerical_transformer,numerical_features)
    ]
)

In [44]:
model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor())
    ])


In [45]:
model_grid=[
    {'model':[RandomForestRegressor()]},
    {'model':[LinearRegression()]},
    {'model':[SVR()]},
    {'model':[GradientBoostingRegressor()]}
]

In [48]:
# Train/Test Split
X_train, X_test, y_tmax_train, y_tmax_test = train_test_split(X, y_tmax, test_size=0.2, random_state=0)
X_train, X_test, y_tmin_train, y_tmin_test = train_test_split(X, y_tmin, test_size=0.2, random_state=0)


In [49]:
gscv_max=GridSearchCV(model_pipeline,model_grid,cv=5,scoring='r2',verbose=2)

In [50]:
gscv_max.fit(X_train, y_tmax_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END ......................model=RandomForestRegressor(); total time=   5.4s
[CV] END ......................model=RandomForestRegressor(); total time=   5.8s
[CV] END ......................model=RandomForestRegressor(); total time=   5.6s
[CV] END ......................model=RandomForestRegressor(); total time=   5.6s
[CV] END ......................model=RandomForestRegressor(); total time=   5.7s
[CV] END ...........................model=LinearRegression(); total time=   0.0s
[CV] END ...........................model=LinearRegression(); total time=   0.0s
[CV] END ...........................model=LinearRegression(); total time=   0.0s
[CV] END ...........................model=LinearRegression(); total time=   0.0s
[CV] END ...........................model=LinearRegression(); total time=   0.0s
[CV] END ........................................model=SVR(); total time=   1.8s
[CV] END ........................................

In [51]:
gscv_min=GridSearchCV(model_pipeline,model_grid,cv=5,scoring='r2',verbose=2)

In [52]:
gscv_min.fit(X_train, y_tmin_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END ......................model=RandomForestRegressor(); total time=   5.6s
[CV] END ......................model=RandomForestRegressor(); total time=   5.5s
[CV] END ......................model=RandomForestRegressor(); total time=   5.4s
[CV] END ......................model=RandomForestRegressor(); total time=   5.5s
[CV] END ......................model=RandomForestRegressor(); total time=   5.6s
[CV] END ...........................model=LinearRegression(); total time=   0.0s
[CV] END ...........................model=LinearRegression(); total time=   0.0s
[CV] END ...........................model=LinearRegression(); total time=   0.0s
[CV] END ...........................model=LinearRegression(); total time=   0.0s
[CV] END ...........................model=LinearRegression(); total time=   0.0s
[CV] END ........................................model=SVR(); total time=   1.7s
[CV] END ........................................

In [55]:
gscv_max.best_params_

{'model': RandomForestRegressor()}

In [56]:
gscv_min.best_params_

{'model': RandomForestRegressor()}

##### Now since we have the model as Random Forest regressor, we will not perform hyperparameter Tuning

In [63]:
param_grid = {
    'model__n_estimators': [100, 200, 300],          
    'model__max_features': ['auto', 'sqrt', 'log2'],  
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10],
}


In [69]:
gscv_mx = GridSearchCV(estimator=model_pipeline, param_grid=param_grid, cv=5, scoring='r2', verbose=2)

In [70]:
gscv_mx.fit(X_train, y_tmax_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100; total time=   5.9s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100; total time=   5.8s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100; total time=   6.0s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100; total time=   6.2s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100; total time=   6.6s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=200; total time=  13.4s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=200; total time=  13.4s
[CV] END model__max_depth=N

In [71]:
gscv_mn = GridSearchCV(estimator=model_pipeline, param_grid=param_grid, cv=5, scoring='r2', verbose=2)

In [72]:
gscv_mn.fit(X_train,y_tmin_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100; total time=   5.9s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100; total time=   5.8s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100; total time=   5.7s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100; total time=   5.8s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100; total time=   6.0s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=200; total time=  12.0s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=200; total time=  12.4s
[CV] END model__max_depth=N

In [73]:
gscv_mx.best_score_

0.91851793898176

In [74]:
gscv_mx.best_params_

{'model__max_depth': None,
 'model__max_features': 'sqrt',
 'model__min_samples_split': 2,
 'model__n_estimators': 300}

In [76]:
model_max=gscv_mx.best_estimator_

In [77]:
predmax=model_max.predict(X_test)

In [78]:
# Evaluate models
print("Maximum Temperature Prediction Metrics:")
print(f"R^2 Score: {r2_score(y_tmax_test, predmax)}")
print(f"Mean Squared Error: {mean_squared_error(y_tmax_test, predmax)}")


Maximum Temperature Prediction Metrics:
R^2 Score: 0.9186984111676171
Mean Squared Error: 0.8223344262495526


In [79]:
gscv_mn.best_score_

0.9155581555590011

In [80]:
gscv_mn.best_params_

{'model__max_depth': None,
 'model__max_features': 'sqrt',
 'model__min_samples_split': 2,
 'model__n_estimators': 300}

In [82]:
model_min=gscv_mn.best_estimator_

In [83]:
predmin=model_min.predict(X_test)

In [84]:
print("\nMinimum Temperature Prediction Metrics:")
print(f"R^2 Score: {r2_score(y_tmin_test, predmin)}")
print(f"Mean Squared Error: {mean_squared_error(y_tmin_test, predmin)}")



Minimum Temperature Prediction Metrics:
R^2 Score: 0.9205112480964385
Mean Squared Error: 0.49797229967637513


# Hence the r2 scores for max temperature and minimum temperatures are close to 92% which is very good considering the MSE is also very low. Suggesting that the models is performing quite well and prediction are very close to actuals.