In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('./data/Pasion et al dataset.csv')

In [3]:
df.head()

Unnamed: 0,Location,Date,Time,Latitude,Longitude,Altitude,YRMODAHRMI,Month,Hour,Season,Humidity,AmbientTemp,PolyPwr,Wind.Speed,Visibility,Pressure,Cloud.Ceiling
0,Camp Murray,20171203,1145,47.11,-122.57,84,201712000000.0,12,11,Winter,81.71997,12.86919,2.42769,5,10.0,1010.6,722
1,Camp Murray,20171203,1315,47.11,-122.57,84,201712000000.0,12,13,Winter,96.64917,9.66415,2.46273,0,10.0,1011.3,23
2,Camp Murray,20171203,1330,47.11,-122.57,84,201712000000.0,12,13,Winter,93.61572,15.44983,4.46836,5,10.0,1011.6,32
3,Camp Murray,20171204,1230,47.11,-122.57,84,201712000000.0,12,12,Winter,77.21558,10.36659,1.65364,5,2.0,1024.4,6
4,Camp Murray,20171204,1415,47.11,-122.57,84,201712000000.0,12,14,Winter,54.80347,16.85471,6.57939,3,3.0,1023.7,9


#### We'll first build the model using all the availabel columns and then improve it in next step

In [8]:
X = df.drop(labels=['PolyPwr'], axis=1)
y = df[['PolyPwr']]

In [10]:
X.head()

Unnamed: 0,Location,Date,Time,Latitude,Longitude,Altitude,YRMODAHRMI,Month,Hour,Season,Humidity,AmbientTemp,Wind.Speed,Visibility,Pressure,Cloud.Ceiling
0,Camp Murray,20171203,1145,47.11,-122.57,84,201712000000.0,12,11,Winter,81.71997,12.86919,5,10.0,1010.6,722
1,Camp Murray,20171203,1315,47.11,-122.57,84,201712000000.0,12,13,Winter,96.64917,9.66415,0,10.0,1011.3,23
2,Camp Murray,20171203,1330,47.11,-122.57,84,201712000000.0,12,13,Winter,93.61572,15.44983,5,10.0,1011.6,32
3,Camp Murray,20171204,1230,47.11,-122.57,84,201712000000.0,12,12,Winter,77.21558,10.36659,5,2.0,1024.4,6
4,Camp Murray,20171204,1415,47.11,-122.57,84,201712000000.0,12,14,Winter,54.80347,16.85471,3,3.0,1023.7,9


In [11]:
y.head()

Unnamed: 0,PolyPwr
0,2.42769
1,2.46273
2,4.46836
3,1.65364
4,6.57939


In [15]:
# segregating the numerical_cols and categorical columns 
cat_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

In [19]:
print(f"Categorical columns: {cat_cols}\n\nNumerical columns: {numerical_cols}")

Categorical columns: Index(['Location', 'Season'], dtype='object')

Numerical columns: Index(['Date', 'Time', 'Latitude', 'Longitude', 'Altitude', 'YRMODAHRMI',
       'Month', 'Hour', 'Humidity', 'AmbientTemp', 'Wind.Speed', 'Visibility',
       'Pressure', 'Cloud.Ceiling'],
      dtype='object')


In [44]:
# defininig custom rankings for the categorical columns 
season_categories = ['Winter', 'Fall', 'Spring', 'Summer']
location_categories = ['Grissom', 'Malmstrom', 'MNANG', 'Camp Murray', 'Peterson', 'USAFA',
       'Travis', 'March AFB', 'Offutt', 'Hill Weber', 'Kahului', 'JDMT']

In [45]:
df['Location'].unique()

array(['Camp Murray', 'Grissom', 'Hill Weber', 'JDMT', 'Kahului',
       'Malmstrom', 'March AFB', 'MNANG', 'Offutt', 'Peterson', 'Travis',
       'USAFA'], dtype=object)

In [46]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder

# pipelines 
from sklearn.pipeline import Pipeline
from sklearn.compose import  ColumnTransformer

In [47]:
# Numerical pipeline 
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]
)

In [55]:
# Categorical pipeline 
cat_pipeline = Pipeline(
    steps=[
        ('ordincalencoder', OrdinalEncoder(categories=[location_categories, season_categories])),
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('scaler', StandardScaler())
    ]
)

In [56]:
# to combine both categorical pipeline and numerical pipeline we use the column transformer 
preprocessor = ColumnTransformer(
    [
        ('cat_pipeline', cat_pipeline, cat_cols),
        ('num_pipeline', num_pipeline,numerical_cols)
        
    ]
)

In [50]:
### Train test split 
from sklearn.model_selection import train_test_split

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=3)

In [52]:
X_train.head()

Unnamed: 0,Location,Date,Time,Latitude,Longitude,Altitude,YRMODAHRMI,Month,Hour,Season,Humidity,AmbientTemp,Wind.Speed,Visibility,Pressure,Cloud.Ceiling
7900,Malmstrom,20180114,1500,47.52,-111.18,1043,201801000000.0,1,15,Winter,62.17651,4.60106,9,10.0,907.0,80
4178,Hill Weber,20180507,1100,41.15,-111.99,1370,201805000000.0,5,11,Spring,22.98584,32.22023,6,10.0,856.3,722
15725,Peterson,20180927,1500,38.82,-104.71,1879,201809000000.0,9,15,Fall,6.9519,31.63864,15,10.0,810.0,250
5258,JDMT,20170712,1145,26.98,-80.11,2,201707000000.0,7,11,Summer,82.07397,29.71764,8,10.0,1018.2,41
4633,Hill Weber,20180721,1400,41.15,-111.99,1370,201807000000.0,7,14,Summer,12.76855,49.60243,8,10.0,854.9,722


In [53]:
y_train.head()

Unnamed: 0,PolyPwr
7900,3.02564
4178,20.36269
15725,13.9496
5258,4.85731
4633,20.02154


In [57]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train), columns=preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.transform(X_test), columns=preprocessor.get_feature_names_out())

In [58]:
X_train.head()

Unnamed: 0,cat_pipeline__Location,cat_pipeline__Season,num_pipeline__Date,num_pipeline__Time,num_pipeline__Latitude,num_pipeline__Longitude,num_pipeline__Altitude,num_pipeline__YRMODAHRMI,num_pipeline__Month,num_pipeline__Hour,num_pipeline__Humidity,num_pipeline__AmbientTemp,num_pipeline__Wind.Speed,num_pipeline__Visibility,num_pipeline__Pressure,num_pipeline__Cloud.Ceiling
0,-1.473297,-1.650439,0.630661,1.389904,1.467572,-0.156599,0.313056,0.630969,-1.868144,1.421243,1.047049,-1.978,-0.212139,0.226586,-0.217633,-1.451408
1,1.054144,0.138701,0.7167,-0.99967,0.458697,-0.206316,0.737463,0.718535,-0.522704,-0.97326,-0.597063,0.241416,-0.683083,0.226586,-0.812514,0.678211
2,-0.525507,-0.755869,0.80865,1.389904,0.089673,0.240523,1.398084,0.806101,0.822736,1.421243,-1.269713,0.194681,0.729748,0.226586,-1.355769,-0.88749
3,1.686005,1.033271,-1.427701,-0.730843,-1.785536,1.750448,-1.038037,-1.426837,0.150016,-0.97326,1.88178,0.040314,-0.369121,0.226586,1.087117,-1.580777
4,1.054144,1.033271,0.76355,0.79251,0.458697,-0.206316,0.737463,0.762318,0.150016,0.822618,-1.025695,1.638212,-0.369121,0.226586,-0.828941,0.678211


## Model Training 

In [61]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [72]:
# model evaluation function 

def model_evaluation(y_test, y_pred):
    mse = mean_squared_error(y_true=y_test, y_pred= y_pred)
    mae = mean_absolute_error(y_true=y_test, y_pred= y_pred)
    r2 = r2_score(y_true= y_test, y_pred= y_pred)

    return mse, mae, r2

In [73]:
# Train Multiple models 
# Model evaluation 

models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
    'SVR': SVR(),
    'KNeighborsRegressor': KNeighborsRegressor()
}

trained_models_list = []
models_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)


    # make predictions 
    y_pred = model.predict(X_test)

    mse, mae, r2 = model_evaluation(y_test, y_pred)

    print(list(models.keys())[i])
    models_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print('MSE', mse)
    print('MAE', mae)
    print('R2 Score', r2*100)

    r2_list.append(r2)

    print('='*35)
    print("\n")

LinearRegression
Model Training Performance
MSE 24.264144159435975
MAE 3.8116851950451816
R2 Score 51.57555013348457


Lasso
Model Training Performance
MSE 30.06685639519846
MAE 4.463563397195261
R2 Score 39.994958380314415


Ridge
Model Training Performance
MSE 24.265817234679975
MAE 3.814101670837512
R2 Score 51.57221114292535


DecisionTreeRegressor
Model Training Performance
MSE 30.484576300695785
MAE 3.374697817548305
R2 Score 39.16130620247181




  return fit_method(estimator, *args, **kwargs)


RandomForestRegressor
Model Training Performance
MSE 15.979324823713096
MAE 2.5808224234558126
R2 Score 68.10973390438977




  y = column_or_1d(y, warn=True)


GradientBoostingRegressor
Model Training Performance
MSE 18.02210868308497
MAE 3.051484221754035
R2 Score 64.03290828316499




  y = column_or_1d(y, warn=True)


SVR
Model Training Performance
MSE 18.433687471801168
MAE 2.736246841585104
R2 Score 63.21151205796336


KNeighborsRegressor
Model Training Performance
MSE 20.23332929426669
MAE 3.0087037624326896
R2 Score 59.6199299837291




In [66]:
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

In [67]:
y_pred = linear_model.predict(X_test)

In [68]:
r2_score(y_true= y_test, y_pred=y_pred)

0.5157555013348457

In [69]:
random_f = RandomForestRegressor()
random_f.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [70]:
random_f_prediction = random_f.predict(X_test)


In [71]:
r2_score(y_true=y_test, y_pred=random_f_prediction)

0.6824593537975925