In [None]:
!pip install auto-sklearn

In [None]:
!pip install scipy==1.7.0

In [1]:
from pprint import pprint

import sklearn.datasets
import sklearn.metrics
from sklearn.model_selection import train_test_split
import autosklearn.regression
import matplotlib.pyplot as plt

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the weather CSV into a DataFrame and display it.
weather_csv = r'/content/weather.csv'
data = pd.read_csv(weather_csv)
data

Unnamed: 0,Data.Precipitation,Date.Full,Date.Month,Date.Week of,Date.Year,Station.City,Station.Code,Station.Location,Station.State,Data.Temperature.Avg Temp,Data.Temperature.Max Temp,Data.Temperature.Min Temp,Data.Wind.Direction,Data.Wind.Speed
0,0.00,2016-01-03,1,3,2016,Birmingham,BHM,"Birmingham, AL",Alabama,39,46,32,33,4.33
1,0.00,2016-01-03,1,3,2016,Huntsville,HSV,"Huntsville, AL",Alabama,39,47,31,32,3.86
2,0.16,2016-01-03,1,3,2016,Mobile,MOB,"Mobile, AL",Alabama,46,51,41,35,9.73
3,0.00,2016-01-03,1,3,2016,Montgomery,MGM,"Montgomery, AL",Alabama,45,52,38,32,6.86
4,0.01,2016-01-03,1,3,2016,Anchorage,ANC,"Anchorage, AK",Alaska,34,38,29,19,7.80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16738,0.08,2017-01-01,1,1,2017,Casper,CPR,"Casper, WY",Wyoming,23,32,15,23,19.98
16739,0.00,2017-01-01,1,1,2017,Cheyenne,CYS,"Cheyenne, WY",Wyoming,32,42,21,26,15.16
16740,0.00,2017-01-01,1,1,2017,Lander,LND,"Lander, WY",Wyoming,17,29,4,26,1.65
16741,0.06,2017-01-01,1,1,2017,Rawlins,RWL,"Rawlins, WY",Wyoming,23,31,13,24,18.16


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# `describe` must be called; referencing it without parentheses only shows
# the bound-method repr instead of the statistics table.
data.describe()

In [7]:
from sklearn.preprocessing import LabelEncoder

# Encode the state names as integer labels and store them in a new column.
label_encoder = LabelEncoder()
label_encoder.fit(data["Station.State"])
state_encoded = label_encoder.transform(data["Station.State"])
data["Encoded.State"] = state_encoded

In [12]:
# Feature frame: everything in `data` except the columns listed below
# (date/station identifiers, the raw state name, and the temperature columns).
X = data.drop(
    columns=[
        "Date.Year",
        "Date.Full",
        "Station.City",
        "Station.Code",
        "Station.Location",
        "Station.State",
        "Data.Temperature.Avg Temp",
        "Data.Temperature.Max Temp",
        "Data.Temperature.Min Temp",
    ]
)
X

Unnamed: 0,Data.Precipitation,Date.Month,Date.Week of,Data.Wind.Direction,Data.Wind.Speed,Encoded.State
0,0.00,1,3,33,4.33,0
1,0.00,1,3,32,3.86,0
2,0.16,1,3,35,9.73,0
3,0.00,1,3,32,6.86,0
4,0.01,1,3,19,7.80,1
...,...,...,...,...,...,...
16738,0.08,1,1,23,19.98,52
16739,0.00,1,1,26,15.16,52
16740,0.00,1,1,26,1.65,52
16741,0.06,1,1,24,18.16,52


In [13]:
# Regression target: the average-temperature column.
target_column = "Data.Temperature.Avg Temp"
y = data[target_column]
y

0        39
1        39
2        46
3        45
4        34
         ..
16738    23
16739    32
16740    17
16741    23
16742    21
Name: Data.Temperature.Avg Temp, Length: 16743, dtype: int64

In [30]:
# Split features and target into train/test parts; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)


In [31]:
# Build and fit the auto-sklearn regression ensemble.
# Both limits are expressed in seconds: the whole search is capped at
# TIME_BUDGET_SECONDS and each individual model run at PER_RUN_LIMIT_SECONDS.
# (The previous comment quoted 120 s / 30 s, which did not match the values used.)
TIME_BUDGET_SECONDS = 300
PER_RUN_LIMIT_SECONDS = 45

automl = autosklearn.regression.AutoSklearnRegressor(
    time_left_for_this_task=TIME_BUDGET_SECONDS,
    per_run_time_limit=PER_RUN_LIMIT_SECONDS,
    tmp_folder='/tmp/testing',
)
automl.fit(X_train, y_train, dataset_name='temp')

AutoSklearnRegressor(per_run_time_limit=45, time_left_for_this_task=300,
                     tmp_folder='/tmp/testing')

In [32]:
# Print the leaderboard: the best runs and the weight each one has in the ensemble.
leaderboard = automl.leaderboard()
print(leaderboard)
# For more detail on each model: pprint(automl.show_models(), indent=4)

          rank  ensemble_weight               type      cost   duration
model_id                                                               
3            1             0.30  gradient_boosting  0.118352   5.905216
13           2             0.36  gradient_boosting  0.118409   3.233913
19           3             0.28  gradient_boosting  0.123061   6.744082
2            4             0.06      random_forest  0.146328  16.343871


In [33]:
# Predict the target for the held-out test features with the fitted AutoML ensemble.
test_predictions = automl.predict(X_test)

In [34]:
from sklearn.metrics import mean_absolute_error

# Score the AutoML predictions against the held-out targets, then report both metrics.
automl_r2 = sklearn.metrics.r2_score(y_test, test_predictions)
automl_mae = mean_absolute_error(y_test, test_predictions)
print("Sklearn AutoML R2 score:", automl_r2)
print("Sklearn AutoML Mean Absolute Error:", automl_mae)

Sklearn AutoML R2 score: 0.8987710001250424
Sklearn AutoML Mean Absolute Error: 4.124299030192107


In [35]:
from sklearn.ensemble import RandomForestRegressor

# Baseline: random forest with default hyperparameters, trained on the same
# train split and evaluated on the same test split as the AutoML ensemble.
# random_state is fixed (same seed as the train/test split) so the forest's
# internal randomness, and therefore the reported scores, is repeatable.
rf = RandomForestRegressor(random_state=1)
rf_model = rf.fit(X_train, y_train)  # fit() returns the fitted estimator itself
rf_pred = rf_model.predict(X_test)

In [36]:
# Score the random forest predictions against the held-out targets, then report both metrics.
rf_r2 = sklearn.metrics.r2_score(y_test, rf_pred)
rf_mae = sklearn.metrics.mean_absolute_error(y_test, rf_pred)
print("Sklearn Random Forest Regressor R2 score:", rf_r2)
print("Sklearn Random Forest Regressor Mean Absolute Error:", rf_mae)

Sklearn Random Forest Regressor R2 score: 0.8744001443572691
Sklearn Random Forest Regressor Mean Absolute Error: 4.519054188565058


In [37]:
from sklearn.ensemble import GradientBoostingRegressor

# Baseline: gradient boosting with default hyperparameters, trained on the same
# train split and evaluated on the same test split as the AutoML ensemble.
# random_state is fixed (same seed as the train/test split) so repeated runs
# of this cell produce the same model and scores.
gb = GradientBoostingRegressor(random_state=1)
gb_model = gb.fit(X_train, y_train)  # fit() returns the fitted estimator itself
gb_pred = gb_model.predict(X_test)

In [38]:
# Score the gradient boosting predictions against the held-out targets, then report both metrics.
gb_r2 = sklearn.metrics.r2_score(y_test, gb_pred)
gb_mae = sklearn.metrics.mean_absolute_error(y_test, gb_pred)
print("Sklearn Gradient Boosting Regressor R2 score:", gb_r2)
print("Sklearn Gradient Boosting Regressor Mean Absolute Error:", gb_mae)

Sklearn Gradient Boosting Regressor R2 score: 0.8197489730827267
Sklearn Gradient Boosting Regressor Mean Absolute Error: 5.9203894859708095
