In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import LinearSVR
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import SGDRegressor

from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_log_error

In [2]:
train_X = pd.read_csv('data.csv')
train_y = pd.read_csv('train_label.csv')
test_X = pd.read_csv('test.csv')
test_y = pd.read_csv('test_label.csv')

In [3]:
train_X.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,5/2/2012 19:00,Summer,0,1,Clear + Few clouds,22.14,25.76,77,16.9979
1,9/5/2012 4:00,Fall,0,1,Clear + Few clouds,28.7,33.335,79,19.0012
2,1/13/2011 9:00,Spring,0,1,Clear + Few clouds,5.74,6.06,50,22.0028
3,11/18/2011 16:00,Winter,0,1,Clear + Few clouds,13.94,16.665,29,8.9981
4,9/13/2011 13:00,Fall,0,1,Clear + Few clouds,30.34,33.335,51,19.0012


# Dropping off outliers

In [4]:
indexes = train_X[train_X['windspeed'] > 30].index
train_X.drop(indexes, axis=0, inplace=True)
train_y.drop(indexes, axis=0, inplace=True)

# Splitting Weather into two categorical variables

In [5]:
# import the BaseEstimator
from sklearn.base import BaseEstimator

# define the class OutletTypeEncoder
# This will be our custom transformer that will create 3 new binary columns
# custom transformer must have methods fit and transform
class WeatherTransformer(BaseEstimator):

    def __init__(self):
        pass

    def fit(self, documents, y=None):
        return self

    def transform(self, x_dataset):
        if 'weather' in x_dataset.columns:
            x_dataset['weather'] = x_dataset['weather'].str.replace(',','+')
            x_dataset[['weather1','weather2']] = x_dataset['weather'].str.split('+', 2, expand=True)
        return x_dataset


# Extracting multiple features from datetime

In [6]:
# import the BaseEstimator
from sklearn.base import BaseEstimator

# define the class OutletTypeEncoder
# This will be our custom transformer that will create 3 new binary columns
# custom transformer must have methods fit and transform
class DateTimeEncoder(BaseEstimator):

    def __init__(self):
        pass

    def fit(self, documents, y=None):
        return self

    def transform(self, x_dataset):
        if 'datetime' in x_dataset.columns:
            x_dataset['datetime'] = pd.to_datetime(x_dataset['datetime'])
            x_dataset['year'] = x_dataset.datetime.dt.year
            x_dataset['month'] = x_dataset.datetime.dt.month 
            x_dataset['day'] = x_dataset.datetime.dt.day
            x_dataset['hour'] = x_dataset.datetime.dt.hour     
        
        return x_dataset
        

# Dropping of unwanted columns

In [7]:
# import the BaseEstimator
from sklearn.base import BaseEstimator

# define the class OutletTypeEncoder
# This will be our custom transformer that will create 3 new binary columns
# custom transformer must have methods fit and transform
class DropColumns(BaseEstimator):

    def __init__(self):
        self.columns = ['datetime','atemp','weather']
        pass

    def fit(self, documents, y=None):
        return self

    def transform(self, x_dataset:pd.DataFrame):
        for column in self.columns:
            if column in x_dataset.columns:
                x_dataset.drop(columns=[column], axis=1, inplace=True)        
        
        return x_dataset
        

In [8]:
feature_transformer = Pipeline(steps=[
                                ('split_weather', WeatherTransformer()),
                                 ('split_datetime', DateTimeEncoder()),
                                 ('drop',DropColumns())])

# 7. Converting categorical variables into one hot encoder

In [9]:
categorical_features = ['season', 'weather1', 'weather2']

categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse=False)

preprocessor = ColumnTransformer(remainder='passthrough',
    transformers=[
        ('cat', categorical_transformer, categorical_features)])

In [10]:
regressors = {'Linear': LinearRegression(), 
              'Lasso': Lasso(), 
              'Ridge': Ridge(), 
              'DecisionTree': DecisionTreeRegressor(),
              'SVR': LinearSVR(),
             'RandomForest': RandomForestRegressor(),
             'AdaBoost': AdaBoostRegressor(),
             'GBoost': GradientBoostingRegressor(),
             'Naive Bayes': GaussianNB(),
             'KNN': KNeighborsRegressor(),
             'SBD':SGDRegressor()}

In [11]:
regression_df = pd.DataFrame(columns=['Model', 'Score'])
for model in regressors:
    clf = Pipeline(steps=[('feature_transformer', feature_transformer),
                        ('preprocessor', preprocessor),
                      ('regression', regressors[model])])
    clf.fit(train_X, train_y)
    score = "%.3f" % clf.score(test_X, test_y)
    regression_df = regression_df.append({'Model':model, 'Score':score}, ignore_index=True)

In [12]:
regression_df.sort_values(by=['Score'], ascending=False)

Unnamed: 0,Model,Score
5,RandomForest,0.943
3,DecisionTree,0.885
7,GBoost,0.863
6,AdaBoost,0.668
9,KNN,0.5
1,Lasso,0.4
0,Linear,0.398
2,Ridge,0.398
8,Naive Bayes,0.0
10,SBD,-3.946579511422973e+24


In [13]:
kf = KFold(n_splits=5)

X = train_X
y = train_y

kf.get_n_splits(X)
print(kf)

KFold(n_splits=5, random_state=None, shuffle=False)


In [14]:
scores = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    clf = Pipeline(steps=[('feature_transformer', feature_transformer),
                        ('preprocessor', preprocessor),
                      ('regression', RandomForestRegressor())])
    clf.fit(X_train, y_train)
    scores.append(float("%.3f" % clf.score(X_test, y_test)))
scores

[0.933, 0.943, 0.941, 0.942, 0.93]

In [15]:
np.average(scores)

0.9377999999999999

# 11. Perform Hyper-parameter tuning on the best model using GridSearchCVand print the best parameters using model.best_params_

In [39]:
pipeline = Pipeline(steps=[('feature_transformer', feature_transformer),
                        ('preprocessor', preprocessor),
                      ('clf', RandomForestRegressor())])


tuned_parameters = {'clf__bootstrap': [True, False],
 'clf__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'clf__max_features': ['auto', 'sqrt'],
 'clf__min_samples_leaf': [1, 2, 4],
 'clf__min_samples_split': [2, 5, 10],
 'clf__n_estimators': [100]}

clf = GridSearchCV(pipeline, tuned_parameters, n_jobs=-1, verbose=10)
clf.fit(X_train, y_train)
#score = "%.3f" % clf.score(test_X, test_y)
print("Best Params",clf.best_params_)

Fitting 5 folds for each of 396 candidates, totalling 1980 fits
Best Params {'clf__bootstrap': True, 'clf__max_depth': None, 'clf__max_features': 'auto', 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 2, 'clf__n_estimators': 100}


In [40]:
pd.DataFrame(clf.cv_results_).sort_values(by=['rank_test_score'])

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__bootstrap,param_clf__max_depth,param_clf__max_features,param_clf__min_samples_leaf,param_clf__min_samples_split,param_clf__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
180,3.612402,0.086250,0.076602,0.009329,True,,auto,1,2,100,"{'clf__bootstrap': True, 'clf__max_depth': Non...",0.927657,0.932990,0.944653,0.942035,0.938153,0.937098,0.006141,1
144,3.977589,0.294480,0.069804,0.003761,True,90,auto,1,2,100,"{'clf__bootstrap': True, 'clf__max_depth': 90,...",0.928109,0.932360,0.943980,0.941026,0.937264,0.936547,0.005737,2
162,4.233001,0.070082,0.094197,0.016067,True,100,auto,1,2,100,"{'clf__bootstrap': True, 'clf__max_depth': 100...",0.929357,0.931396,0.944002,0.940604,0.935798,0.936231,0.005483,3
126,3.605600,0.033058,0.069596,0.007118,True,80,auto,1,2,100,"{'clf__bootstrap': True, 'clf__max_depth': 80,...",0.929032,0.931789,0.943281,0.940795,0.935876,0.936154,0.005333,4
72,4.314803,0.126315,0.090599,0.021608,True,50,auto,1,2,100,"{'clf__bootstrap': True, 'clf__max_depth': 50,...",0.929647,0.932416,0.942643,0.940810,0.935202,0.936144,0.004919,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17,0.963448,0.037340,0.060405,0.012094,True,10,sqrt,4,10,100,"{'clf__bootstrap': True, 'clf__max_depth': 10,...",0.713318,0.729229,0.738290,0.722838,0.716197,0.723974,0.009032,392
16,0.916445,0.036693,0.060805,0.014389,True,10,sqrt,4,5,100,"{'clf__bootstrap': True, 'clf__max_depth': 10,...",0.710465,0.728451,0.740834,0.734577,0.701262,0.723118,0.014909,393
13,0.854834,0.035753,0.062006,0.008653,True,10,sqrt,2,5,100,"{'clf__bootstrap': True, 'clf__max_depth': 10,...",0.722011,0.725874,0.736724,0.712046,0.711588,0.721649,0.009367,394
11,1.080051,0.033983,0.089608,0.022707,True,10,sqrt,1,10,100,"{'clf__bootstrap': True, 'clf__max_depth': 10,...",0.709065,0.720715,0.735931,0.712763,0.708422,0.717379,0.010256,395


# 12. Perform prediction on the test set and print the mean_squared_log_error

In [18]:
best_model = RandomForestRegressor(bootstrap= True, 
                  max_depth= None, 
                  max_features='auto', 
                  min_samples_leaf= 1, 
                  min_samples_split= 2, 
                  n_estimators= 1000)

In [20]:
clf = Pipeline(steps=[('feature_transformer', feature_transformer),
                        ('preprocessor', preprocessor),
                      ('regression', best_model)])
clf.fit(train_X, train_y)
y_predict = clf.predict(test_X)
print(f"mean_squared_log_error: {mean_squared_log_error(y_predict, test_y)}")

mean_squared_log_error: 0.12196866063978024
