In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import LinearSVR
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import SGDRegressor

In [2]:
train_X = pd.read_csv('data.csv')
train_y = pd.read_csv('train_label.csv')
test_X = pd.read_csv('test.csv')
test_y = pd.read_csv('test_label.csv')

In [3]:
train_X.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,5/2/2012 19:00,Summer,0,1,Clear + Few clouds,22.14,25.76,77,16.9979
1,9/5/2012 4:00,Fall,0,1,Clear + Few clouds,28.7,33.335,79,19.0012
2,1/13/2011 9:00,Spring,0,1,Clear + Few clouds,5.74,6.06,50,22.0028
3,11/18/2011 16:00,Winter,0,1,Clear + Few clouds,13.94,16.665,29,8.9981
4,9/13/2011 13:00,Fall,0,1,Clear + Few clouds,30.34,33.335,51,19.0012


In [4]:
# import the BaseEstimator
from sklearn.base import BaseEstimator

# define the class OutletTypeEncoder
# This will be our custom transformer that will create 3 new binary columns
# custom transformer must have methods fit and transform
class WeatherTransformer(BaseEstimator):

    def __init__(self):
        pass

    def fit(self, documents, y=None):
        return self

    def transform(self, x_dataset):
        if 'weather' in x_dataset.columns:            
            x_dataset[['weather1','weather2']] = x_dataset['weather'].str.split('+', 2, expand=True)
        return x_dataset


In [5]:
# import the BaseEstimator
from sklearn.base import BaseEstimator

# define the class OutletTypeEncoder
# This will be our custom transformer that will create 3 new binary columns
# custom transformer must have methods fit and transform
class DateTimeEncoder(BaseEstimator):

    def __init__(self):
        pass

    def fit(self, documents, y=None):
        return self

    def transform(self, x_dataset):
        if 'datetime' in x_dataset.columns:
            x_dataset['datetime'] = pd.to_datetime(x_dataset['datetime'])
            x_dataset['year'] = x_dataset.datetime.dt.year
            x_dataset['month'] = x_dataset.datetime.dt.month 
            x_dataset['day'] = x_dataset.datetime.dt.day
            x_dataset['hour'] = x_dataset.datetime.dt.hour     
        
        return x_dataset
        

In [6]:
# import the BaseEstimator
from sklearn.base import BaseEstimator

# define the class OutletTypeEncoder
# This will be our custom transformer that will create 3 new binary columns
# custom transformer must have methods fit and transform
class DropColumns(BaseEstimator):

    def __init__(self):
        self.columns = ['datetime','atemp','weather']
        pass

    def fit(self, documents, y=None):
        return self

    def transform(self, x_dataset:pd.DataFrame):
        for column in self.columns:
            if column in x_dataset.columns:
                x_dataset.drop(columns=[column], axis=1, inplace=True)        
        
        return x_dataset
        

In [7]:
feature_transformer = Pipeline(steps=[('split_weather', WeatherTransformer()),
                                 ('split_datetime', DateTimeEncoder()),
                                 ('drop',DropColumns())])

In [8]:
categorical_features = ['season', 'weather1', 'weather2']

categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse=False)

preprocessor = ColumnTransformer(remainder='passthrough',
    transformers=[
        ('cat', categorical_transformer, categorical_features)])

model score: 0.866


In [32]:
regressors = {'Linear': LinearRegression(), 
              'Lasso': Lasso(), 
              'Ridge': Ridge(), 
              'DecisionTree': DecisionTreeRegressor(),
              'SVR': LinearSVR(),
             'RandomForest': RandomForestRegressor(),
             'AdaBoost': AdaBoostRegressor(),
             'GBoost': GradientBoostingRegressor(),
             'Naive Bayes': GaussianNB(),
             'KNN': KNeighborsRegressor(),
             'SBD':SGDRegressor()}

In [46]:
regression_df = pd.DataFrame(columns=['Model', 'Score'])
for model in regressors:
    clf = Pipeline(steps=[('feature_transformer', feature_transformer),
                        ('preprocessor', preprocessor),
                      ('regression', regressors[model])])
    clf.fit(train_X, train_y)
    score = "%.3f" % clf.score(test_X, test_y)
    regression_df = regression_df.append({'Model':model, 'Score':score}, ignore_index=True)

Unnamed: 0,Model,Score
0,Linear,0.398
1,Lasso,0.4
2,Ridge,0.398
3,DecisionTree,0.886
4,SVR,0.347
5,RandomForest,0.943
6,AdaBoost,0.635
7,GBoost,0.866
8,Naive Bayes,0.0
9,KNN,0.498


In [47]:
regression_df.sort_values(by=['Score'], ascending=False)

Unnamed: 0,Model,Score
5,RandomForest,0.943
3,DecisionTree,0.886
7,GBoost,0.866
6,AdaBoost,0.635
9,KNN,0.498
1,Lasso,0.4
0,Linear,0.398
2,Ridge,0.398
4,SVR,0.347
8,Naive Bayes,0.0


In [44]:
"%.3f" % 3.980000e-01

'0.398'