In [2]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd
# BaseEstimator is the base class for all estimators in sklearn. It implements a fit method to learn from data.
# TransformerMixin is just an object that responds to fit, transform, and fit_transform. it is the mixin class for all transformers in sklearn.
# Only standard transformers such as MinMaxScaler, StandardScaler, LabelEncoder etc. are made available in sklearn but with the BaseEstimator and TransformerMixin, 
# we can create custom transformers to fit some data pre-processing needed

    
class Preprocess(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X=None, y=None):
        return self
        

    def transform(self, X=None):
        return self
    

class ExtractData(Preprocess):

    def transform(self, X = None):
        X['year'] = pd.DatetimeIndex(X['datetime']).year
        X['month'] = pd.DatetimeIndex(X['datetime']).month
        X['hour'] = pd.DatetimeIndex(X['datetime']).hour
        X['dayofweek'] = pd.DatetimeIndex(X['datetime']).day_name()
        return X

     
        
class DataType(Preprocess):
    categorical = ["season", "holiday", "workingday", "weather", "year", "month", "hour", "dayofweek"]

    def transform(self , X = None):
        d = {}
        for cat in self.categorical:
            d[cat]  = "category"
        X = X.astype(d)
        return X
    
#class Encoding(Preprocess):

    #def transform(self, X= None):
       # X = pd.get_dummies(X, columns=['weather', 'season', 'year', 'month', 'hour', 'dayofweek'])
        #return X
    
class Encoding(Preprocess): #the Encoding class has to have a fit method to ensure it creates the same set of columns for any input data. 

    def fit(self, X, y=None):
        self.columns = pd.get_dummies(X, columns=['weather', 'season', 'year', 'month', 'hour', 'dayofweek']).columns
        return self

    def transform(self, X=None):
        X = pd.get_dummies(X, columns=['weather', 'season', 'year', 'month', 'hour', 'dayofweek'])
        missing_cols = set(self.columns) - set(X.columns)
        for col in missing_cols:
            X[col] = 0
        return X[self.columns]

    
class Drop(Preprocess):
    
    def transform(self, X = None):
        try:
            X = X.drop(["casual", "registered", "atemp", "datetime","count"], axis =1)
        except:
            X = X.drop(["atemp", "datetime"], axis =1)
        return X



## Load data

In [5]:
train  = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")
y = np.log(train['count'])
display(train.head())
display(test.head())

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014


In [53]:
test.iloc[[100]]

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
100,2011-01-24 07:00:00,1,0,1,1,0.82,6.06,48,0.0


## test pipelines

In [44]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

preprocess = Pipeline([('extract',ExtractData()),
                            ('datatype', DataType()),
                            ('encoding', Encoding()),
                            ('drop',Drop())])

model = Pipeline([('scaler', StandardScaler()), 
                       ('svr', SVR())], verbose = True)

In [100]:
test = preprocess.fit_transform(X = test)
train = preprocess.fit_transform(X = train)

In [101]:
train

Unnamed: 0,holiday,workingday,temp,humidity,windspeed,weather_1,weather_2,weather_3,weather_4,season_1,...,hour_21,hour_22,hour_23,dayofweek_Friday,dayofweek_Monday,dayofweek_Saturday,dayofweek_Sunday,dayofweek_Thursday,dayofweek_Tuesday,dayofweek_Wednesday
0,0,0,9.84,81,0.0000,1,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
1,0,0,9.02,80,0.0000,1,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2,0,0,9.02,80,0.0000,1,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
3,0,0,9.84,75,0.0000,1,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,0,0,9.84,75,0.0000,1,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10881,0,1,15.58,50,26.0027,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
10882,0,1,14.76,57,15.0013,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
10883,0,1,13.94,61,15.0013,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
10884,0,1,13.94,61,6.0032,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [102]:
model.fit(train , y)
int(model.predict(test[:1]))

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing svr, total=   6.9s


2

# The final pipeline

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# define the preprocessing steps
preprocess = Pipeline([('extract', ExtractData()),
                       ('datatype', DataType()),
                       ('encoding', Encoding()),
                       ('drop', Drop())])

# define the model pipeline
model = Pipeline([('scaler', StandardScaler()),
                  ('svr', SVR())])

# combine the preprocessing and model pipelines
pipe = Pipeline([('preprocess', preprocess),
                 ('model', model)])

# fit the pipeline on training data
pipe.fit(train, y)

# make predictions on test data using the fitted pipeline
#y_pred = pipe.predict(test).round()


In [5]:
import numpy as np
row = test.iloc[[1]]  # get the first row of the test data as a DataFrame
prediction = int(pipe.predict(row)) # make a prediction on the row
prediction


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['year'] = pd.DatetimeIndex(X['datetime']).year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['month'] = pd.DatetimeIndex(X['datetime']).month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['hour'] = pd.DatetimeIndex(X['datetime']).hour
A value is trying to be set on a copy of a slice from a 

1

## Generate pickle file

In [6]:
import pickle

pickle.dump(pipe, open('pipe.pkl', 'wb'))

## Test the pickle file

In [4]:
import pickle
pipe = pickle.load(open('pipe.pkl', 'rb'))

In [51]:
pipe.predict(test.iloc[[100]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['year'] = pd.DatetimeIndex(X['datetime']).year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['month'] = pd.DatetimeIndex(X['datetime']).month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['hour'] = pd.DatetimeIndex(X['datetime']).hour
A value is trying to be set on a copy of a slice from a 

array([4.18412396])

In [48]:
import pandas as pd
input_list =['2011-01-20 00:00:00','1','0','1','1',10.66,11.365,56,26.0027]
# define the column names in the same order as the input_list
column_names = ['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed']

# create a DataFrame using the input_list and column names
input_df = pd.DataFrame([input_list], columns=column_names)

# make the prediction using the pipeline
prediction = pipe.predict(input_df)


In [50]:
int(prediction)

2