In [1]:
%matplotlib inline
from feature_engineering import *

#regression imports 
from sklearn.linear_model import LinearRegression,Ridge,Lasso,RidgeCV
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

#model selection
from sklearn.model_selection import train_test_split,cross_validate

from sklearn.metrics import mean_squared_log_error

  from numpy.core.umath_tests import inner1d


## Import data

In [2]:
# Read in data
train_df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')

In [3]:
# drop end station 
train_df = train_df.drop(['end_station'], axis = 1)
#test_df = test_df.drop(['end_station'], axis = 1)
train_df = train_df.rename({'start_station': 'station', 'total_in': 'out_count', 'total_out': 'in_count'}, axis=1)
train_df = train_df.rename({'start_station': 'station', 'total_in': 'out_count', 'total_out': 'in_count'}, axis=1) 

In [4]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,station,start_date,start_hour,member_type,weather_description,temp_in_f,humidity_in_%,pressure,precipitation_in_inches,...,wind_speed_in_mph,temp_in_f_delta,pressure_delta,humidity_delta,visibility_delta,precipitation_delta,wind_speed_delta,out_count,in_count,flow
0,0,10th & E St NW,2018-01-01,2,Member,Clear,14.0,64.0,30.0,0.0,...,7.0,4.0,0.0,-8.0,0.0,0.0,6.0,1,1.0,0.0
1,1,10th & E St NW,2018-01-01,12,Member,Partly Cloudy,21.0,38.0,30.0,0.0,...,7.7,0.0,0.0,-4.0,0.0,0.0,-1.3,1,1.0,0.0
2,2,10th & E St NW,2018-01-01,15,Member,Partly Cloudy,23.0,36.0,30.0,0.0,...,10.3,1.0,0.0,1.0,0.0,0.0,2.6,1,1.0,0.0
3,3,10th & E St NW,2018-01-01,21,Member,Clear,13.0,55.0,30.0,0.0,...,7.0,-4.0,0.0,8.0,0.0,0.0,-0.7,1,1.0,0.0
4,4,10th & E St NW,2018-01-02,10,Member,Clear,17.0,50.0,30.0,0.0,...,6.3,5.0,0.0,-15.0,0.0,0.0,0.3,1,1.0,0.0


In [5]:
test_df.head()

Unnamed: 0.1,Unnamed: 0,start_station,start_date,start_hour,member_type,weather_description,temp_in_f,humidity_in_%,pressure,precipitation_in_inches,...,wind_speed_in_mph,temp_in_f_delta,pressure_delta,humidity_delta,visibility_delta,precipitation_delta,wind_speed_delta,total_in,total_out,flow
0,0,10th & E St NW,2018-01-19,8,Member,Clear,37.0,34.0,30.0,0.0,...,5.7,7.0,0.0,-17.0,0.0,0.0,4.7,1,1.0,0.0
1,1,10th & E St NW,2018-01-19,10,Member,Clear,46.0,23.0,30.0,0.0,...,8.0,3.0,0.0,-1.0,0.0,0.0,-0.3,1,1.0,0.0
2,2,10th & E St NW,2018-01-19,13,Member,Clear,51.0,16.0,30.0,0.0,...,8.3,2.0,0.0,-3.0,0.0,0.0,1.3,2,6.0,-4.0
3,3,10th & E St NW,2018-01-19,15,Member,Clear,52.0,17.0,30.0,0.0,...,7.7,-2.0,0.0,2.0,0.0,0.0,0.0,3,1.0,2.0
4,4,10th & E St NW,2018-01-19,16,Casual,Clear,47.0,22.0,30.0,0.0,...,5.3,-5.0,0.0,5.0,-1.0,0.0,-2.4,1,1.0,0.0


## Feature engineering

In [6]:
# do feature engineering 
train_df = holiday(train_df)
train_df = season(train_df)
train_df = date_feature(train_df)
train_df = member_type(train_df)
train_df = weather(train_df)
train_df = wind_direction(train_df)
train_df = top_i_station_onehot(train_df)

test_df = holiday(test_df)
test_df = season(test_df)
test_df = date_feature(test_df)
test_df = member_type(test_df)
test_df = weather(test_df)
test_df = wind_direction(test_df)

In [7]:
train_df.fillna(0, inplace = True) # for now we put 0s in nans 

## Models

In [None]:
# Splitting train data into cross validation elements.
# Right now we predict total_in
X_train, X_test, y_train, y_test = train_test_split(train_df.drop(['station', 'in_count', 'out_count', 'flow'], axis = 1),
                                                    train_df['in_count'], test_size = 0.20, random_state = 42)

# SVR removed for now cause f**
models = [RandomForestRegressor(max_depth = 5, n_estimators = 100), AdaBoostRegressor(), BaggingRegressor(), KNeighborsRegressor()] #SVR()
model_names = ['RandomForest', 'AdaBoost', 'Bagging', 'Kneighnours'] # SVR
rmse = []
d = {}
for model in range (len(models)):
    print('Running model: ', model_names[model])
    clf = models[model]
    clf.fit(X_train, y_train)
    print('model fitted..')
    test_pred = clf.predict(X_test)
    print('predictions done..')
    model_rmse = np.sqrt(mean_squared_log_error(test_pred, y_test))
    rmse.append(model_rmse)
    print(model_rmse)

d = {'Modelling Algo':model_names,'RMSE':rmse} 

rmse_df = pd.DataFrame(d)
rmse_df

Running model:  RandomForest


In [None]:
### Tuning the model 
