In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition's train and test CSV files.
original_df = pd.read_csv(r'/kaggle/input/bike-sharing-demand/train.csv')
test_df = pd.read_csv(r'/kaggle/input/bike-sharing-demand/test.csv')

# Convert the 'datetime' strings to pandas timestamps in both frames.
for frame in (original_df, test_df):
    frame['datetime'] = pd.to_datetime(frame['datetime'])

# Stack the train rows on top of the test rows under a fresh 0..n-1 index so
# that feature engineering can be applied to both at once.
total_df = pd.concat([original_df, test_df], axis=0, ignore_index=True)

In [None]:
# Render every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None
# To lift the row limit as well: pd.set_option('display.max_rows', None)

import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Print the column dtypes and non-null counts of total_df.
total_df.info()

In [None]:
# Summary statistics of total_df as computed by DataFrame.describe().
total_df.describe()

In [None]:
# Number of distinct values in each column of total_df.
total_df.nunique()

In [None]:
# Expand the timestamp into calendar features; each new column is named after
# the .dt accessor attribute it is read from.
timestamp_parts = total_df['datetime'].dt
for part in ('year', 'month', 'hour', 'dayofweek'):
    total_df[part] = getattr(timestamp_parts, part)

In [None]:
# Preview the first five rows of total_df.
total_df.head()

In [None]:
# Repair outliers that make no physical sense.
# Some rows record a humidity of exactly zero (per the original author, all on
# the same day). Treat them as missing and impute the median humidity of the
# rows that share the same 'weather' value.
# Series.mask is used rather than chained indexing
# (total_df['humidity'][cond] = ...): the chained form assigns into a temporary
# object and leaves total_df untouched under pandas Copy-on-Write, which would
# keep the zeros and turn windspeed/humidity below into inf.
zero_humidity = total_df['humidity'] == 0
total_df['humidity'] = total_df['humidity'].mask(zero_humidity)
total_df['humidity'] = total_df['humidity'].fillna(
    total_df.groupby(['weather'])['humidity'].transform('median')
)

# Drop 'atemp' (highly correlated with 'temp') together with 'casual' and
# 'registered'.
# NOTE(review): 'casual'/'registered' presumably add up to 'count' and are not
# available at prediction time - confirm against the data description.
total_df = total_df.drop(columns=['atemp', 'casual', 'registered'])

# Derived features: pairwise ratios of the weather measurements and their sum.
# NOTE(review): the ratios divide by 'temp' and 'humidity'; a zero in either
# yields inf, which StandardScaler rejects - verify 'temp' is never 0.
total_df['humidity_temp'] = total_df['humidity'] / total_df['temp']
total_df['windspeed_temp'] = total_df['windspeed'] / total_df['temp']
total_df['windspeed_humidity'] = total_df['windspeed'] / total_df['humidity']
total_df['clima'] = total_df['humidity'] + total_df['temp'] + total_df['windspeed']

In [None]:
# Split the combined frame back into its train and test parts.
# total_df was built by stacking original_df on top of test_df under a fresh
# 0..n-1 index, so the first len(original_df) rows are the training rows.
# Splitting by position (instead of matching timestamps with isin) cannot put
# a row in both sets if a timestamp ever occurs in both files.
n_train = len(original_df)
train_rows = total_df.iloc[:n_train]
test_rows = total_df.iloc[n_train:]

# drop() without inplace returns new frames, so nothing is mutated through a
# slice of total_df (the in-place form raised SettingWithCopyWarning).
train = train_rows.drop(columns=['datetime'])
# Keep an untouched copy of the test rows: its 'datetime' column is needed to
# label the submission.
test_dat = test_rows.copy()
test = test_rows.drop(columns=['datetime', 'count'])

In [None]:
# Feature matrix: every training column except the target.
feature_frame = train.drop(columns=['count'])

# Standardize the features; the fitted scaler is reused later on the test set.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(feature_frame)
x = pd.DataFrame(scaled_values, columns=feature_frame.columns)

# Target kept as a single-column frame named 'count'.
y = train['count'].to_frame(name='count')

In [None]:
# Modelos de Regressão
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, BayesianRidge, LogisticRegression, TweedieRegressor, SGDRegressor, PassiveAggressiveRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score

def model_test(x, y, cv=5, scoring='neg_root_mean_squared_error'):
    """Cross-validate a panel of default-configured estimators on (x, y).

    Parameters
    ----------
    x : array-like of shape (n_samples, n_features)
        Feature matrix, passed unchanged to ``cross_val_score``.
    y : array-like of shape (n_samples,) or (n_samples, 1)
        Target values. A single-column 2-D target is flattened to 1-D, the
        shape scikit-learn estimators expect (a column vector triggers a
        ``DataConversionWarning`` in many of them).
    cv : int or cross-validation splitter, default=5
        Forwarded to ``cross_val_score``.
    scoring : str or callable, default='neg_root_mean_squared_error'
        Forwarded to ``cross_val_score``.

    Returns
    -------
    dict
        Maps each model's display name to the array of per-fold scores
        returned by ``cross_val_score``.
    """
    target = np.asarray(y)
    if target.ndim == 2 and target.shape[1] == 1:
        target = target.ravel()

    # All estimators use their library defaults.
    # NOTE(review): LogisticRegression is a classifier, not a regressor; it is
    # kept so the returned keys stay the same, but consider removing it from a
    # regression benchmark.
    models_dict = {
        'MLP': MLPRegressor(),
        "Linear Regression": LinearRegression(),
        "Ridge Regression": Ridge(),
        'Lasso Regression': Lasso(),
        "Bayesian Ridge": BayesianRidge(),
        'Logistic Regression': LogisticRegression(),
        'Tweedie Regressor': TweedieRegressor(),
        'SGD Regressor': SGDRegressor(),
        "Passive Aggressive": PassiveAggressiveRegressor(),
        'SVR': SVR(),
        'K Nearest': KNeighborsRegressor(),
        'Gaussian Process': GaussianProcessRegressor(),
        'PLS Regression': PLSRegression(),
        'Decision Tree': DecisionTreeRegressor(),
        'Random Forest': RandomForestRegressor(),
        'Extra Trees': ExtraTreesRegressor(),
        'Ada Boost': AdaBoostRegressor(),
        'Gradient Boosting': GradientBoostingRegressor(),
        'XGB Regressor': XGBRegressor(),
    }

    values_dict = {}
    for name, estimator in models_dict.items():
        # Print the name first so progress is visible while the model trains.
        print(name)
        values_dict[name] = cross_val_score(estimator, x, target, cv=cv, scoring=scoring)

    return values_dict

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Hyper-parameters of the final gradient-boosting model. 'loss' is left at its
# default (least squares): the old alias 'ls' was removed in scikit-learn 1.2
# and raises there, while the default is the same loss in every release.
params = {'n_estimators': 150, 'max_depth': 5, 'random_state': 0, 'min_samples_leaf': 10,
          'learning_rate': 0.1, 'subsample': 0.7}
gbm_model = GradientBoostingRegressor(**params)
# y is a single-column frame; flatten it to the 1-D target scikit-learn expects.
gbm_model.fit(x, np.ravel(y))

# Scale the test features with the scaler fitted on the training data, keeping
# the column names the model was fitted with.
test_scaled = pd.DataFrame(scaler.transform(test), columns=test.columns)
predict = gbm_model.predict(test_scaled)

# One row per test timestamp, indexed by 'datetime', with the predicted count.
predictions = pd.DataFrame(
    {'count': predict},
    index=pd.Index(test_dat['datetime'], name='datetime'),
)
# Negative predictions are set to zero, then all values are rounded to whole
# numbers. clip() replaces the chained-indexing assignment, which does not
# write through under pandas Copy-on-Write.
predictions['count'] = predictions['count'].clip(lower=0).round(0)
predictions.to_csv('submission.csv')

In [None]:
# Display the predictions frame.
predictions