In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# Running this cell (click Run or press Shift+Enter) prints the names of the entries in that directory.

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the data. Column 0 of the training file is parsed as dates at read time;
# the test file is read here without any date parsing.
train = pd.read_csv('../input/train.csv', parse_dates=[0])

test = pd.read_csv('../input/test.csv')

In [None]:
# Compare the (rows, columns) shapes of the two frames
train.shape, test.shape

In [None]:
# Check column dtypes and non-null counts of the training frame
train.info()

In [None]:
# Column dtypes and non-null counts of the test frame
test.info()

In [None]:
# Preview the first training rows, transposed so each column is displayed as a row
train.head().T

In [None]:
# Preview the first test rows, transposed so each column is displayed as a row
test.head().T

In [None]:
# Convert the test frame's 'datetime' column to pandas datetimes
test['datetime'] = pd.to_datetime(test['datetime'])

In [None]:
# Re-check the test frame's dtypes after the datetime conversion
test.info()

In [None]:
# Replace 'count' with its natural logarithm.
# NOTE(review): the column is overwritten in place, so re-running this cell applies the log a second time;
# a zero count would become -inf — confirm the data contains none.
train['count'] = np.log(train['count'])

In [None]:
# Stack train and test into a single frame so every transformation is applied to both.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# Original row labels are kept (no ignore_index), and columns that exist in only one
# frame are filled with NaN for the rows of the other.
df = pd.concat([train, test], sort=False)

In [None]:
# Inspect the last rows of the combined frame (transposed)
df.tail().T

In [None]:
# Put the combined rows in chronological order (rebinding the name rather than sorting in place)
df = df.sort_values('datetime')

## Feature Engineering

In [None]:
# Derive calendar features from the timestamp through the .dt accessor;
# each new column is named after the accessor attribute it is read from.
datetime_parts = df['datetime'].dt
for part in ('hour', 'day', 'dayofweek', 'month', 'year'):
    df[part] = getattr(datetime_parts, part)

In [None]:
# Temperature gap: 'atemp' minus 'temp'
df['diff_temp'] = df['atemp'] - df['temp']

In [None]:
# Lagged temperature: the 'temp' value found 1 and 2 rows earlier in the current row order
# (the previous hours, provided the rows are sorted by time and have no gaps).
# Lags start at 1 because a lag of 0 would simply duplicate 'temp'.
for lag in (1, 2):
    df[f'temp_shift_{lag}'] = df['temp'].shift(lag)

In [None]:
# Lagged 'atemp': the value found 1 and 2 rows earlier in the current row order
for lag, column in ((1, 'atemp_shift_1'), (2, 'atemp_shift_2')):
    df[column] = df['atemp'].shift(lag)

In [None]:
# Lagged temperature gap: the 'diff_temp' value found 1 and 2 rows earlier in the current row order
for lag, column in ((1, 'diff_shift_1'), (2, 'diff_shift_2')):
    df[column] = df['diff_temp'].shift(lag)

In [None]:
# Preview the first rows of the combined frame with the new columns (transposed)
df.head().T

In [None]:
# Rolling mean of 'temp' over a window of up to 4 consecutive rows (the current row and the 3 before it);
# min_periods=1 makes the first rows average whatever is available instead of yielding NaN.
# NOTE(review): the window counts rows, so it spans 4 hours only if the hourly series has no gaps — confirm.
df['rolling_temp'] = df['temp'].rolling(4, min_periods=1).mean()

In [None]:
# Check the result: raw temperature next to its rolling mean for the first 10 rows
df[['temp', 'rolling_temp']].head(10)

## Treinando o modelo

In [None]:
# Split the combined frame back apart: rows that have a 'count' value are training rows,
# rows where 'count' is missing are test rows.
has_target = df['count'].notnull()
train = df[has_target]
test = df[~has_target]

train.shape, test.shape

In [None]:
# Keep a separate copy of the training frame as it stands at this point
train_raw = train.copy()

In [None]:
# Split the training rows into training and validation subsets (fixed seed for a repeatable split)
from sklearn.model_selection import train_test_split
train, valid = train_test_split(train, random_state=42)

train.shape, valid.shape

In [None]:
# Columns that are not used as model inputs; 'count' is the column the models are trained to predict.
# NOTE(review): 'casual' and 'registered' are presumably excluded because they are components of 'count' — confirm.
remove_cols = ['count', 'casual', 'registered', 'datetime']

In [None]:
# Columns to train on: every column of the training frame that is not listed in remove_cols
cols = [column for column in train.columns if column not in remove_cols]

cols

In [None]:
# Feature list used for training: all training-frame columns except those in remove_cols
feats = [c for c in train.columns if c not in remove_cols]

In [None]:
# Importar os modelos
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

In [None]:
# Candidate regressors keyed by display name; every estimator that accepts random_state is seeded with 42
models = {'Random Forest': RandomForestRegressor(random_state=42),
         'ExtraTrees': ExtraTreesRegressor(random_state=42),
         'GradientBoosting': GradientBoostingRegressor(random_state=42),
         'DecisionTree': DecisionTreeRegressor(random_state=42),
         'AdaBoost': AdaBoostRegressor(random_state=42),
         'KNN 11': KNeighborsRegressor(n_neighbors=11),
         'SVR': SVR(),
         'Linear Regression': LinearRegression()}

In [None]:
# Import the evaluation metric
from sklearn.metrics import mean_squared_error

In [None]:
# Helper that trains one model and scores it on the validation rows
def run_model(model, train, valid, feats, y_name):
    """Fit ``model`` on the training rows and return its RMSE on the validation rows.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
    train, valid : frames indexable by ``feats`` (a list of columns) and by ``y_name``
    feats : list of feature column names
    y_name : name of the target column

    Returns
    -------
    Square root of the mean squared error between ``valid[y_name]`` and the
    model's predictions for ``valid[feats]``.
    """
    X_train, y_train = train[feats], train[y_name]
    model.fit(X_train, y_train)
    predictions = model.predict(valid[feats])
    mse = mean_squared_error(valid[y_name], predictions)
    return mse ** 0.5

In [None]:
# Train and score every candidate model on the same train/validation split.
# Missing values are replaced with -1 before fitting. The fill does not depend on the
# model, so it is done once up front instead of rebuilding both frames on every iteration.
train_filled = train.fillna(-1)
valid_filled = valid.fillna(-1)

scores = []
for name, model in models.items():
    score = run_model(model, train_filled, valid_filled, feats, 'count')
    scores.append(score)
    print(name, ':', score)