In [24]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which data files are available under ../input
input_files = os.listdir("../input")
print(input_files)

# Any results you write to the current directory are saved as output.

['test.csv', 'sampleSubmission.csv', 'train.csv']


In [25]:
# Load the training and test sets; column 0 of each file is parsed as dates
csv_paths = ('../input/train.csv', '../input/test.csv')
df, test = (pd.read_csv(path, parse_dates=[0]) for path in csv_paths)

In [26]:
# (rows, columns) of the training frame followed by the test frame
tuple(frame.shape for frame in (df, test))

((10886, 12), (6493, 9))

In [27]:
# Replace the target with its natural log; predictions are mapped back
# with np.exp before the submission file is written.
# NOTE(review): np.log yields -inf for a zero count - presumably every
# training count is >= 1; confirm against the data.
log_count = np.log(df['count'])
df['count'] = log_count

In [28]:
# Pre-processing

# Stack the training and test rows into a single frame so the feature
# engineering that follows is applied to both at once.
# `pd.concat` replaces `DataFrame.append`, which is deprecated and was
# removed in pandas 2.0. As with `append`, each frame keeps its own index
# labels, and `sort=True` orders the union of columns alphabetically.
# Columns that only exist in the training data (casual, count, registered)
# are NaN on the test rows.
df = pd.concat([df, test], sort=True)

In [29]:
# Print dtypes and non-null counts for every column of the combined frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17379 entries, 0 to 6492
Data columns (total 12 columns):
atemp         17379 non-null float64
casual        10886 non-null float64
count         10886 non-null float64
datetime      17379 non-null datetime64[ns]
holiday       17379 non-null int64
humidity      17379 non-null int64
registered    10886 non-null float64
season        17379 non-null int64
temp          17379 non-null float64
weather       17379 non-null int64
windspeed     17379 non-null float64
workingday    17379 non-null int64
dtypes: datetime64[ns](1), float64(6), int64(5)
memory usage: 1.7 MB


In [30]:
# Feature engineering: expand the `datetime` column into one numeric
# column per calendar component, added in the order listed below
dt_accessor = df['datetime'].dt
for part in ('year', 'month', 'day', 'dayofweek', 'hour'):
    df[part] = getattr(dt_accessor, part)

In [31]:
# Put the rows in chronological order (the rolling features computed
# afterwards operate over consecutive rows)
df = df.sort_values('datetime')

In [32]:
# rolling_temp: mean of `temp` over the current row and up to three rows
# before it; min_periods=1 lets the first rows use a shorter window
# instead of producing NaN
temp_window = df['temp'].rolling(window=4, min_periods=1)
df['rolling_temp'] = temp_window.mean()

In [33]:
# rolling_atemp: mean of `atemp` over the current row and up to three rows
# before it; min_periods=1 lets the first rows use a shorter window
# instead of producing NaN
atemp_window = df['atemp'].rolling(window=4, min_periods=1)
df['rolling_atemp'] = atemp_window.mean()

In [34]:
# Split the combined frame back apart: test rows are the ones that have no
# target value. The mask is computed once and each side is taken as an
# explicit copy, so that later column writes (e.g. storing predictions in
# `test`) act on an independent frame and do not raise pandas'
# SettingWithCopyWarning.
is_test = df['count'].isnull()
test = df[is_test].copy()
df = df[~is_test].copy()

In [35]:
# (rows, columns) of the training frame followed by the test frame
tuple(frame.shape for frame in (df, test))

((10886, 19), (6493, 19))

In [36]:
# Separando o df em treino e validação
from sklearn.model_selection import train_test_split

In [37]:
# Random hold-out split with a fixed seed; 25% of the rows go to validation
# (the default used when neither test_size nor train_size is given)
train, valid = train_test_split(df, test_size=0.25, random_state=42)

In [38]:
# (rows, columns) of the training split followed by the validation split
tuple(frame.shape for frame in (train, valid))

((8164, 19), (2722, 19))

In [39]:
# Select the columns used for training and validation

# Columns that must not be fed to the model: the target itself, the two
# columns that exist only in the training data, and the raw timestamp
removed_cols = ['count', 'casual', 'registered', 'datetime']

# Every remaining column is a feature, kept in the frame's column order
feats = []
for col in df.columns:
    if col in removed_cols:
        continue
    feats.append(col)

In [41]:
# Usar o modelo de RandomForest

# Importar o modelo
from sklearn.ensemble import RandomForestRegressor

In [42]:
# Baseline model: a random forest with library defaults and a fixed seed
baseline_seed = 42
rf = RandomForestRegressor(random_state=baseline_seed)

In [44]:
# Fit the model on the training split (the target is the log-transformed count)
X_train, y_train = train[feats], train['count']
rf.fit(X_train, y_train)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [45]:
# Predict the (log-scale) target for the validation split
X_valid = valid[feats]
preds = rf.predict(X_valid)

In [46]:
# Analisar as previsões com base na métrica

# Importando a métrica
from sklearn.metrics import mean_squared_error

In [47]:
# Root mean squared error on the validation split; since `count` holds
# log values, this is an error on the log scale
mse_valid = mean_squared_error(valid['count'], preds)
mse_valid ** 0.5

0.3542274745096707

In [48]:
# Improved forest: 200 trees, same seed, with n_jobs=-1 for parallel fitting
rf = RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=42)

In [49]:
# Fit the model on the training split (the target is the log-transformed count)
X_train, y_train = train[feats], train['count']
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [50]:
# Predict the (log-scale) target for the validation split
X_valid = valid[feats]
preds = rf.predict(X_valid)

In [51]:
# Root mean squared error on the validation split; since `count` holds
# log values, this is an error on the log scale
mse_valid = mean_squared_error(valid['count'], preds)
mse_valid ** 0.5

0.32538436026486894

In [52]:
# Prepare the Kaggle submission

# Predict the (log-scale) target for the test rows
X_test = test[feats]
preds_test = rf.predict(X_test)

In [53]:
# Attach the predictions to the test frame, undoing the log transform that
# was applied to the training target (np.exp is the inverse of np.log).
# `assign` builds a new frame rather than writing a column into `test`,
# which was obtained by boolean-filtering `df`; the in-place column write
# on such a slice raises pandas' SettingWithCopyWarning.
test = test.assign(count=np.exp(preds_test))

In [54]:
# Write the submission file for Kaggle: timestamp and predicted count,
# without the index column
submission = test[['datetime', 'count']]
submission.to_csv('rf.csv', index=False)