In [9]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score # para a exibição da acurácia do modelo
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score # função para validação cruzada

## Carregando Dados de treino e teste

In [10]:
# Read the training and test splits from CSV files in the working directory.
train = pd.read_csv(filepath_or_buffer="train.csv")
test = pd.read_csv(filepath_or_buffer="test.csv")


## Modelo Random Forest


In [11]:
# Random forest with 100 trees and a fixed seed so repeated runs match;
# n_jobs=-1 asks scikit-learn to use every available processor.
# NOTE(review): no fit call is visible in this notebook section; if the target
# is the numeric `count` column, confirm a classifier (not a regressor) is intended.
model = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=0,
)

O modelo só trabalha com variáveis numéricas.
A variável datetime é uma string; logo, vamos começar tratando essa coluna.
Primeiro, vamos separar a data da hora.

In [12]:
def splitDate(value):
    """Return the date portion of a "<date> <time>" string.

    The date is everything before the first space; a string that contains no
    space is returned unchanged.
    """
    date_part, _, _ = value.partition(" ")
    return date_part
def splitTime(value):
    """Return the time portion of a "<date> <time>" string.

    The time is the second space-separated field. Raises IndexError when the
    string contains no space.
    """
    fields = value.split(" ")
    time_part = fields[1]
    return time_part


# Derive the string columns "date" and "time" from the raw "datetime" column.
raw_stamps = train["datetime"]
train["date"] = raw_stamps.map(splitDate)
train["time"] = raw_stamps.map(splitTime)

train.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,date,time
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,2011-01-01,00:00:00
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,2011-01-01,01:00:00
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2011-01-01,02:00:00
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,2011-01-01,03:00:00
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,2011-01-01,04:00:00


Agora que separamos, podemos converter as strings de data e de hora em valores numéricos que sejam interessantes para o nosso modelo. Uma informação de interesse é saber se estamos lidando com finais de semana ou dias úteis. Além disso, outra informação interessante é saber se os aluguéis de bicicleta ocorrem no período da manhã, da tarde ou da noite. Para fazermos essas análises, iremos usar a biblioteca datetime.

In [13]:
from datetime import datetime

def isWeekend(value):
    """Flag whether a date string falls on a Saturday or Sunday.

    Args:
        value: Date formatted as "%Y-%m-%d", e.g. "2011-01-01".

    Returns:
        1 for Saturday or Sunday, 0 for any other day.

    Raises:
        ValueError: If ``value`` does not match "%Y-%m-%d".
    """
    # weekday() numbers Monday..Sunday as 0..6, so 5 and 6 are the weekend.
    # Comparing numbers rather than strftime('%A') day names keeps the result
    # independent of the active locale, which controls how day names are spelled.
    day_index = datetime.strptime(value, "%Y-%m-%d").weekday()
    return 1 if day_index >= 5 else 0

# Add a 0/1 "weekend" column computed from each row's date string.
weekend_flags = train["date"].map(isWeekend)
train["weekend"] = weekend_flags
train.head()


Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,date,time,weekend
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,2011-01-01,00:00:00,1
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,2011-01-01,01:00:00,1
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2011-01-01,02:00:00,1
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,2011-01-01,03:00:00,1
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,2011-01-01,04:00:00,1


Com a coluna weekend pronta, podemos agora trabalhar com a coluna time, a fim de dividir o dia em quatro períodos: madrugada (0), manhã (1), tarde (2) e noite (3).

In [14]:
def getPeriod(value):
    """Map a time string to a part-of-day code.

    Args:
        value: Time formatted as "%H:%M:%S", e.g. "13:00:00".

    Returns:
        0 for hours 0-5, 1 for hours 6-11, 2 for hours 12-17 and
        3 for hours 18-23.

    Raises:
        ValueError: If ``value`` does not match "%H:%M:%S".
    """
    hour = datetime.strptime(value, "%H:%M:%S").hour
    # The day is cut into four equal six-hour blocks, so integer division by
    # six yields the block index directly.
    return hour // 6


# The helper is named getPeriod (not isWeekend) so that it does not shadow the
# weekend helper defined in an earlier cell.
train["period"] = train["time"].map(getPeriod)
train.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,date,time,weekend,period
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,2011-01-01,00:00:00,1,0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,2011-01-01,01:00:00,1,0
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2011-01-01,02:00:00,1,0
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,2011-01-01,03:00:00,1,0
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,2011-01-01,04:00:00,1,0


In [15]:
# Lower edges of temperature bands 1, 2 and 3; anything below the first edge
# belongs to band 0.
TEMP_BAND_EDGES = [10, 20, 30]

# np.digitize returns, for each reading, the index of the half-open band
# [lower, upper) that contains it:
#   temp < 10 -> 0, 10 <= temp < 20 -> 1, 20 <= temp < 30 -> 2, temp >= 30 -> 3.
# Half-open bands put readings of exactly 10 or 20 in the band they open.
# The call is vectorised and does not rely on Series.iteritems(), which was
# removed in pandas 2.0.
train["faixa_temp"] = np.digitize(train["temp"], TEMP_BAND_EDGES)
train.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,date,time,weekend,period,faixa_temp
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,2011-01-01,00:00:00,1,0,0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,2011-01-01,01:00:00,1,0,0
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2011-01-01,02:00:00,1,0,0
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,2011-01-01,03:00:00,1,0,0
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,2011-01-01,04:00:00,1,0,0


In [None]:
# Draw pairwise plots relating every variable to every other one.
P = sns.pairplot(data=train)

In [None]:
# Box plot of temperature ("temp") grouped by "season".
sns.boxplot(data=train, x="season", y="temp")

In [None]:
# Another model to try (currently disabled): a support-vector classifier.
# NOTE(review): x_treino / y_treino are not defined in the visible cells, and
# the accuracy below would be measured on the same data used for fitting;
# confirm both points before enabling this cell.
#B_model = SVC(gamma="auto")
#B_model.fit(x_treino,y_treino)
#previsao = B_model.predict(x_treino)

#print('Acurácia com dados de Treinamento: ',accuracy_score(y_treino, previsao))