In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [2]:
train = pd.read_csv("./train.csv", parse_dates=["datetime"])
test = pd.read_csv("./test.csv", parse_dates=["datetime"])

In [3]:
train["year"] = train["datetime"].dt.year
train["month"] = train["datetime"].dt.month
train["day"] = train["datetime"].dt.day
train["hour"] = train["datetime"].dt.hour
train["minute"] = train["datetime"].dt.minute
train["second"] = train["datetime"].dt.second

test["year"] = test["datetime"].dt.year
test["month"] = test["datetime"].dt.month
test["day"] = test["datetime"].dt.day
test["hour"] = test["datetime"].dt.hour
test["minute"] = test["datetime"].dt.minute
test["second"] = test["datetime"].dt.second

In [4]:
train.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,day,hour,minute,second
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,2011,1,1,0,0,0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,2011,1,1,1,0,0
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2011,1,1,2,0,0
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,2011,1,1,3,0,0
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,2011,1,1,4,0,0


In [5]:
test.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,hour,minute,second
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,2011,1,20,0,0,0
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,20,1,0,0
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,20,2,0,0
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,20,3,0,0
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,20,4,0,0


In [6]:
train = train.drop(columns=["casual", "registered", "minute", "second"], axis=1)
test = test.drop(columns=["minute", "second"], axis=1)

In [7]:
train.corr()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,year,month,day,hour
season,1.0,0.029368,-0.008126,0.008879,0.258689,0.264744,0.19061,-0.147121,0.163439,-0.004797,0.971524,0.001729,-0.006546
holiday,0.029368,1.0,-0.250491,-0.007074,0.000295,-0.005215,0.001929,0.008409,-0.005393,0.012021,0.001731,-0.015877,-0.000354
workingday,-0.008126,-0.250491,1.0,0.033772,0.029966,0.02466,-0.01088,0.013373,0.011594,-0.002482,-0.003394,0.009829,0.00278
weather,0.008879,-0.007074,0.033772,1.0,-0.055035,-0.055376,0.406244,0.007261,-0.128655,-0.012548,0.012144,-0.00789,-0.02274
temp,0.258689,0.000295,0.029966,-0.055035,1.0,0.984948,-0.064949,-0.017852,0.394454,0.061226,0.257589,0.015551,0.14543
atemp,0.264744,-0.005215,0.02466,-0.055376,0.984948,1.0,-0.043536,-0.057473,0.389784,0.05854,0.264173,0.011866,0.140343
humidity,0.19061,0.001929,-0.01088,0.406244,-0.064949,-0.043536,1.0,-0.318607,-0.317371,-0.078606,0.204537,-0.011335,-0.278011
windspeed,-0.147121,0.008409,0.013373,0.007261,-0.017852,-0.057473,-0.318607,1.0,0.101369,-0.015221,-0.150192,0.036157,0.146631
count,0.163439,-0.005393,0.011594,-0.128655,0.394454,0.389784,-0.317371,0.101369,1.0,0.260403,0.166862,0.019826,0.400601
year,-0.004797,0.012021,-0.002482,-0.012548,0.061226,0.05854,-0.078606,-0.015221,0.260403,1.0,-0.004932,0.0018,-0.004234


In [8]:
train["tempband"] = pd.cut(train["temp"], 5)
train[["tempband", "count"]].groupby(['tempband'], as_index=False).mean().sort_values(by='tempband', ascending=True)

Unnamed: 0,tempband,count
0,"(0.78, 8.856]",67.601116
1,"(8.856, 16.892]",131.741819
2,"(16.892, 24.928]",194.850299
3,"(24.928, 32.964]",261.918255
4,"(32.964, 41.0]",339.297767


In [9]:
def setTemp(val):
    result = 0
    if val < 8.856: result = 0
    elif val <16.892: result = 1
    elif val <24.928: result = 2
    elif val <32.964: result = 3
    else: result = 4
    return result

In [10]:
train["temp"] = train["temp"].apply(setTemp)
test["temp"] = test["temp"].apply(setTemp)

In [11]:
train["atempband"] = pd.cut(train["humidity"], 5)
train[["atempband", "count"]].groupby(['atempband'], as_index=False).mean().sort_values(by='atempband', ascending=True)

Unnamed: 0,atempband,count
0,"(-0.1, 20.0]",180.384615
1,"(20.0, 40.0]",290.648718
2,"(40.0, 60.0]",220.54349
3,"(60.0, 80.0]",171.275872
4,"(80.0, 100.0]",109.783666


In [12]:
def setAtemp(val):
    result = 0
    if val < 9.699: result = 0
    elif val <18.638: result = 1
    elif val <27.577: result = 2
    elif val <36.516: result = 3
    else: result = 4
    return result

In [13]:
train["atemp"] = train["atemp"].apply(setAtemp)
test["atemp"] = test["atemp"].apply(setAtemp)

In [14]:
train["humidityband"] = pd.cut(train["humidity"], 5)
train[["humidityband", "count"]].groupby(['humidityband'], as_index=False).mean().sort_values(by='humidityband', ascending=True)

Unnamed: 0,humidityband,count
0,"(-0.1, 20.0]",180.384615
1,"(20.0, 40.0]",290.648718
2,"(40.0, 60.0]",220.54349
3,"(60.0, 80.0]",171.275872
4,"(80.0, 100.0]",109.783666


In [15]:
def setHumidity(val):
    result = 0
    if val < 20.0: result = 0
    elif val <40.0: result = 1
    elif val <60.0: result = 2
    elif val <80.0: result = 3
    else: result = 4
    return result

In [16]:
train["humidity"] = train["humidity"].apply(setHumidity)
test["humidity"] = test["humidity"].apply(setHumidity)

In [17]:
train["windband"] = pd.cut(train["windspeed"], 5)
train[["windband", "count"]].groupby(['windband'], as_index=False).mean().sort_values(by='windband', ascending=True)

Unnamed: 0,windband,count
0,"(-0.057, 11.399]",171.720719
1,"(11.399, 22.799]",210.086558
2,"(22.799, 34.198]",217.713115
3,"(34.198, 45.598]",200.992754
4,"(45.598, 56.997]",133.111111


In [18]:
def setWindBand(val):
    result = 0
    if val < 11.399: result = 0
    elif val <22.799: result = 1
    elif val <34.198: result = 2
    elif val <45.598: result = 3
    else: result = 4
    return result

In [19]:
train["windspeed"] = train["windspeed"].apply(setWindBand)
test["windspeed"] = test["windspeed"].apply(setWindBand)

In [20]:
train.corr()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,year,month,day,hour
season,1.0,0.029368,-0.008126,0.008879,0.241749,0.243988,0.188565,-0.134057,0.163439,-0.004797,0.971524,0.001729,-0.006546
holiday,0.029368,1.0,-0.250491,-0.007074,-0.004596,-0.002714,-0.000295,0.008276,-0.005393,0.012021,0.001731,-0.015877,-0.000354
workingday,-0.008126,-0.250491,1.0,0.033772,0.018028,0.013792,-0.006966,0.022052,0.011594,-0.002482,-0.003394,0.009829,0.00278
weather,0.008879,-0.007074,0.033772,1.0,-0.060207,-0.062695,0.383526,-0.000888,-0.128655,-0.012548,0.012144,-0.00789,-0.02274
temp,0.241749,-0.004596,0.018028,-0.060207,1.0,0.925869,-0.067866,-0.027298,0.362836,0.061808,0.243694,0.008466,0.12232
atemp,0.243988,-0.002714,0.013792,-0.062695,0.925869,1.0,-0.062026,-0.042542,0.372151,0.057302,0.244058,0.004876,0.128585
humidity,0.188565,-0.000295,-0.006966,0.383526,-0.067866,-0.062026,1.0,-0.302579,-0.307351,-0.06884,0.201981,-0.013008,-0.268825
windspeed,-0.134057,0.008276,0.022052,-0.000888,-0.027298,-0.042542,-0.302579,1.0,0.095819,-0.018324,-0.138251,0.030146,0.135678
count,0.163439,-0.005393,0.011594,-0.128655,0.362836,0.372151,-0.307351,0.095819,1.0,0.260403,0.166862,0.019826,0.400601
year,-0.004797,0.012021,-0.002482,-0.012548,0.061808,0.057302,-0.06884,-0.018324,0.260403,1.0,-0.004932,0.0018,-0.004234


In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
import lightgbm as lgb

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
k_fold = KFold(n_splits=3, shuffle=True, random_state=0)

scoring = 'accuracy'

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [22]:
x = train.drop(columns=["datetime", "tempband", "atempband", "humidityband", "windband", "count"], axis=1)
y = train["count"]
x_test = test.drop(columns=["datetime"])

In [23]:
x.head()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,hour
0,1,0,0,1,1,1,4,0,2011,1,1,0
1,1,0,0,1,1,1,4,0,2011,1,1,1
2,1,0,0,1,1,1,4,0,2011,1,1,2
3,1,0,0,1,1,1,3,0,2011,1,1,3
4,1,0,0,1,1,1,3,0,2011,1,1,4


In [24]:
x_test.head()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,hour
0,1,0,1,1,1,1,2,2,2011,1,20,0
1,1,0,1,1,1,1,2,0,2011,1,20,1
2,1,0,1,1,1,1,2,0,2011,1,20,2
3,1,0,1,1,1,1,2,0,2011,1,20,3
4,1,0,1,1,1,1,2,0,2011,1,20,4


In [25]:
rf = RandomForestClassifier()
rf.fit(x, y)
scoring = 'accuracy'
score = cross_val_score(rf, x, y, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)
print("RandomForest: {0: .4f}".format(round(np.mean(score)*100, 2)))



[0.01285583 0.01469238 0.02295684 0.01285583 0.02846648 0.02387511
 0.02389706 0.01470588 0.01746324 0.02022059]
RandomForest:  1.9200


In [None]:
lo = LogisticRegression()
lo.fit(x, y)
scoring = 'accuracy'
score = cross_val_score(lo, x, y, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)
print("LogisticRegression: {0: .4f}".format(round(np.mean(score)*100, 2)))



In [27]:
# sv = SVC()
# sv.fit(x, y)
# scoring = 'accuracy'
# score = cross_val_score(sv, x, y, cv=k_fold, n_jobs=1, scoring=scoring)
# print(score)
# print("SupportVectorMachine: {0: .4f}".format(round(np.mean(score)*100, 2)))

In [28]:
# ga = GaussianNB()
# ga.fit(x, y)
# scoring = 'accuracy'
# score = cross_val_score(ga, x, y, cv=k_fold, n_jobs=1, scoring=scoring)
# print(score)
# print("GaussianNB: {0: .4f}".format(round(np.mean(score)*100, 2)))

In [29]:
# pc = Perceptron()
# pc.fit(x, y)
# scoring = 'accuracy'
# score = cross_val_score(pc, x, y, cv=k_fold, n_jobs=1, scoring=scoring)
# print(score)
# print("perceptron: {0: .4f}".format(round(np.mean(score)*100, 2)))

In [30]:
# sgd = SGDClassifier()
# sgd.fit(x, y)
# scoring = 'accuracy'
# score = cross_val_score(sgd, x, y, cv=k_fold, n_jobs=1, scoring=scoring)
# print(score)
# print("SGD: {0: .4f}".format(round(np.mean(score)*100, 2)))

In [31]:
# de = DecisionTreeClassifier()
# de.fit(x, y)
# scoring = 'accuracy'
# score = cross_val_score(de, x, y, cv=k_fold, n_jobs=1, scoring=scoring)
# print(score)
# print("DecisionTree: {0: .4f}".format(round(np.mean(score)*100, 2)))

In [32]:
# xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
# xgb_model.fit(x, y)
# scoring = 'accuracy'
# score = cross_val_score(xgb_model, x, y, cv=k_fold, n_jobs=1, scoring=scoring)
# print(score)
# print("xgboost: {0: .4f}".format(round(np.mean(score)*100, 2)))

In [73]:
# lgb_model = lgb.LGBMClassifier(objective="binary", random_state=42)
# lgb_model.fit(x, y)
# scoring = 'accuracy'
# score = cross_val_score(lgb_model, x, y, cv=k_fold, n_jobs=1, scoring=scoring)
# print(score)
# print("lightgbm: {0: .4f}".format(round(np.mean(score)*100, 2)))

In [76]:
predict = rf.predict(x_test)

In [75]:
submission = pd.DataFrame({
    "datetime": test["datetime"],
    "count": predict
})

In [79]:
# submission.to_csv("0322_randomforest_{0: .4f}.csv".format(1.95), index=False) 