In [None]:
import numpy as np
import pandas as pd 
from pandas_profiling import ProfileReport
import pickle 
import re
from sklearn.ensemble import RandomForestRegressor # ML
from sklearn.inspection import permutation_importance # computing feature importance
import seaborn as sns
import matplotlib.pyplot as plt
import os
import warnings
warnings.filterwarnings('ignore') 

In [None]:
# Load the train and test CSVs of the bike-sharing-demand dataset.
# Paths are relative to the notebook's working directory.
train=pd.read_csv('../input/bike-sharing-demand/train.csv')
test=pd.read_csv('../input/bike-sharing-demand/test.csv')

数据EDA与预处理

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Column dtypes and non-null counts of the test frame.
test.info()

In [None]:
# Convert the 'datetime' column of both frames from object (string) dtype
# to a proper datetime dtype so the `.dt` accessor can be used later.
for df in (train, test):
    df['datetime'] = pd.to_datetime(df['datetime'])

In [None]:
# 'casual' and 'registered' do not appear in the test set, so they cannot be
# used as features; remove both columns from the training frame.
train = train.drop(columns=['casual', 'registered'])

In [None]:
# Build a pandas-profiling report for the training frame; the bare `profile`
# expression on the last line makes the notebook render it as the cell output.
profile = ProfileReport(train, title="Pandas Profiling Report")
profile

特征工程

In [None]:
# Remove outliers.
#
# 1) Keep only rows whose 'count' lies within 3 standard deviations of the mean.
#    The filter is applied exactly once: the statement used to appear twice in
#    this cell, which re-estimated mean/std on the already-trimmed data and
#    discarded additional rows on the second pass.
count_mean = train["count"].mean()
count_std = train["count"].std()
train = train[np.abs(train["count"] - count_mean) <= 3 * count_std]

# 2) Drop rows where 20 < atemp < 40 while 10 < temp < 20.
#    NOTE(review): presumably this targets readings where 'atemp' and 'temp'
#    disagree; confirm the thresholds against a temp-vs-atemp scatter plot,
#    since the rule is purely range-based.
drop_idx = train[(train['atemp'] > 20) & (train['atemp'] < 40) & (train['temp'] > 10) & (train['temp'] < 20)].index
train = train.drop(drop_idx)
train.head()

In [None]:
# Scatter each continuous weather feature against the rental count.
#
# The columns are named explicitly. A positional slice of the column list
# (`[5:-6]`) depends on how many columns the frame happens to have when the
# cell runs; assuming the standard competition columns, it evaluates to an
# empty list at this point in the notebook and the figure comes out blank.
fields = ['temp', 'atemp', 'humidity', 'windspeed']
print(fields)

fig = plt.figure(figsize=(17, 3))

for i, f in enumerate(fields):
    # One panel per feature, laid out on a single row.
    ax = fig.add_subplot(1, len(fields), i+1)
    ax.scatter(train[f], train['count'])
    ax.set_ylabel('count')
    ax.set_xlabel(f)

plt.show()

In [None]:
# Add calendar features and a coarse time-of-day label to both frames (in place).
for df in (train, test):

    # Calendar parts extracted from the timestamp column.
    df['year'] = df['datetime'].dt.year
    df['month'] = df['datetime'].dt.month
    df['day'] = df['datetime'].dt.day
    df['hr'] = df['datetime'].dt.hour

    # Time-of-day label, assigned for working days only; rows matching none of
    # the masks below (i.e. non-working days) keep NaN.
    # The column is created with object dtype so that string labels can be
    # written into it without an implicit float -> object upcast.
    df['hr_categori'] = pd.Series(np.nan, index=df.index, dtype=object)

    is_workday = df.workingday == 1
    df.loc[is_workday & (df.hr <= 9) & (df.hr >= 7), 'hr_categori'] = "rush"
    df.loc[is_workday & (df.hr <= 19) & (df.hr >= 16), 'hr_categori'] = "rush"
    df.loc[is_workday & (df.hr < 16) & (df.hr > 9), 'hr_categori'] = "day"
    # `&` binds tighter than `|`, so the two hour ranges must be grouped
    # explicitly; otherwise `hr > 19` would be labelled "night" regardless of
    # the working-day condition.
    df.loc[is_workday & ((df.hr < 7) | (df.hr > 19)), 'hr_categori'] = "night"

count与各项指标的关系

In [None]:
# Mean rental count per hour of day.
# `factorplot` / `size=` were renamed to `catplot` / `height=` in seaborn and
# the old names are no longer available in current releases.
sns.catplot(x="hr", y="count", data=train, kind='bar', height=5, aspect=1.5)

In [None]:
# Mean rental count per month.
# `factorplot` / `size=` were renamed to `catplot` / `height=` in seaborn and
# the old names are no longer available in current releases.
sns.catplot(x="month", y="count", data=train, kind='bar', height=5, aspect=1.5)

In [None]:
# Mean rental count per year.
# `factorplot` / `size=` were renamed to `catplot` / `height=` in seaborn and
# the old names are no longer available in current releases.
sns.catplot(x="year", y="count", data=train, kind='bar', height=5, aspect=1.5)

In [None]:
# Work on a copy so the binning column does not leak into `train`.
new_df=train.copy()

# Bucket temperature into bins of width 3 (floor first, then integer-divide).
new_df['temp_bin']=np.floor(new_df['temp'])//3

# Mean rental count per temperature bin.
# `catplot` is the current name of seaborn's former `factorplot`.
sns.catplot(x="temp_bin",y="count",data=new_df,kind='bar')

In [None]:
# Work on a copy so the binning column does not leak into `train`.
new_df=train.copy()
# Bucket humidity into bins of width 5 (floor first, then integer-divide).
new_df['humidity_bin']=np.floor(new_df['humidity'])//5

# Mean rental count per humidity bin.
# `catplot` is the current name of seaborn's former `factorplot`.
sns.catplot(x="humidity_bin",y="count",data=new_df,kind='bar')

随机森林模型

In [None]:
# Creating the list of features
feature_names = ['year', 'hour', 'season', 'holiday', 'workingday',
                 'weather', 'temp', 'atemp', 'humidity', 'windspeed']

# Assigning the data corresponding to features in feature_names
train['year']=train.datetime.dt.year
train['hour']=train.datetime.dt.year
X_train = train[feature_names]

# Assigning the data corresponding to features in feature_names
test['year'] = test.datetime.dt.year
test['hour'] = test.datetime.dt.hour

X_test = test[feature_names]

In [None]:
# Assigning the data corresponding to the target variable
y_train = train['count']
print(y_train)

In [None]:
# Build a random-forest regressor with a fixed seed and fit it on the
# training matrix (`fit` returns the estimator itself, so the calls chain).
rf_model = RandomForestRegressor(random_state=1).fit(X_train, y_train)

# Predict on the test matrix and wrap the raw array in a single-column frame.
y_pred = rf_model.predict(X_test)
pred = pd.DataFrame(data=y_pred)

In [None]:
# Importances reported by the fitted forest, labelled with the training
# column names; plot the ten largest as a horizontal bar chart.
feat_importances = pd.Series(data=rf_model.feature_importances_,
                             index=X_train.columns)
feat_importances.nlargest(10).plot.barh()

In [None]:
# Read the sample submission to reuse its 'datetime' column. The path uses the
# same relative '../input/...' root as the train/test loading cell instead of
# a hard-coded absolute directory.
ss = pd.read_csv('../input/bike-sharing-demand/sampleSubmission.csv')

# Build the submission frame directly from the prediction array. Unlike an
# index-aligned concat, this raises if the number of predictions does not
# match the number of submission rows instead of silently padding with NaN.
datasets = pd.DataFrame({'datetime': ss['datetime'], 'count': y_pred})
datasets.to_csv('submission.csv', index=False)