In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and
# deprecated the bare 'seaborn' name; prefer the new name when it is available
# and fall back to the old one on older Matplotlib versions.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
# Directory holding the competition CSV files.
data_path='/kaggle/input/bike-sharing-demand/'
# Parse the timestamp column up front so the .dt accessors work later on.
df_train=pd.read_csv(data_path +'train.csv', parse_dates=['datetime'])
df_test=pd.read_csv(data_path +'test.csv', parse_dates=['datetime'])
submission=pd.read_csv(data_path +'sampleSubmission.csv')

  plt.style.use('seaborn')


In [2]:
def RMSLE(y_real, y_pred):
    """Root mean squared log error for values given in log space.

    Both arguments are exponentiated first, then compared as
    ``log1p(exp(y_real))`` versus ``log1p(exp(y_pred))``.

    Parameters
    ----------
    y_real : array-like
        Reference values (log scale).
    y_pred : array-like
        Predicted values (log scale).

    Returns
    -------
    float
        Square root of the mean squared difference of the two log1p terms.
    """
    log_gap = np.log1p(np.exp(y_real)) - np.log1p(np.exp(y_pred))
    return np.sqrt(np.mean(log_gap ** 2))

In [3]:
# Wrap RMSLE as a scikit-learn scorer so it can be passed via `scoring=`.
# NOTE(review): make_scorer is called without greater_is_better=False, so
# scikit-learn treats the value as "higher is better". That is harmless while
# the scores are only printed, but confirm before using this scorer for
# model selection (e.g. GridSearchCV), where an error metric must be negated.
from sklearn.metrics import make_scorer
rmsle_score=make_scorer(RMSLE)

In [4]:
# Show the last rows of the training data.
df_train.tail()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
10881,2012-12-19 19:00:00,4,0,1,1,15.58,19.695,50,26.0027,7,329,336
10882,2012-12-19 20:00:00,4,0,1,1,14.76,17.425,57,15.0013,10,231,241
10883,2012-12-19 21:00:00,4,0,1,1,13.94,15.91,61,15.0013,4,164,168
10884,2012-12-19 22:00:00,4,0,1,1,13.94,17.425,61,6.0032,12,117,129
10885,2012-12-19 23:00:00,4,0,1,1,13.12,16.665,66,8.9981,4,84,88


# データの前処理

### 年月日に分割

In [5]:
def split_datetime(df):
    """Expand the ``datetime`` column into separate calendar columns.

    Adds ``year``, ``month``, ``day``, ``hour`` and ``weekday`` columns derived
    from ``df['datetime']`` and removes the ``datetime`` column itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-typed ``datetime`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the derived columns appended. The input frame is not
        modified, so re-running a cell that calls this function is safe.
    """
    stamp = df['datetime'].dt
    # assign() builds a new frame instead of writing columns into the caller's.
    expanded = df.assign(
        year=stamp.year,
        month=stamp.month,
        day=stamp.day,
        hour=stamp.hour,
        weekday=stamp.weekday,
    )
    return expanded.drop(columns=['datetime'])

In [6]:
# Stack train and test into one frame with a fresh 0..n-1 index so the same
# preprocessing can be applied to both at once.
data=pd.concat([df_train, df_test], ignore_index=True)
data.tail()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
17374,2012-12-31 19:00:00,1,0,1,2,10.66,12.88,60,11.0014,,,
17375,2012-12-31 20:00:00,1,0,1,2,10.66,12.88,60,11.0014,,,
17376,2012-12-31 21:00:00,1,0,1,1,10.66,12.88,60,11.0014,,,
17377,2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981,,,
17378,2012-12-31 23:00:00,1,0,1,1,10.66,13.635,65,8.9981,,,


## 特徴量エンジニアリング

#### 共通　（data_1）
* weather = 4 → 3 （4は1つのみ）
* windspeed = 0 → 線型補間
* datetime → delete　（被っているため）
* count　→ log convert　（対数変換）

#### data_2
* atemp → delete （tempを使う）

In [7]:
# Common preprocessing (data_1). Work on a copy so `data` stays as loaded.
data_1 = data.copy()
# Merge weather category 4 into category 3.
data_1['weather'] = data_1['weather'].replace(4, 3)
# Plain column assignment instead of `.loc[:, 'datetime'] = ...`, which
# triggers a pandas warning about in-place column setting.
data_1['datetime'] = pd.to_datetime(data_1['datetime'])
# A datetime index is required for interpolate(method='time').
data_1 = data_1.set_index('datetime')
# Treat windspeed == 0 as missing and fill it by time-based interpolation;
# limit_direction='both' also fills gaps at the very start and end.
data_1['windspeed'] = (
    data_1['windspeed']
    .replace(0, np.nan)
    .interpolate(method='time', limit_direction='both')
)

data_1.tail()

  data_1.loc[:, 'datetime'] = pd.to_datetime(data_1.loc[:, 'datetime'])


Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2012-12-31 19:00:00,1,0,1,2,10.66,12.88,60,11.0014,,,
2012-12-31 20:00:00,1,0,1,2,10.66,12.88,60,11.0014,,,
2012-12-31 21:00:00,1,0,1,1,10.66,12.88,60,11.0014,,,
2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981,,,
2012-12-31 23:00:00,1,0,1,1,10.66,13.635,65,8.9981,,,


In [8]:
# Turn the datetime index back into a regular column, then let split_datetime
# replace it with year/month/day/hour/weekday columns.
data_1 = data_1.reset_index()
data_1 = split_datetime(data_1)
data_1.tail()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,day,hour,weekday
17374,1,0,1,2,10.66,12.88,60,11.0014,,,,2012,12,31,19,0
17375,1,0,1,2,10.66,12.88,60,11.0014,,,,2012,12,31,20,0
17376,1,0,1,1,10.66,12.88,60,11.0014,,,,2012,12,31,21,0
17377,1,0,1,1,10.66,13.635,56,8.9981,,,,2012,12,31,22,0
17378,1,0,1,1,10.66,13.635,65,8.9981,,,,2012,12,31,23,0


In [12]:
# data_2: data_1 without the atemp column (temp is kept).
data_2 = data_1.drop(columns=['atemp'])
data_2.tail()

Unnamed: 0,season,holiday,workingday,weather,temp,humidity,windspeed,casual,registered,count,year,month,day,hour,weekday
17374,1,0,1,2,10.66,60,11.0014,,,,2012,12,31,19,0
17375,1,0,1,2,10.66,60,11.0014,,,,2012,12,31,20,0
17376,1,0,1,1,10.66,60,11.0014,,,,2012,12,31,21,0
17377,1,0,1,1,10.66,56,8.9981,,,,2012,12,31,22,0
17378,1,0,1,1,10.66,65,8.9981,,,,2012,12,31,23,0


## 仮説7
* day → delete　（trainとtestで異なるため）

In [13]:
# Hypothesis 7: remove the day-of-month feature.
data_7_1 = data_2.drop(columns=['day'])

In [14]:
# NOTE(review): built exactly like data_7_1, and none of the later cells shown
# here reference data_7_2 — confirm whether this variant is still needed.
data_7_2 = data_2.drop('day', axis=1)

In [15]:
# Rows with a known count are the labelled (train + validation) data; rows
# whose count is missing make up the test set.
target_cols = ['registered', 'casual', 'count']
has_label = data_7_1['count'].notnull()
X_train_val_7_1 = data_7_1[has_label].drop(columns=target_cols)
X_test_7_1 = data_7_1[~has_label].drop(columns=target_cols)
# Train on log(count).
y_train_val_7_1 = np.log(df_train['count'])

In [16]:
# Hold out 30% of the labelled rows for validation; random_state fixes the
# split so results are repeatable.
X_train_7_1, X_val_7_1, y_train_7_1, y_val_7_1 = train_test_split(
    X_train_val_7_1, y_train_val_7_1, test_size = 0.3, random_state = 1)

In [17]:
# Check the (rows, columns) of the train, validation and test feature frames.
X_train_7_1.shape, X_val_7_1.shape, X_test_7_1.shape

((7620, 11), (3266, 11), (6493, 11))

## 検証

## LightGBM + Optuna

In [18]:
from lightgbm import LGBMRegressor, early_stopping
import optuna
from optuna.samplers import TPESampler

def objective_func(trial):
    """Optuna objective: train LightGBM with sampled settings, return validation RMSLE.

    Reads the notebook-level ``X_train_7_1`` / ``y_train_7_1`` (fit data) and
    ``X_val_7_1`` / ``y_val_7_1`` (early-stopping and scoring data).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        RMSLE of the fitted model on the validation split (lower is better).
    """
    # Search space
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 2, 30),
        'max_depth': trial.suggest_int('max_depth', 2, 30),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'learning_rate': trial.suggest_float('learning_rate', 1e-6, 1e-1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 800),
        # Currently not searched:
        # 'min_child_samples': trial.suggest_int('min_child_samples', 2, 30),
        # 'subsample': trial.suggest_float('subsample', 0.5, 1, step=0.1),
    }
    model = LGBMRegressor(**params)  # unpack the dict as keyword arguments
    # Early stopping is passed as a callback: the `early_stopping_rounds` and
    # `verbose` arguments of fit() were removed in LightGBM 4.
    lgbm_model = model.fit(
        X_train_7_1,
        y_train_7_1,
        eval_set=[(X_val_7_1, y_val_7_1)],
        callbacks=[early_stopping(stopping_rounds=25, verbose=False)],
    )
    score = RMSLE(y_val_7_1, lgbm_model.predict(X_val_7_1))
    return score

In [19]:
# Seed the TPE sampler so the search is repeatable.
sampler = TPESampler(seed=10)
# The objective returns RMSLE, so the study minimizes it.
study = optuna.create_study(direction='minimize', sampler=sampler)
study.optimize(objective_func, n_trials=50)
# Keep the best trial's hyperparameters for the final model.
trial = study.best_trial
trial_params = trial.params
print('Best Trial: score {},\n params {}'.format(trial.value, trial_params))

[32m[I 2023-06-13 08:33:38,354][0m A new study created in memory with name: no-name-fb7a3d65-ac68-493a-8920-8957e42780fe[0m
  'learning_rate': trial.suggest_loguniform("learning_rate", 1e-6, 1e-1),
[32m[I 2023-06-13 08:33:39,853][0m Trial 0 finished with value: 0.9180742847299579 and parameters: {'num_leaves': 24, 'max_depth': 2, 'learning_rate': 0.0014731303415988814, 'n_estimators': 624}. Best is trial 0 with value: 0.9180742847299579.[0m
  'learning_rate': trial.suggest_loguniform("learning_rate", 1e-6, 1e-1),
[32m[I 2023-06-13 08:33:42,145][0m Trial 1 finished with value: 1.4145524276245045 and parameters: {'num_leaves': 16, 'max_depth': 8, 'learning_rate': 9.779447595326336e-06, 'n_estimators': 633}. Best is trial 0 with value: 0.9180742847299579.[0m
  'learning_rate': trial.suggest_loguniform("learning_rate", 1e-6, 1e-1),
[32m[I 2023-06-13 08:33:43,485][0m Trial 2 finished with value: 0.6549377537572053 and parameters: {'num_leaves': 6, 'max_depth': 4, 'learning_rate':

Best Trial: score 0.27385741129485314,
 params {'num_leaves': 17, 'max_depth': 14, 'learning_rate': 0.06977953935011831, 'n_estimators': 633}


In [20]:
from optuna import visualization
# Plot how much each hyperparameter contributed to the objective in this study.
visualization.plot_param_importances(study)

In [21]:
# 5-fold cross-validation of the best hyperparameters on the training split,
# scored with the RMSLE scorer; prints the per-fold values and their mean.
from sklearn.model_selection import cross_val_score
lgbm_best=LGBMRegressor(**trial_params)
score=cross_val_score(lgbm_best, X_train_7_1, y_train_7_1, cv=5, scoring=rmsle_score, verbose=True)
print(score)
print(np.mean(score))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[0.29777458 0.27650067 0.28748471 0.28646362 0.29619211]
0.28888313515004665


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   11.2s finished


In [22]:
# Fit the tuned model on the training split.
# NOTE(review): the validation rows are not used here — consider refitting on
# X_train_val_7_1 / y_train_val_7_1 before predicting the test set.
lgbm_best.fit(X_train_7_1, y_train_7_1)

In [23]:
# Predict the test set; the model was trained on log(count), so these values
# are on the log scale.
y_ret_7_1=lgbm_best.predict(X_test_7_1)

In [163]:
# Build the submission from the sample file: exp() maps the log-scale
# predictions back to counts. assign() returns a new frame, so `submission`
# itself is unchanged.
submission_7_1 = submission.assign(count=np.exp(y_ret_7_1))
submission_7_1.to_csv('/kaggle/working/submission_7_1.csv', index=False)


<a href="submission_7_1.csv"> Download File </a>

In [166]:
ls 

ls: cannot access 'kaggle': No such file or directory


In [162]:
# Read the written submission back as a sanity check.
# NOTE(review): relative path — assumes the working directory is the
# /kaggle/working folder the file was written to; confirm.
pd.read_csv('submission_7_1.csv')

Unnamed: 0,datetime,count
0,2011-01-20 00:00:00,10.482539
1,2011-01-20 01:00:00,3.300955
2,2011-01-20 02:00:00,2.239297
3,2011-01-20 03:00:00,1.935562
4,2011-01-20 04:00:00,1.782357
...,...,...
6488,2012-12-31 19:00:00,273.206763
6489,2012-12-31 20:00:00,180.455421
6490,2012-12-31 21:00:00,133.874408
6491,2012-12-31 22:00:00,100.306951


In [50]:
# Write the final submission file.
# The original cell used `y_ret`, which no cell in this notebook defines, so it
# failed on a fresh kernel; the test-set predictions are held in `y_ret_7_1`.
# The sample file is read via `data_path`, like the other input files.
submission = pd.read_csv(data_path + 'sampleSubmission.csv')
# Predictions are on the log scale; exp() maps them back to counts.
submission['count'] = np.exp(y_ret_7_1)
submission.to_csv('submission.csv', index=False)