# Setting

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import glob

from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

%matplotlib inline

In [2]:
!pip install category_encoders
import category_encoders as ce

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.6.3


In [3]:
df_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/TOP_SIGNATE/Deroito_competition/train.csv')
df_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/TOP_SIGNATE/Deroito_competition/test.csv')
df_holidays = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/TOP_SIGNATE/Deroito_competition/holidays_in_japan.csv')
df_venue = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/TOP_SIGNATE/Deroito_competition/venue_information.csv')
df_match_info = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/TOP_SIGNATE/Deroito_competition/match_reports.csv')

In [4]:
# kfインスタンスを作る(KFoldの設定)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [5]:
df_train_x = df_train.drop(['id', 'attendance'], axis=1)
df_train_y = df_train['attendance']
df_test_x = df_test.drop(['id'], axis=1)

# Feature engineering

In [6]:
# 日付をdatetime型にする
df_holidays['holiday_date'] = pd.to_datetime(df_holidays['holiday_date'])
df_train_x['match_date'] = pd.to_datetime(df_train_x['match_date'])
df_test_x['match_date'] = pd.to_datetime(df_test_x['match_date'])

In [7]:
# 休日なら1、平日なら0のholidays列をデータフレームに追加
df_train_x['holidays'] =df_train_x['match_date'].dt.dayofweek.apply(lambda x: 1 if(x in df_holidays['holiday_date'].values or x in [5,6]) else 0)
df_test_x['holidays'] =df_test_x['match_date'].dt.dayofweek.apply(lambda x: 1 if(x in df_holidays['holiday_date'].values or x in [5,6]) else 0)

In [8]:
# sectionとroundからsection_roundを作る
df_train_x['section_round'] = df_train_x['section'] + '_' + df_train_x['round']
df_test_x['section_round'] = df_test_x['section'] + '_' + df_test_x['round']
df_train_x = df_train_x.drop(['section', 'round'], axis=1)
df_test_x = df_test_x.drop(['section', 'round'], axis=1)

In [9]:
# kick_off_timeをhourだけにする
df_train_x['kick_off_time'] = pd.to_datetime(df_train_x['kick_off_time'], format='%H:%M').dt.hour
df_test_x['kick_off_time'] = pd.to_datetime(df_test_x['kick_off_time'], format='%H:%M').dt.hour

In [10]:
# 都道府県を見つける関数
def extract_prefecture(address):
  if '県' in address:
    return address[:address.find('県')+1]
  elif '道' in address:
    return address[:address.find('道')+1]
  elif '府' in address:
    return address[:address.find('府')+1]
  elif '都' in address:
    return address[:address.find('都')+1]
  else:
    return None

In [11]:
# df_venueに都道府県を示すprefecture列を追加
df_venue['prefecture'] = df_venue['address'].apply(extract_prefecture)
df_venue.head()

Unnamed: 0,venue,capacity,address,prefecture
0,札幌ドーム,38794,北海道札幌市豊平区羊ヶ丘1,北海道
1,埼玉スタジアム2002,62010,埼玉県さいたま市緑区中野田500,埼玉県
2,味の素スタジアム,47851,東京都調布市西町376−3,東京都
3,町田GIONスタジアム,15320,東京都町田市野津田町2035,東京都
4,日産スタジアム,71624,神奈川県横浜市港北区小机町3300,神奈川県


In [12]:
# データにvenueをマージして、capacity列,prefecture列を追加
df_train_x = df_train_x.merge(df_venue[['venue', 'prefecture', 'capacity']], on='venue', how='left')
df_test_x = df_test_x.merge(df_venue[['venue', 'prefecture', 'capacity']], on='venue', how='left')

In [13]:
# obj型とそれ以外を分ける
df_train_x_obj = df_train_x.select_dtypes(include='object')
df_test_x_obj = df_test_x.select_dtypes(include='object')
df_train_x_num = df_train_x.select_dtypes(exclude='object')
df_test_x_num = df_test_x.select_dtypes(exclude='object')

In [14]:
# objをエンコーダーで数字を割り振る
list_cols = df_train_x_obj.columns
encoder = ce.OrdinalEncoder(cols=list_cols)
df_train_x_obj_enc = encoder.fit_transform(df_train_x_obj)
df_test_x_obj_enc = encoder.transform(df_test_x_obj)

In [15]:
df_train_x_coc = pd.concat([df_train_x_num, df_train_x_obj_enc], axis=1)
df_test_x_coc = pd.concat([df_test_x_num, df_test_x_obj_enc], axis=1)

In [16]:
# 何月かを示すmatch_monthを作成
df_train_x_coc['match_month'] = df_train_x_coc['match_date'].dt.month
df_test_x_coc['match_month'] = df_test_x_coc['match_date'].dt.month
df_train_x_coc.head()

Unnamed: 0,match_date,kick_off_time,temperature,humidity,holidays,capacity,home_team,away_team,venue,weather,broadcasters,section_round,prefecture,match_month
0,2006-03-04,16,8.3,40,1,21000,1,1,1,1,1,1,1,3
1,2006-03-05,13,12.9,28,1,15859,2,2,2,1,2,2,2,3
2,2006-03-05,13,12.1,35,1,47851,3,3,3,1,3,2,3,3
3,2006-03-05,14,11.6,42,1,51697,4,4,4,1,4,2,4,3
4,2006-03-05,14,13.1,32,1,20223,5,5,5,1,5,2,5,3


In [17]:
df_train_x_coc = df_train_x_coc.drop(['match_date'], axis=1)
df_test_x_coc = df_test_x_coc.drop(['match_date'], axis=1)

# prediction

In [18]:
kf.get_n_splits(df_train_x_coc)

5

In [20]:
print(kf.split(df_train_x_coc))

<generator object _BaseKFold.split at 0x788116b1f7d0>


In [27]:
kf.split(df_train_x_coc).__next__()

(array([   1,    2,    3, ..., 3668, 3669, 3671]),
 array([   0,    7,   12,   14,   17,   25,   26,   29,   30,   32,   33,
          43,   44,   45,   51,   52,   58,   61,   63,   70,   76,   80,
          88,   93,   96,  102,  109,  120,  124,  134,  139,  144,  149,
         152,  157,  166,  170,  173,  174,  175,  178,  179,  183,  184,
         188,  192,  194,  196,  203,  209,  211,  214,  217,  218,  221,
         229,  238,  239,  240,  246,  247,  251,  254,  256,  257,  258,
         263,  266,  270,  274,  278,  279,  283,  289,  291,  295,  296,
         298,  299,  305,  309,  314,  315,  321,  322,  324,  325,  331,
         332,  346,  351,  354,  358,  366,  368,  371,  387,  393,  402,
         408,  410,  411,  414,  415,  416,  423,  430,  432,  433,  438,
         439,  442,  443,  449,  450,  457,  461,  463,  465,  478,  479,
         485,  486,  490,  495,  501,  505,  506,  507,  511,  527,  534,
         538,  543,  544,  549,  551,  554,  557,  561,  564,

In [29]:
list_RMSE = []
for i, (train_index, test_index) in enumerate(kf.split(df_train_x_coc)):
  X_train, X_test = df_train_x_coc.loc[train_index], df_train_x_coc.loc[test_index]
  y_train, y_test = df_train_y.loc[train_index], df_train_y.loc[test_index]

  reg = RandomForestRegressor(n_estimators=100, random_state=42)
  reg.fit(X_train, y_train)
  y_pred = reg.predict(X_test)

  name = 'model_' + str(i) + '.pkl'
  with open(name, mode = 'wb') as f:
    pickle.dump(reg, f)

  y_pred = reg.predict(X_test)
  rmse = np.sqrt(mean_squared_error(y_test, y_pred))
  list_RMSE.append(rmse)

  print(rmse)
  print(reg.feature_importances_)
  print(reg)

4286.445177850772
[0.02324195 0.03563654 0.04108417 0.01007817 0.5735099  0.06046421
 0.06272029 0.02587444 0.01326104 0.06879216 0.03226131 0.03494253
 0.01813328]
RandomForestRegressor(random_state=42)
4557.361037125982
[0.01648667 0.03443617 0.03925346 0.01090723 0.57142773 0.06046982
 0.06363716 0.02605324 0.01231434 0.0773666  0.0315796  0.03643707
 0.01963091]
RandomForestRegressor(random_state=42)
4557.330210689621
[0.01905128 0.03577541 0.04252906 0.01061885 0.56429648 0.05784762
 0.06068115 0.02839371 0.01358765 0.07362809 0.03498689 0.03921706
 0.01938675]
RandomForestRegressor(random_state=42)
4542.725911960011
[0.01829705 0.03675096 0.03914503 0.01090324 0.57559951 0.05805417
 0.06150167 0.01877652 0.01376097 0.07428404 0.03294052 0.03944309
 0.02054324]
RandomForestRegressor(random_state=42)
4796.873948758496
[0.02145496 0.03340421 0.03282961 0.01080498 0.57643547 0.05294245
 0.06600386 0.03096379 0.01203825 0.06770489 0.03250499 0.04379599
 0.01911656]
RandomForestRegress

In [None]:
models = glob.glob('*.pkl')
models

['model_0.pkl', 'model_1.pkl', 'model_2.pkl', 'model_4.pkl', 'model_3.pkl']