# Setting

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import glob

from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

%matplotlib inline

In [2]:
!pip install category_encoders
import category_encoders as ce

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.6.3


In [3]:
df_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/TOP_SIGNATE/Deroito_competition/train.csv')
df_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/TOP_SIGNATE/Deroito_competition/test.csv')
df_holidays = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/TOP_SIGNATE/Deroito_competition/holidays_in_japan.csv')
df_venue = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/TOP_SIGNATE/Deroito_competition/venue_information.csv')
df_match_info = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/TOP_SIGNATE/Deroito_competition/match_reports.csv')

In [4]:
# kfインスタンスを作る(KFoldの設定)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [5]:
df_train_x = df_train.drop(['id', 'attendance'], axis=1)
df_train_y = df_train['attendance']
df_test_x = df_test.drop(['id'], axis=1)

# Feature engineering

In [6]:
# 日付をdatetime型にする
df_holidays['holiday_date'] = pd.to_datetime(df_holidays['holiday_date'])
df_train_x['match_date'] = pd.to_datetime(df_train_x['match_date'])
df_test_x['match_date'] = pd.to_datetime(df_test_x['match_date'])

In [7]:
# 休日なら1、平日なら0のholidays列をデータフレームに追加
df_train_x['holidays'] =df_train_x['match_date'].dt.dayofweek.apply(lambda x: 1 if(x in df_holidays['holiday_date'].values or x in [5,6]) else 0)
df_test_x['holidays'] =df_test_x['match_date'].dt.dayofweek.apply(lambda x: 1 if(x in df_holidays['holiday_date'].values or x in [5,6]) else 0)

In [8]:
# sectionとroundからsection_roundを作る
df_train_x['section_round'] = df_train_x['section'] + '_' + df_train_x['round']
df_test_x['section_round'] = df_test_x['section'] + '_' + df_test_x['round']
df_train_x = df_train_x.drop(['section', 'round'], axis=1)
df_test_x = df_test_x.drop(['section', 'round'], axis=1)

In [9]:
# kick_off_timeをhourだけにする
df_train_x['kick_off_time'] = pd.to_datetime(df_train_x['kick_off_time'], format='%H:%M').dt.hour
df_test_x['kick_off_time'] = pd.to_datetime(df_test_x['kick_off_time'], format='%H:%M').dt.hour

In [10]:
# 都道府県を見つける関数
def extract_prefecture(address):
  if '県' in address:
    return address[:address.find('県')+1]
  elif '道' in address:
    return address[:address.find('道')+1]
  elif '府' in address:
    return address[:address.find('府')+1]
  elif '都' in address:
    return address[:address.find('都')+1]
  else:
    return None

In [11]:
# df_venueに都道府県を示すprefecture列を追加
df_venue['prefecture'] = df_venue['address'].apply(extract_prefecture)
df_venue.head()

Unnamed: 0,venue,capacity,address,prefecture
0,札幌ドーム,38794,北海道札幌市豊平区羊ヶ丘1,北海道
1,埼玉スタジアム2002,62010,埼玉県さいたま市緑区中野田500,埼玉県
2,味の素スタジアム,47851,東京都調布市西町376−3,東京都
3,町田GIONスタジアム,15320,東京都町田市野津田町2035,東京都
4,日産スタジアム,71624,神奈川県横浜市港北区小机町3300,神奈川県


In [12]:
# データにvenueをマージして、capacity列,prefecture列を追加
df_train_x = df_train_x.merge(df_venue[['venue', 'prefecture', 'capacity']], on='venue', how='left')
df_test_x = df_test_x.merge(df_venue[['venue', 'prefecture', 'capacity']], on='venue', how='left')

In [13]:
# obj型とそれ以外を分ける
df_train_x_obj = df_train_x.select_dtypes(include='object')
df_test_x_obj = df_test_x.select_dtypes(include='object')
df_train_x_num = df_train_x.select_dtypes(exclude='object')
df_test_x_num = df_test_x.select_dtypes(exclude='object')

In [14]:
# objをエンコーダーで数字を割り振る
list_cols = df_train_x_obj.columns
encoder = ce.OrdinalEncoder(cols=list_cols)
df_train_x_obj_enc = encoder.fit_transform(df_train_x_obj)
df_test_x_obj_enc = encoder.transform(df_test_x_obj)

In [15]:
df_train_x_coc = pd.concat([df_train_x_num, df_train_x_obj_enc], axis=1)
df_test_x_coc = pd.concat([df_test_x_num, df_test_x_obj_enc], axis=1)

In [16]:
# 何月かを示すmatch_monthを作成
df_train_x_coc['match_month'] = df_train_x_coc['match_date'].dt.month
df_test_x_coc['match_month'] = df_test_x_coc['match_date'].dt.month
df_train_x_coc.head()

Unnamed: 0,match_date,kick_off_time,temperature,humidity,holidays,capacity,home_team,away_team,venue,weather,broadcasters,section_round,prefecture,match_month
0,2006-03-04,16,8.3,40,1,21000,1,1,1,1,1,1,1,3
1,2006-03-05,13,12.9,28,1,15859,2,2,2,1,2,2,2,3
2,2006-03-05,13,12.1,35,1,47851,3,3,3,1,3,2,3,3
3,2006-03-05,14,11.6,42,1,51697,4,4,4,1,4,2,4,3
4,2006-03-05,14,13.1,32,1,20223,5,5,5,1,5,2,5,3


In [17]:
df_train_x_coc = df_train_x_coc.drop(['match_date'], axis=1)
df_test_x_coc = df_test_x_coc.drop(['match_date'], axis=1)

# Prediction

In [18]:
list_RMSE = []
for i, (train_index, test_index) in enumerate(kf.split(df_train_x_coc)):
  X_train, X_test = df_train_x_coc.iloc[train_index], df_train_x_coc.iloc[test_index]
  y_train, y_test = df_train_y.iloc[train_index], df_train_y.iloc[test_index]

  reg = RandomForestRegressor(n_estimators=100, random_state=42)
  reg.fit(X_train, y_train)
  y_pred = reg.predict(X_test)

  name = 'model_' + str(i) + '.pickle'
  with open(name, 'wb') as f:
    pickle.dump(reg, f)

  y_pred = reg.predict(X_test)

  RMSE = np.sqrt(mean_squared_error(y_test, y_pred))

  print(f'Fold {i+1}: RMSE = {RMSE}')
  list_RMSE.append(RMSE)

Fold 1: RMSE = 4286.445177850772
Fold 2: RMSE = 4557.361037125982
Fold 3: RMSE = 4557.330210689621
Fold 4: RMSE = 4542.725911960011
Fold 5: RMSE = 4796.873948758496


In [19]:
models = glob.glob('*.pickle')

In [20]:
list_pred = []
for model in models:
  with open(model, 'rb') as f:
    model = pickle.load(f)

  test_pred = model.predict(df_test_x_coc)

  list_pred.append(test_pred)

In [26]:
list_pred = np.array(list_pred)
print(list_pred.shape)

(5, 612)


In [27]:
average_pred = np.mean(list_pred, axis=0)
print(average_pred.shape)

(612,)


In [28]:
print(average_pred)

[14138.728 31640.704 17958.304 23019.364 13372.642 16872.442 17165.792
 27007.746 29193.24  18755.132 13743.138 11721.592 23816.618 23812.14
 20889.484 19803.932 13980.788 42030.992 21262.63  11914.04  13746.648
 17466.978 19100.186 12450.652 12748.498 17498.602 12753.206 26636.65
 15931.466 24242.786 21056.244 39511.462 19388.676 14801.14  12049.836
 28665.876 11812.852 16334.82  14841.32  18289.068 30234.094 20498.206
 13637.012 12849.236 28212.4   15386.908 31276.236 12962.104 33825.108
 11873.66  16757.838 11716.752 22016.106 32923.33  14168.428 25717.982
 18306.016 14285.432 13664.494 16990.628 17873.392 11616.57  12047.16
 14477.248 12055.416 13884.79  28372.798 13830.384 23951.742 12665.824
 37387.84  12411.57  14109.598 14501.38  14379.988 36434.87  21408.294
 26391.084 18590.37  20202.6   15349.61  10129.788 12429.558 12849.868
 21315.922 10127.11  13507.312 14361.674 13204.356 13668.092 16242.184
 14505.208 14809.432 26935.144 13265.734 37628.224 18628.94  30575.398
 21871.30

In [29]:
submission = pd.DataFrame({'id': df_test['id'], 'attendance': average_pred})
submission.to_csv('Deroito_J_league_08.csv', index=False, header=False)