# Setup

In [None]:
!pip install category_encoders
import category_encoders as ce



In [None]:
import lightgbm as lgb

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
# Load the competition tables (train/test matches, Japanese holidays, venue info).
# The data directory is defined once so it can be changed in a single place
# when the notebook is run outside this Google Drive layout.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/TOP_SIGNATE/Deroito_competition'

df_train = pd.read_csv(f'{DATA_DIR}/train.csv')
df_test = pd.read_csv(f'{DATA_DIR}/test.csv')
df_holidays = pd.read_csv(f'{DATA_DIR}/holidays_in_japan.csv')
df_venue = pd.read_csv(f'{DATA_DIR}/venue_information.csv')

In [None]:
# Separate the target (attendance) from the features; `id` is not a feature.
df_train_y = df_train['attendance']
df_train_x = df_train.drop(columns=['id', 'attendance'])
df_test_x = df_test.drop(columns=['id'])

# Feature engineering

In [None]:
# Convert the date columns to datetime dtype
df_holidays['holiday_date'] = pd.to_datetime(df_holidays['holiday_date'])
for frame in (df_train_x, df_test_x):
    frame['match_date'] = pd.to_datetime(frame['match_date'])

In [None]:
# Add a `holidays` column: 1 for a day off (Saturday, Sunday or a date listed
# in df_holidays), 0 for a working day.
def make_holiday_flag(match_dates, holiday_dates):
    """Return an int64 Series that is 1 on weekends/listed holidays and 0 otherwise.

    Parameters
    ----------
    match_dates : pd.Series of datetime64
        Dates of the matches.
    holiday_dates : pd.Series of datetime64
        Dates of the public holidays.

    The holiday test is done on the date itself. Testing the day-of-week
    integer against the holiday dates can never match, which would silently
    reduce this feature to a pure weekend flag.
    """
    is_weekend = match_dates.dt.dayofweek.isin([5, 6])
    # normalize() drops any time-of-day component so only calendar dates are compared
    is_public_holiday = match_dates.dt.normalize().isin(holiday_dates.dt.normalize())
    return (is_weekend | is_public_holiday).astype('int64')


df_train_x['holidays'] = make_holiday_flag(df_train_x['match_date'], df_holidays['holiday_date'])
df_test_x['holidays'] = make_holiday_flag(df_test_x['match_date'], df_holidays['holiday_date'])

In [None]:
# Build section_round from section and round, then drop the two source columns
df_train_x = (
    df_train_x
    .assign(section_round=df_train_x['section'] + '_' + df_train_x['round'])
    .drop(columns=['section', 'round'])
)
df_test_x = (
    df_test_x
    .assign(section_round=df_test_x['section'] + '_' + df_test_x['round'])
    .drop(columns=['section', 'round'])
)

In [None]:
# Attach the capacity column from df_venue (left join on venue keeps every match row)
venue_capacity = df_venue[['venue', 'capacity']]
df_train_x = df_train_x.merge(venue_capacity, on='venue', how='left')
df_test_x = df_test_x.merge(venue_capacity, on='venue', how='left')
df_train_x.head()

Unnamed: 0,match_date,kick_off_time,home_team,away_team,venue,weather,temperature,humidity,broadcasters,holidays,section_round,capacity
0,2006-03-04,16:04,G大阪,浦和,万博記念競技場,晴,8.3,40,NHK総合/J SPORTS(録),1,第1節_第1日,21000
1,2006-03-05,13:00,甲府,清水,山梨県小瀬スポーツ公園陸上競技場,晴,12.9,28,山梨放送/テレビ静岡(録)/J SPORTS(録),1,第1節_第2日,15859
2,2006-03-05,13:35,FC東京,大分,味の素スタジアム,晴,12.1,35,BS-i/MXテレビ(録)/J SPORTS(録),1,第1節_第2日,47851
3,2006-03-05,14:04,磐田,福岡,静岡スタジアムエコパ,晴,11.6,42,J SPORTS,1,第1節_第2日,51697
4,2006-03-05,14:04,名古屋,C大阪,名古屋市瑞穂陸上競技場,晴,13.1,32,スカイパーフェクTV!/NHK名古屋(録)/NHK大阪(録)/J SPORTS(録),1,第1節_第2日,20223


In [None]:
# Reduce kick_off_time ("HH:MM") to the hour only
for frame in (df_train_x, df_test_x):
    kick_off = pd.to_datetime(frame['kick_off_time'], format='%H:%M')
    frame['kick_off_time'] = kick_off.dt.hour
df_train_x.head()

Unnamed: 0,match_date,kick_off_time,home_team,away_team,venue,weather,temperature,humidity,broadcasters,holidays,section_round,capacity
0,2006-03-04,16,G大阪,浦和,万博記念競技場,晴,8.3,40,NHK総合/J SPORTS(録),1,第1節_第1日,21000
1,2006-03-05,13,甲府,清水,山梨県小瀬スポーツ公園陸上競技場,晴,12.9,28,山梨放送/テレビ静岡(録)/J SPORTS(録),1,第1節_第2日,15859
2,2006-03-05,13,FC東京,大分,味の素スタジアム,晴,12.1,35,BS-i/MXテレビ(録)/J SPORTS(録),1,第1節_第2日,47851
3,2006-03-05,14,磐田,福岡,静岡スタジアムエコパ,晴,11.6,42,J SPORTS,1,第1節_第2日,51697
4,2006-03-05,14,名古屋,C大阪,名古屋市瑞穂陸上競技場,晴,13.1,32,スカイパーフェクTV!/NHK名古屋(録)/NHK大阪(録)/J SPORTS(録),1,第1節_第2日,20223


In [None]:
# Split the columns into object dtype (to be encoded) and everything else
df_train_x_obj, df_train_x_num = (
    df_train_x.select_dtypes(include=['object']),
    df_train_x.select_dtypes(exclude=['object']),
)
df_test_x_obj, df_test_x_num = (
    df_test_x.select_dtypes(include=['object']),
    df_test_x.select_dtypes(exclude=['object']),
)

In [None]:
# Map each object column to integer codes; the encoder is fitted on the
# training data only and then reused for the test data.
list_cols = df_train_x_obj.columns
encoder = ce.OrdinalEncoder(cols=list_cols)
encoder.fit(df_train_x_obj)
df_train_x_obj_enc = encoder.transform(df_train_x_obj)
df_test_x_obj_enc = encoder.transform(df_test_x_obj)

In [None]:
# Put the non-object columns and the encoded object columns back side by side
df_train_x_coc = pd.concat(
    objs=[df_train_x_num, df_train_x_obj_enc],
    axis=1,
)
df_test_x_coc = pd.concat(
    objs=[df_test_x_num, df_test_x_obj_enc],
    axis=1,
)

In [None]:
# Preview the first rows of the combined training features
df_train_x_coc.head()

Unnamed: 0,match_date,kick_off_time,temperature,humidity,holidays,capacity,home_team,away_team,venue,weather,broadcasters,section_round
0,2006-03-04,16,8.3,40,1,21000,1,1,1,1,1,1
1,2006-03-05,13,12.9,28,1,15859,2,2,2,1,2,2
2,2006-03-05,13,12.1,35,1,47851,3,3,3,1,3,2
3,2006-03-05,14,11.6,42,1,51697,4,4,4,1,4,2
4,2006-03-05,14,13.1,32,1,20223,5,5,5,1,5,2


In [None]:
# Show column dtypes and non-null counts of the combined training features
df_train_x_coc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3672 entries, 0 to 3671
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   match_date     3672 non-null   datetime64[ns]
 1   kick_off_time  3672 non-null   int32         
 2   temperature    3672 non-null   float64       
 3   humidity       3672 non-null   int64         
 4   holidays       3672 non-null   int64         
 5   capacity       3672 non-null   int64         
 6   home_team      3672 non-null   int64         
 7   away_team      3672 non-null   int64         
 8   venue          3672 non-null   int64         
 9   weather        3672 non-null   int64         
 10  broadcasters   3672 non-null   int64         
 11  section_round  3672 non-null   int64         
dtypes: datetime64[ns](1), float64(1), int32(1), int64(9)
memory usage: 330.0 KB


In [None]:
# Create match_month: the calendar month in which the match is played
df_train_x_coc = df_train_x_coc.assign(match_month=lambda d: d['match_date'].dt.month)
df_test_x_coc = df_test_x_coc.assign(match_month=lambda d: d['match_date'].dt.month)
df_train_x_coc.head()

Unnamed: 0,match_date,kick_off_time,temperature,humidity,holidays,capacity,home_team,away_team,venue,weather,broadcasters,section_round,match_month
0,2006-03-04,16,8.3,40,1,21000,1,1,1,1,1,1,3
1,2006-03-05,13,12.9,28,1,15859,2,2,2,1,2,2,3
2,2006-03-05,13,12.1,35,1,47851,3,3,3,1,3,2,3
3,2006-03-05,14,11.6,42,1,51697,4,4,4,1,4,2,3
4,2006-03-05,14,13.1,32,1,20223,5,5,5,1,5,2,3


In [None]:
# The raw datetime column is removed before the frames are passed to the model
df_train_x_coc = df_train_x_coc.drop(columns=['match_date'])
df_test_x_coc = df_test_x_coc.drop(columns=['match_date'])

# Prediction

In [None]:
# LightGBM model configuration (gradient-boosted trees, regression, RMSE metric)
params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=0,
)

In [None]:
# Hold out 20% of the training data for validation.
# random_state is fixed so the split, and therefore the reported score,
# is the same on every Restart & Run All.
x_train_pra, x_train_val, y_train_pra, y_train_val = train_test_split(
    df_train_x_coc,
    df_train_y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Wrap the training and validation splits as LightGBM datasets
train_data = lgb.Dataset(data=x_train_pra, label=y_train_pra)
eval_data = lgb.Dataset(data=x_train_val, label=y_train_val)

In [None]:
# Train for 1000 boosting rounds, passing the hold-out set as a validation set
model = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=1000,
    valid_sets=[eval_data],
)

In [None]:
# Check the fit on the hold-out set (R^2 between actual and predicted attendance)
y_pred = model.predict(x_train_val)
score = r2_score(y_true=y_train_val, y_pred=y_pred)
print(score)

0.8106507983944263


In [None]:
# Predict attendance for the competition test set
y_test_pred = model.predict(data=df_test_x_coc)

In [None]:
# Build the submission (id, predicted attendance) and write it without index or header row
submission = df_test[['id']].assign(attendance=y_test_pred)
submission.to_csv(
    'Deroito_J_league_05.csv',
    header=False,
    index=False,
)