# Setup

In [None]:
!pip install category_encoders
import category_encoders as ce



In [None]:
import lightgbm as lgb

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
# Directory holding all competition CSVs (presumably a mounted Google Drive
# folder - confirm the mount step exists). Kept in one constant so the location
# only has to be changed in one place.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/TOP_SIGNATE/Deroito_competition'

df_train = pd.read_csv(f'{DATA_DIR}/train.csv')
df_test = pd.read_csv(f'{DATA_DIR}/test.csv')
df_holidays = pd.read_csv(f'{DATA_DIR}/holidays_in_japan.csv')
df_venue = pd.read_csv(f'{DATA_DIR}/venue_information.csv')

In [None]:
# Split the training frame into features and target; id and kick_off_time are
# removed from both feature frames.
target_col = 'attendance'
unused_cols = ['id', 'kick_off_time']

df_train_y = df_train[target_col]
df_train_x = df_train.drop(columns=unused_cols + [target_col])
df_test_x = df_test.drop(columns=unused_cols)

# Feature engineering

In [None]:
# Convert the date columns to datetime dtype.
df_holidays['holiday_date'] = pd.to_datetime(df_holidays['holiday_date'])
for features in (df_train_x, df_test_x):
    features['match_date'] = pd.to_datetime(features['match_date'])

In [None]:
# Add a holidays column: 1 for a weekend or public holiday, 0 for a weekday.
def _holiday_flag(match_dates, holiday_dates):
    """Return an int64 Series: 1 for Saturdays, Sundays and listed holidays, else 0.

    match_dates   -- datetime Series of match dates.
    holiday_dates -- datetime Series of public-holiday dates.
    """
    # dayofweek is 0 (Monday) .. 6 (Sunday), so 5 and 6 are the weekend.
    is_weekend = match_dates.dt.dayofweek.isin([5, 6])
    # Compare the calendar date itself (not the day-of-week integer) against the
    # holiday list; normalize() drops any time-of-day component on both sides.
    is_public_holiday = match_dates.dt.normalize().isin(holiday_dates.dt.normalize())
    return (is_weekend | is_public_holiday).astype('int64')


df_train_x['holidays'] = _holiday_flag(df_train_x['match_date'], df_holidays['holiday_date'])
df_test_x['holidays'] = _holiday_flag(df_test_x['match_date'], df_holidays['holiday_date'])

In [None]:
# Preview the first rows of the training features, including the new holidays column.
df_train_x.head()

Unnamed: 0,match_date,section,round,home_team,away_team,venue,weather,temperature,humidity,broadcasters,holidays
0,2006-03-04,第1節,第1日,G大阪,浦和,万博記念競技場,晴,8.3,40,NHK総合/J SPORTS(録),1
1,2006-03-05,第1節,第2日,甲府,清水,山梨県小瀬スポーツ公園陸上競技場,晴,12.9,28,山梨放送/テレビ静岡(録)/J SPORTS(録),1
2,2006-03-05,第1節,第2日,FC東京,大分,味の素スタジアム,晴,12.1,35,BS-i/MXテレビ(録)/J SPORTS(録),1
3,2006-03-05,第1節,第2日,磐田,福岡,静岡スタジアムエコパ,晴,11.6,42,J SPORTS,1
4,2006-03-05,第1節,第2日,名古屋,C大阪,名古屋市瑞穂陸上競技場,晴,13.1,32,スカイパーフェクTV!/NHK名古屋(録)/NHK大阪(録)/J SPORTS(録),1


In [None]:
# Build section_round as "<section>_<round>" and drop the two source columns.
df_train_x = (
    df_train_x
    .assign(section_round=df_train_x['section'] + '_' + df_train_x['round'])
    .drop(columns=['section', 'round'])
)
df_test_x = (
    df_test_x
    .assign(section_round=df_test_x['section'] + '_' + df_test_x['round'])
    .drop(columns=['section', 'round'])
)

In [None]:
# Check column dtypes and non-null counts of the training features.
df_train_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3672 entries, 0 to 3671
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   match_date     3672 non-null   datetime64[ns]
 1   home_team      3672 non-null   object        
 2   away_team      3672 non-null   object        
 3   venue          3672 non-null   object        
 4   weather        3672 non-null   object        
 5   temperature    3672 non-null   float64       
 6   humidity       3672 non-null   int64         
 7   broadcasters   3672 non-null   object        
 8   holidays       3672 non-null   int64         
 9   section_round  3672 non-null   object        
dtypes: datetime64[ns](1), float64(1), int64(2), object(6)
memory usage: 287.0+ KB


In [None]:
# Attach each venue's capacity from df_venue with a left join on venue.
venue_capacity = df_venue[['venue', 'capacity']]
df_train_x = df_train_x.merge(venue_capacity, how='left', on='venue')
df_test_x = df_test_x.merge(venue_capacity, how='left', on='venue')
df_train_x.head()

Unnamed: 0,match_date,home_team,away_team,venue,weather,temperature,humidity,broadcasters,holidays,section_round,capacity
0,2006-03-04,G大阪,浦和,万博記念競技場,晴,8.3,40,NHK総合/J SPORTS(録),1,第1節_第1日,21000
1,2006-03-05,甲府,清水,山梨県小瀬スポーツ公園陸上競技場,晴,12.9,28,山梨放送/テレビ静岡(録)/J SPORTS(録),1,第1節_第2日,15859
2,2006-03-05,FC東京,大分,味の素スタジアム,晴,12.1,35,BS-i/MXテレビ(録)/J SPORTS(録),1,第1節_第2日,47851
3,2006-03-05,磐田,福岡,静岡スタジアムエコパ,晴,11.6,42,J SPORTS,1,第1節_第2日,51697
4,2006-03-05,名古屋,C大阪,名古屋市瑞穂陸上競技場,晴,13.1,32,スカイパーフェクTV!/NHK名古屋(録)/NHK大阪(録)/J SPORTS(録),1,第1節_第2日,20223


In [None]:
# Split each feature frame into object-dtype columns and all remaining columns.
df_train_x_obj, df_train_x_num = (
    df_train_x.select_dtypes(include=['object']),
    df_train_x.select_dtypes(exclude=['object']),
)
df_test_x_obj, df_test_x_num = (
    df_test_x.select_dtypes(include=['object']),
    df_test_x.select_dtypes(exclude=['object']),
)

In [None]:
# Ordinal-encode the object columns: fit on the training frame only, then apply
# the same fitted encoder to both train and test.
list_cols = df_train_x_obj.columns
encoder = ce.OrdinalEncoder(cols=list_cols)
encoder.fit(df_train_x_obj)
df_train_x_obj_enc = encoder.transform(df_train_x_obj)
df_test_x_obj_enc = encoder.transform(df_test_x_obj)

In [None]:
# Put the non-object columns and the encoded object columns back side by side.
train_parts = [df_train_x_num, df_train_x_obj_enc]
test_parts = [df_test_x_num, df_test_x_obj_enc]
df_train_x_coc = pd.concat(train_parts, axis='columns')
df_test_x_coc = pd.concat(test_parts, axis='columns')

In [None]:
# Preview the combined training features (non-object columns + encoded columns).
df_train_x_coc.head()

Unnamed: 0,match_date,temperature,humidity,holidays,capacity,home_team,away_team,venue,weather,broadcasters,section_round
0,2006-03-04,8.3,40,1,21000,1,1,1,1,1,1
1,2006-03-05,12.9,28,1,15859,2,2,2,1,2,2
2,2006-03-05,12.1,35,1,47851,3,3,3,1,3,2
3,2006-03-05,11.6,42,1,51697,4,4,4,1,4,2
4,2006-03-05,13.1,32,1,20223,5,5,5,1,5,2


In [None]:
# Preview the combined test features (non-object columns + encoded columns).
df_test_x_coc.head()

Unnamed: 0,match_date,temperature,humidity,holidays,capacity,home_team,away_team,venue,weather,broadcasters,section_round
0,2018-02-23,6.6,57,0,24130,27.0,19.0,49.0,1.0,611.0,1.0
1,2018-02-24,14.1,40,1,47851,3.0,1.0,3.0,1.0,626.0,2.0
2,2018-02-24,16.3,51,1,36894,8.0,22.0,51.0,1.0,-1.0,2.0
3,2018-02-24,12.9,42,1,39694,31.0,13.0,-1.0,2.0,611.0,2.0
4,2018-02-24,13.8,50,1,15380,25.0,-1.0,-1.0,1.0,611.0,2.0


In [None]:
# Remove match_date from both feature frames before modelling.
df_train_x_coc, df_test_x_coc = (
    frame.drop(columns='match_date') for frame in (df_train_x_coc, df_test_x_coc)
)

# Prediction

In [None]:
# LightGBM hyperparameters handed to lgb.train: a gradient-boosted-tree
# regression evaluated with RMSE.
params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=0,
)

In [None]:
# Split into training and validation sets (20% held out for validation).
# random_state pins the shuffle so the split, and therefore the validation
# score, is the same on every run.
x_train_pra, x_train_val, y_train_pra, y_train_val = train_test_split(
    df_train_x_coc, df_train_y, test_size=0.2, random_state=42
)

In [None]:
# Wrap the training and validation splits as LightGBM datasets.
train_data = lgb.Dataset(data=x_train_pra, label=y_train_pra)
eval_data = lgb.Dataset(data=x_train_val, label=y_train_val)

In [None]:
# Train.
# num_boost_round is only an upper bound: the early-stopping callback monitors
# the metric on eval_data and stops once it has not improved for 50 rounds, so
# the validation set passed in valid_sets actually limits over-fitting.
model = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[eval_data],
    callbacks=[lgb.early_stopping(stopping_rounds=50)],
)

In [None]:
# Score the model on the validation split using R^2.
y_pred = model.predict(x_train_val)
score = r2_score(y_true=y_train_val, y_pred=y_pred)
print(score)

0.7974519290225964


In [None]:
# Predict on the test features with the trained model.
y_test_pred = model.predict(df_test_x_coc)

In [None]:
# Pair each test id with its predicted attendance and save with a header row.
submission = df_test[['id']].assign(attendance=y_test_pred)
submission.to_csv('Deroito_J_league_03.csv', header=True, index=False)

In [None]:
# Same id/attendance table, written without a header row.
submission = df_test[['id']].assign(attendance=y_test_pred)
submission.to_csv('Deroito_J_league_04.csv', header=False, index=False)