# Setting

In [1]:
!pip install category_encoders
import category_encoders as ce



In [2]:
import lightgbm as lgb

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

In [4]:
df_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/TOP_SIGNATE/Deroito_competition/train.csv')
df_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/TOP_SIGNATE/Deroito_competition/test.csv')
df_holidays = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/TOP_SIGNATE/Deroito_competition/holidays_in_japan.csv')
df_venue = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/TOP_SIGNATE/Deroito_competition/venue_information.csv')
df_match_info = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/TOP_SIGNATE/Deroito_competition/match_reports.csv')

In [5]:
df_train_x = df_train.drop(['id', 'attendance'], axis=1)
df_train_y = df_train['attendance']
df_test_x = df_test.drop(['id'], axis=1)

# Feature engineering

In [6]:
# 日付をdatetime型にする
df_holidays['holiday_date'] = pd.to_datetime(df_holidays['holiday_date'])
df_train_x['match_date'] = pd.to_datetime(df_train_x['match_date'])
df_test_x['match_date'] = pd.to_datetime(df_test_x['match_date'])

In [7]:
# 休日なら1、平日なら0のholidays列をデータフレームに追加
df_train_x['holidays'] =df_train_x['match_date'].dt.dayofweek.apply(lambda x: 1 if(x in df_holidays['holiday_date'].values or x in [5,6]) else 0)
df_test_x['holidays'] =df_test_x['match_date'].dt.dayofweek.apply(lambda x: 1 if(x in df_holidays['holiday_date'].values or x in [5,6]) else 0)

In [8]:
# sectionとroundからsection_roundを作る
df_train_x['section_round'] = df_train_x['section'] + '_' + df_train_x['round']
df_test_x['section_round'] = df_test_x['section'] + '_' + df_test_x['round']
df_train_x = df_train_x.drop(['section', 'round'], axis=1)
df_test_x = df_test_x.drop(['section', 'round'], axis=1)

In [9]:
# kick_off_timeをhourだけにする
df_train_x['kick_off_time'] = pd.to_datetime(df_train_x['kick_off_time'], format='%H:%M').dt.hour
df_test_x['kick_off_time'] = pd.to_datetime(df_test_x['kick_off_time'], format='%H:%M').dt.hour
df_train_x.head()

Unnamed: 0,match_date,kick_off_time,home_team,away_team,venue,weather,temperature,humidity,broadcasters,holidays,section_round
0,2006-03-04,16,G大阪,浦和,万博記念競技場,晴,8.3,40,NHK総合/J SPORTS(録),1,第1節_第1日
1,2006-03-05,13,甲府,清水,山梨県小瀬スポーツ公園陸上競技場,晴,12.9,28,山梨放送/テレビ静岡(録)/J SPORTS(録),1,第1節_第2日
2,2006-03-05,13,FC東京,大分,味の素スタジアム,晴,12.1,35,BS-i/MXテレビ(録)/J SPORTS(録),1,第1節_第2日
3,2006-03-05,14,磐田,福岡,静岡スタジアムエコパ,晴,11.6,42,J SPORTS,1,第1節_第2日
4,2006-03-05,14,名古屋,C大阪,名古屋市瑞穂陸上競技場,晴,13.1,32,スカイパーフェクTV!/NHK名古屋(録)/NHK大阪(録)/J SPORTS(録),1,第1節_第2日


In [10]:
def extract_prefecture(address):
  if '県' in address:
    return address[:address.find('県')+1]
  elif '道' in address:
    return address[:address.find('道')+1]
  elif '府' in address:
    return address[:address.find('府')+1]
  elif '都' in address:
    return address[:address.find('都')+1]
  else:
    return None

In [11]:
df_venue['prefecture'] = df_venue['address'].apply(extract_prefecture)
df_venue.head()

Unnamed: 0,venue,capacity,address,prefecture
0,札幌ドーム,38794,北海道札幌市豊平区羊ヶ丘1,北海道
1,埼玉スタジアム2002,62010,埼玉県さいたま市緑区中野田500,埼玉県
2,味の素スタジアム,47851,東京都調布市西町376−3,東京都
3,町田GIONスタジアム,15320,東京都町田市野津田町2035,東京都
4,日産スタジアム,71624,神奈川県横浜市港北区小机町3300,神奈川県


In [12]:
df_train_x = df_train_x.merge(df_venue[['venue', 'prefecture', 'capacity']], on='venue', how='left')
df_test_x = df_test_x.merge(df_venue[['venue', 'prefecture', 'capacity']], on='venue', how='left')

In [13]:
# obj型とそれ以外を分ける
df_train_x_obj = df_train_x.select_dtypes(include='object')
df_test_x_obj = df_test_x.select_dtypes(include='object')
df_train_x_num = df_train_x.select_dtypes(exclude='object')
df_test_x_num = df_test_x.select_dtypes(exclude='object')

In [14]:
# objをエンコーダーで数字を割り振る
list_cols = df_train_x_obj.columns
encoder = ce.OrdinalEncoder(cols=list_cols)
df_train_x_obj_enc = encoder.fit_transform(df_train_x_obj)
df_test_x_obj_enc = encoder.transform(df_test_x_obj)

In [16]:
df_train_x_coc = pd.concat([df_train_x_num, df_train_x_obj_enc], axis=1)
df_test_x_coc = pd.concat([df_test_x_num, df_test_x_obj_enc], axis=1)

In [17]:
# 何月かを示すmatch_monthを作成
df_train_x_coc['match_month'] = df_train_x_coc['match_date'].dt.month
df_test_x_coc['match_month'] = df_test_x_coc['match_date'].dt.month
df_train_x_coc.head()

Unnamed: 0,match_date,kick_off_time,temperature,humidity,holidays,capacity,home_team,away_team,venue,weather,broadcasters,section_round,prefecture,match_month
0,2006-03-04,16,8.3,40,1,21000,1,1,1,1,1,1,1,3
1,2006-03-05,13,12.9,28,1,15859,2,2,2,1,2,2,2,3
2,2006-03-05,13,12.1,35,1,47851,3,3,3,1,3,2,3,3
3,2006-03-05,14,11.6,42,1,51697,4,4,4,1,4,2,4,3
4,2006-03-05,14,13.1,32,1,20223,5,5,5,1,5,2,5,3


In [18]:
df_train_x_coc = df_train_x_coc.drop(['match_date'], axis=1)
df_test_x_coc = df_test_x_coc.drop(['match_date'], axis=1)

In [33]:
df_train_x_coc['prefecture'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24])

# prediction

In [26]:
# LightGBMのモデル構築
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

In [27]:
# 評価用と訓練用にデータを分ける
x_train_pra, x_train_val, y_train_pra, y_train_val = train_test_split(df_train_x_coc, df_train_y, test_size=0.2)

In [28]:
# 評価用と訓練用のデータをlgbに登録
train_data = lgb.Dataset(x_train_pra, label=y_train_pra)
eval_data = lgb.Dataset(x_train_val, label=y_train_val)

In [29]:
# 学習
model = lgb.train(params, train_data, valid_sets=[eval_data], num_boost_round=1000)

In [30]:
# 評価を見てみる
y_pred = model.predict(x_train_val)
score = r2_score(y_train_val, y_pred)
print(score)

0.7981619683269745


In [31]:
y_test_pred = model.predict(df_test_x_coc)

In [32]:
submission = pd.DataFrame({'id': df_test['id'], 'attendance': y_test_pred})
submission.to_csv('Deroito_J_league_06.csv', index=False, header=False)