In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Read the train and test pitch tables from the competition input folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/hiroshima/train_pitch.csv',
        '../input/hiroshima/test_pitch.csv',
    )
)

In [None]:
# Stack train on top of test so both receive the same preprocessing; the
# frames are separated again later by row position using len(train).
# sort=False leaves the column axis unsorted when the two frames' columns differ.
data = pd.concat(objs=[train, test], axis=0, sort=False)

In [None]:
# Display the column labels of the combined train+test frame.
data.columns

In [None]:
# Explicit column order applied to the combined frame (51 columns).
labels = [
    # identifiers / target / game-level columns
    'データ内連番', '球種', '投球位置区域', '年度', '試合ID', '試合内連番',
    '試合内投球数', '日付', '時刻', 'ホームチームID', 'アウェイチームID',
    '球場ID', '球場名', '試合種別詳細',
    # inning / plate-appearance counters
    'イニング', '表裏', 'イニング内打席数', '打席内投球数',
    # columns prefixed 投手 (pitcher)
    '投手ID', '投手チームID', '投手投球左右', '投手役割', '投手登板順',
    '投手試合内対戦打者数', '投手試合内投球数', '投手イニング内投球数',
    # columns prefixed 打者 (batter)
    '打者ID', '打者チームID', '打者打席左右', '打者打順', '打者守備位置',
    '打者試合内打席数',
    # columns prefixed プレイ前 (state before the play)
    'プレイ前ホームチーム得点数', 'プレイ前アウェイチーム得点数',
    'プレイ前アウト数', 'プレイ前ボール数', 'プレイ前ストライク数',
    'プレイ前走者状況',
    # runner and fielder IDs
    '一塁走者ID', '二塁走者ID', '三塁走者ID',
    '捕手ID', '一塁手ID', '二塁手ID', '三塁手ID', '遊撃手ID',
    '左翼手ID', '中堅手ID', '右翼手ID',
    # IDs the result is credited to
    '成績対象投手ID', '成績対象打者ID',
]

In [None]:
# Put the columns into the order given by `labels`; any label that is missing
# from the frame becomes an all-NaN column, and unlisted columns are dropped.
data = data.reindex(columns=labels)

In [None]:
# Collapse the batter's fielding position to a flag: 0 for '投', 1 for the
# eight other listed positions. Values not in the list are left untouched.
# The result is assigned back rather than using chained `inplace=True`, which
# is deprecated and has no effect under pandas copy-on-write.
data['打者守備位置'] = data['打者守備位置'].replace(
    ['投', '捕', '一', '二', '三', '遊', '左', '中', '右'],
    [0, 1, 1, 1, 1, 1, 1, 1, 1])

# Encode the base-runner pattern string as a digit string '0'..'7'; it is cast
# to an integer dtype below.
data['プレイ前走者状況'] = data['プレイ前走者状況'].replace(
    ['___', '_2_', '1__', '12_', '_23', '1_3', '__3', '123'],
    ['0', '1', '2', '3', '4', '5', '6', '7'])

# Missing first-base runner IDs become 0.
data['一塁走者ID'] = data['一塁走者ID'].fillna(0)

# Downcast the columns that are kept as model features. Each column is cast
# from itself: the original cell filled '成績対象打者ID' from '捕手ID', which
# silently turned that feature into a copy of the catcher ID.
# NOTE(review): the integer casts raise if a column contains missing values.
data = data.astype({
    'データ内連番': np.int32,
    # Float so NaN can be stored — presumably the test rows carry no label;
    # verify against the input files.
    '球種': np.float16,
    'イニング': np.int8,
    '投手ID': np.int32,
    '打者ID': np.int32,
    'プレイ前ボール数': np.int8,
    'プレイ前ストライク数': np.int8,
    'プレイ前走者状況': np.int8,
    '一塁走者ID': np.float32,
    '捕手ID': np.int32,
    '成績対象打者ID': np.int32,
})

In [None]:
# Columns that are not used as model features; removed from `data` in place.
unused_columns = [
    '年度', '試合種別詳細', '表裏', '日付', '時刻',
    '投手登板順', '投手試合内対戦打者数', '打者チームID', '打者試合内打席数', '球場名',
    'プレイ前ホームチーム得点数', '投手チームID', '投手試合内投球数',
    'プレイ前アウェイチーム得点数', 'プレイ前アウト数', '打者守備位置',
    '一塁手ID', '二塁手ID', '三塁手ID', '遊撃手ID', '左翼手ID',
    '中堅手ID', '右翼手ID', '投手役割', '投手投球左右', '試合ID',
    'ホームチームID', 'アウェイチームID', '成績対象投手ID', '三塁走者ID',
    '打者打順', '二塁走者ID', 'イニング内打席数',
    '試合内連番', '打者打席左右', '投手イニング内投球数', '打席内投球数',
    '投球位置区域', '試合内投球数', '球場ID',
]
data.drop(columns=unused_columns, inplace=True)

In [None]:
# Japanese -> English column names. Keys that are not present in the frame are
# ignored by `rename`. Note that 'batter ID' and 'Batter ID' are two distinct
# columns that differ only in capitalisation.
column_translation = {
    'データ内連番': 'ID',
    '球種': 'Ball type',
    'イニング': 'Inning',
    '投手ID': 'Pitcher ID',
    '投手役割': 'Pitcher role',
    '投手投球左右': 'L/R Pitcher',
    '打者ID': 'batter ID',
    'プレイ前ホームチーム得点数': 'The sum of points',
    'プレイ前ボール数': 'The amount of balls',
    'プレイ前ストライク数': 'the amount of strike',
    'プレイ前走者状況': 'The runner situation',
    '捕手ID': 'catcher id',
    '一塁走者ID': 'First runner',
    '成績対象打者ID': 'Batter ID',
}
data.rename(columns=column_translation, inplace=True)

In [None]:
# Undo the earlier concatenation by row position: the first len(train) rows
# are the training rows, everything after them is the test set.
n_train_rows = len(train)
train, test = data.iloc[:n_train_rows], data.iloc[n_train_rows:]

In [None]:
# Separate the target column from the feature columns.
target_column = 'Ball type'
y_train = train[target_column]
X_train, X_test = (
    frame.drop(columns=target_column) for frame in (train, test)
)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled rows for validation, stratified on the target
# so both parts keep the same class proportions. The seed is fixed so the
# split is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    random_state=0,
    stratify=y_train,
)

In [None]:
# Feature columns that LightGBM is told to treat as categorical.
categorical_features = [
    'Inning',
    'Pitcher ID',
    'batter ID',
    'The amount of balls',
    'the amount of strike',
    'The runner situation',
    'First runner',
    'catcher id',
    'Batter ID',
]

# LightGBM

In [None]:
import lightgbm as lgb

# Wrap the train / validation splits as LightGBM datasets. The validation set
# references the training set so that it shares the same binning.
lgb_train = lgb.Dataset(
    data=X_train,
    label=y_train,
    categorical_feature=categorical_features,
)
lgb_eval = lgb.Dataset(
    data=X_valid,
    label=y_valid,
    reference=lgb_train,
    categorical_feature=categorical_features,
)

# 8-class classification evaluated with multi-class log loss.
params = dict(
    objective='multiclass',
    metric={'multi_logloss'},
    num_class=8,
)

# Train for at most 1000 rounds, logging every 10 rounds and stopping once the
# validation metric has not improved for 10 consecutive rounds.
model = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_train, lgb_eval],
    early_stopping_rounds=10,
    verbose_eval=10,
)

# Predict the test set with the best iteration found by early stopping.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

In [None]:
# Build the submission file from the test-set class probabilities in `y_pred`.
#
# The sample file is read with header=None: the column names hard-coded in the
# original cell ('0', '0.125', ...) are the file's first data row, so reading
# it with the default header dropped that row and shifted every prediction onto
# the wrong ID when joining by index.
# NOTE(review): assumes the sample file is header-less with the row ID in the
# first column followed by one placeholder probability per class, and that the
# competition expects the same header-less layout back — confirm.
sample = pd.read_csv('../input/hiroshima/sample_submit_ball_type.csv', header=None)

if len(sample) != len(y_pred):
    raise ValueError(
        'sample submission has {} rows but {} predictions were made'.format(
            len(sample), len(y_pred)))

# Keep only the ID column from the sample and attach the predicted
# probabilities row by row (same index, so the alignment is positional).
probabilities = pd.DataFrame(y_pred, index=sample.index)
sub = pd.concat([sample.iloc[:, [0]], probabilities], axis=1)
sub.columns = ['ID'] + ['class{}'.format(k + 1) for k in range(probabilities.shape[1])]

sub.to_csv('submission_lightgbm.csv', index=False, header=False)

# HyperParameter 調整

In [None]:
import lightgbm as lgb
import optuna
from sklearn.metrics import log_loss

def objective(trial):
    """Optuna objective: train LightGBM with sampled settings and score it.

    `max_bin` (255-500) and `num_leaves` (32-128) are drawn from `trial`; the
    learning rate is fixed at 0.02. The model is trained on the notebook-level
    `X_train` / `y_train` with early stopping on `X_valid` / `y_valid`.

    Returns:
        The multi-class log loss of the best iteration on the validation split.
    """
    # Sample in a fixed order (max_bin first) so seeded samplers are reproducible.
    max_bin = trial.suggest_int('max_bin', 255, 500)
    num_leaves = trial.suggest_int('num_leaves', 32, 128)
    booster_params = {
        'objective': 'multiclass',
        'max_bin': max_bin,
        'learning_rate': 0.02,
        'num_leaves': num_leaves,
        'num_class': 8,
    }

    train_set = lgb.Dataset(
        X_train, y_train, categorical_feature=categorical_features)
    valid_set = lgb.Dataset(
        X_valid, y_valid, reference=train_set,
        categorical_feature=categorical_features)

    booster = lgb.train(
        booster_params,
        train_set,
        num_boost_round=500,
        valid_sets=[train_set, valid_set],
        early_stopping_rounds=10,
        verbose_eval=10,
    )

    valid_proba = booster.predict(
        X_valid, num_iteration=booster.best_iteration)
    return log_loss(y_valid, valid_proba)

In [None]:
# Random search with a fixed seed so the ten sampled configurations are
# reproducible. The last expression displays the best parameters found.
random_sampler = optuna.samplers.RandomSampler(seed=0)
study = optuna.create_study(sampler=random_sampler)
study.optimize(objective, n_trials=10)
study.best_params

# Cross Validation

In [None]:
from sklearn.model_selection import KFold

# 5-fold cross-validation of the multiclass LightGBM model.
# Per fold: train on four parts, early-stop on the fifth, store out-of-fold
# class probabilities and the test-set predictions.

num_class = 8

y_preds = []   # one (n_test, num_class) probability array per fold
models = []    # the fitted booster of each fold
# Multiclass predict returns one probability per class, so the out-of-fold
# buffer needs a column per class (a 1-D buffer cannot receive the result).
oof_train = np.zeros((len(X_train), num_class))
cv = KFold(n_splits=5, shuffle=True, random_state=0)

categorical_features = ['Inning', 'Pitcher ID',
       'batter ID','The amount of balls', 'the amount of strike',
       'The runner situation', 'First runner',
       'catcher id', 'Batter ID']

params = {
    # 'binary' is rejected by LightGBM when num_class > 1; the target has
    # 8 classes, so the objective must be 'multiclass'.
    'objective': 'multiclass',
    'max_bin': 300,
    'learning_rate': 0.05,
    'num_leaves': 40,
    'num_class': num_class
}

for fold_id, (train_index, valid_index) in enumerate(cv.split(X_train)):
    # KFold yields positional indices. X_train / y_train keep the shuffled
    # index labels left by train_test_split, so rows must be selected with
    # .iloc; label-based selection would pick wrong rows or raise KeyError.
    X_tr = X_train.iloc[train_index]
    X_val = X_train.iloc[valid_index]
    y_tr = y_train.iloc[train_index]
    y_val = y_train.iloc[valid_index]

    lgb_train = lgb.Dataset(X_tr, y_tr, categorical_feature=categorical_features)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train,
                           categorical_feature=categorical_features)

    model = lgb.train(params, lgb_train,
                      valid_sets=[lgb_train, lgb_eval],
                      verbose_eval=10,
                      num_boost_round=1000,
                      early_stopping_rounds=10)

    # oof_train is a plain NumPy array, so positional indices are correct here.
    oof_train[valid_index] = model.predict(X_val, num_iteration=model.best_iteration)
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)

    y_preds.append(y_pred)
    models.append(model)