In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('../input/train_pitch.csv')
test = pd.read_csv('../input/test_pitch.csv')

In [3]:
data = pd.concat([train, test], sort = False)

In [4]:
data = data.drop(['試合ID','日付', '年度','ホームチームID', 'アウェイチームID', '球場ID', '球場名', '試合種別詳細', '表裏', 'イニング内打席数', '投手チームID', '投手投球左右',
                  '投手試合内投球数', '投手イニング内投球数','打者チームID', '打者打順', '打者守備位置', '打者試合内打席数',
                  '一塁手ID','二塁手ID','三塁手ID', '遊撃手ID','左翼手ID','中堅手ID' ,'右翼手ID', '時刻', 
                  '一塁走者ID', '二塁走者ID', '三塁走者ID', 'プレイ前アウェイチーム得点数', 'プレイ前アウト数', '成績対象投手ID',
                  '成績対象打者ID','投手役割', '投手登板順', '投手試合内対戦打者数'], axis = 1   )

In [5]:
data['プレイ前走者状況'].replace(['___', '_2_', '1__', '12_', '_23', '1_3', '__3', '123'], ['0', '1', '2', '3', '4', '5', '6', '7'], inplace = True)

In [6]:
data['プレイ前走者状況'] = data['プレイ前走者状況'].astype(int)

In [7]:
data['打者打席左右'] = pd.get_dummies(train['打者打席左右'])

In [8]:
data = data.rename(columns = {'データ内連番': 'ID', '球種':'Ball type', '投球位置区域':'Pitch area', '試合内連番': 'Pitch continuously', '試合内投球数':'the amount of pitch', 'イニング':'Inning', '打席内投球数':'The amount of pitch in a batter', '投手ID':'Pitcher ID',
       '打者ID':'batter ID', '打者打席左右':'Left or Right', 'プレイ前ホームチーム得点数':'The sum of points', 'プレイ前ボール数':'The amount of balls', 'プレイ前ストライク数':'the amount of strike', 'プレイ前走者状況':'The runner situation',
       '捕手ID':'catcher id'})

In [9]:
data.head()

Unnamed: 0,ID,Ball type,Pitch area,Pitch continuously,the amount of pitch,Inning,The amount of pitch in a batter,Pitcher ID,batter ID,Left or Right,The sum of points,The amount of balls,the amount of strike,The runner situation,catcher id
0,0,0.0,0.0,1,1,1,1,1500001,900410,0.0,0,0,0,0,1300027
1,1,0.0,8.0,2,2,1,2,1500001,900410,0.0,0,0,1,0,1300027
2,2,0.0,5.0,3,3,1,3,1500001,900410,0.0,0,0,2,0,1300027
3,3,0.0,12.0,4,4,1,1,1500001,11436,1.0,0,0,0,0,1300027
4,4,0.0,8.0,5,5,1,2,1500001,11436,1.0,0,0,1,0,1300027


In [10]:
train = data[:len(train)]
test = data[len(train):]

In [11]:
y_train = train['Ball type']
X_train = train.drop('Ball type', axis = 1)
X_test = test.drop('Ball type', axis = 1)

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = \
    train_test_split(X_train, y_train, test_size = 0.3, 
                    random_state = 0, stratify = y_train)

In [13]:
categorical_features = ['Pitch area', 'Pitch continuously',
       'the amount of pitch', 'Inning', 'The amount of pitch in a batter',
       'Pitcher ID', 'batter ID', 'Left or Right', 'The sum of points',
       'The amount of balls', 'the amount of strike', 'The runner situation',
       'catcher id']

In [14]:
import lightgbm as lgb

lgb_train = lgb.Dataset(X_train, y_train, 
                       categorical_feature = categorical_features)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference = lgb_train, 
                      categorical_feature=categorical_features)

In [15]:
params = {
    'objective': 'multiclass', 
    'metric': {'multi_logloss'},
    'num_class': 8
}

In [16]:
model = lgb.train(
    params, lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    num_boost_round = 1000,
    verbose_eval=10,
    early_stopping_rounds = 10)

Training until validation scores don't improve for 10 rounds
[10]	training's multi_logloss: 1.25476	valid_1's multi_logloss: 1.29674
[20]	training's multi_logloss: 1.11999	valid_1's multi_logloss: 1.19808
[30]	training's multi_logloss: 1.03987	valid_1's multi_logloss: 1.14956
[40]	training's multi_logloss: 0.98423	valid_1's multi_logloss: 1.12371
[50]	training's multi_logloss: 0.942421	valid_1's multi_logloss: 1.11012
[60]	training's multi_logloss: 0.907885	valid_1's multi_logloss: 1.10173
[70]	training's multi_logloss: 0.878128	valid_1's multi_logloss: 1.09706
[80]	training's multi_logloss: 0.851181	valid_1's multi_logloss: 1.09453
[90]	training's multi_logloss: 0.827199	valid_1's multi_logloss: 1.09373
[100]	training's multi_logloss: 0.804441	valid_1's multi_logloss: 1.09335
Early stopping, best iteration is:
[99]	training's multi_logloss: 0.806588	valid_1's multi_logloss: 1.0933


In [17]:
y_pred = model.predict(X_test, num_iteration = model.best_iteration)

In [18]:
sub = pd.read_csv('../input/sample_submit_ball_type.csv')
sub = sub.join(pd.DataFrame(y_pred))
sub.columns = ['ID', 'class1','class2','class3','class4','class5','class6','class7','class8', 'class9', 'class10', 'class11', 'class12', 'class13', 'class14','class15', 'class16']
sub.to_csv('submission.csv', index=False) 