In [5]:
import pandas as pd

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

warnings.filterwarnings('ignore')

In [6]:
csv_data = pd.read_csv('training_data_all.csv')
csv_data.head(5)

Unnamed: 0.1,Unnamed: 0,Win_team,Team_1_pitcher_WHIP,Team_1_pitcher_piAn,Team_1_pitcher_piHom,Team_1_pitcher_sam,Team_1_pitcher_ball,Team_1_pitcher_ja,Team_2_pitcher_WHIP,Team_2_pitcher_piAn,...,Team_1_hitter_chul,Team_1_hitter_do,Team_2_hitter_hit,Team_2_hitter_homrun,Team_2_hitter_sam,Team_2_hitter_ball,Team_2_hitter_deg,Team_2_hitter_ta,Team_2_hitter_chul,Team_2_hitter_do
0,0,1,1.17,0.234273,0.02603,0.607143,0.205357,4.178571,1.4,0.223785,...,0.261251,0.425926,0.297333,0.038541,0.184575,0.019147,0.153753,0.168974,0.308273,0.6822
1,1,0,1.23,0.245774,0.014304,0.82658,0.205294,3.889789,1.32,0.236887,...,0.288868,0.431263,0.283333,0.028122,0.208332,0.009366,0.140507,0.148764,0.288695,0.316402
2,2,0,1.58,0.269179,0.024226,0.6,0.363636,4.690909,1.31,0.250307,...,0.244749,0.485862,0.258444,0.03821,0.203828,0.022838,0.147275,0.125598,0.271322,0.338624
3,3,0,1.32,0.258782,0.02459,0.740557,0.223658,3.489066,1.69,0.281099,...,0.294911,0.348744,0.283111,0.028662,0.17391,0.014732,0.143559,0.140787,0.290519,0.725853
4,4,1,1.3,0.249373,0.02381,0.833779,0.235168,3.800107,1.29,0.23431,...,0.305814,0.472489,0.287889,0.034753,0.208439,0.031419,0.147032,0.157054,0.305711,0.650661


In [7]:
drop_column = list(csv_data.columns)[0]
csv_data.drop(drop_column, axis=1, inplace=True)

# inf 를 NaN 값으로 바꾼 후 NaN값을 평균값으로 치환하는 방식.
# 바로 평균값 mean을 사용하는 경우 inf를 포함하여 평균값을 계산하기 때문에
csv_data['Team_1_hitter_deg'].replace(np.inf, np.nan, inplace=True)
csv_data['Team_2_hitter_deg'].replace(np.inf, np.nan, inplace=True)

csv_data['Team_1_hitter_deg'].replace(np.nan, csv_data['Team_1_hitter_deg'].mean(), inplace=True)
csv_data['Team_2_hitter_deg'].replace(np.nan, csv_data['Team_2_hitter_deg'].mean(), inplace=True)

train_data = csv_data.drop('Win_team', axis=1)
target_data = csv_data['Win_team']

In [19]:
# 개별 모델은 로지스틱 회귀와 KNN 임. 
lr_clf = LogisticRegression()
knn_clf = KNeighborsClassifier(n_neighbors=8)

# 개별 모델을 소프트 보팅 기반의 앙상블 모델로 구현한 분류기 
vo_clf = VotingClassifier( estimators=[('LR',lr_clf),('KNN',knn_clf)] , voting='soft' )

X_train, X_test, y_train, y_test = train_test_split(train_data, target_data, 
                                                    test_size=0.3)

# VotingClassifier 학습/예측/평가. 
vo_clf.fit(X_train , y_train)
pred = vo_clf.predict(X_test)
print('Voting 분류기 정확도: {0:.4f}'.format(accuracy_score(y_test , pred)))

Voting 분류기 정확도: 0.5813


In [15]:
# 개별 모델의 학습/예측/평가.
classifiers = [lr_clf, knn_clf]
for classifier in classifiers:
    classifier.fit(X_train , y_train)
    pred = classifier.predict(X_test)
    class_name= classifier.__class__.__name__
    print('{0} 정확도: {1:.4f}'.format(class_name, accuracy_score(y_test , pred)))

LogisticRegression 정확도: 0.5917
KNeighborsClassifier 정확도: 0.5767
