In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve, KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import balanced_accuracy_score

# classification models
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb

# evaluation metrics
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.feature_selection import mutual_info_classif, f_classif

In [3]:
# Read the two input CSVs (paths are relative to the working directory).
# Both tables are expected to carry a `map_id` column: a later cell merges on it.
data = pd.read_csv('players_feats.csv')
train = pd.read_csv('train.csv')

In [4]:
# True if at least one cell of `train` is missing, False otherwise.
train.isnull().values.any()

False

In [5]:
# Join the training rows with the feature rows that share a map_id
# (pandas' default inner join), then summarise the combined frame.
result = train.merge(data, on=['map_id'])
result.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1426 entries, 0 to 1425
Columns: 132 entries, map_id to map_name_y
dtypes: float64(80), int64(50), object(2)
memory usage: 1.4+ MB


In [6]:
# Rows at even positions go to team1_df, rows at odd positions to team2_df.
# NOTE(review): this presumes every map_id contributes two consecutive rows in
# a fixed team order after the merge — confirm against the input files.
team1_df = result.iloc[::2]
team2_df = result.iloc[1::2]

In [7]:
# Bring the two row sets together on map_id; columns present in both frames
# get pandas' default suffixes (`_x` from team1_df, `_y` from team2_df).
final_df = team1_df.merge(team2_df, on=['map_id'])

In [8]:
# Summarise the merged frame (row count, column count, dtypes).
final_df.info()
# NOTE: `.any()` collapses the null mask to a single boolean, so the trailing
# `.sum()` yields 1 (something is missing) or 0 (nothing is missing) — it is a
# flag, not a count of missing values.
final_df.isnull().values.any().sum()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 713 entries, 0 to 712
Columns: 263 entries, map_id to map_name_y_y
dtypes: float64(160), int64(99), object(4)
memory usage: 1.4+ MB


1

In [9]:
# Discard every row that contains a missing value, then re-check.
final_df = final_df.dropna()
# 0 here means nothing is missing any more (the expression is a 0/1 flag, not a count).
final_df.isnull().values.any().sum()

0

In [None]:
# One-hot encode the `map_name_x_x` column; the first category is dropped so
# the indicator columns are not redundant with each other.
final_df = pd.get_dummies(
    final_df,
    columns=["map_name_x_x"],
    prefix_sep="_",
    drop_first=True,
)
final_df.head()



In [None]:
# Target: the `who_win_x` column, kept as a one-column DataFrame.
y = final_df['who_win_x'].to_frame()
y.shape

In [None]:
# Identifier, outcome and leftover map-name columns are removed so that only
# model inputs remain in the frame.
non_feature_cols = [
    'map_id',
    'team1_id_x', 'team1_id_y',
    'team2_id_x', 'team2_id_y',
    'who_win_x', 'who_win_y',
    'map_name_x_y', 'map_name_y_x', 'map_name_y_y',
]
final_df = final_df.drop(columns=non_feature_cols)

In [None]:
# Preview the first rows only instead of rendering the whole frame.
final_df.head()

In [None]:
# Feature matrix: wrap the cleaned frame in a DataFrame object named X.
X = pd.DataFrame(final_df)
X.info()
# Sanity check: True if any feature value is missing.
X.isnull().values.any()

In [None]:
# Preview the first rows only instead of rendering the whole feature matrix.
X.head()

In [None]:
# Standardise every feature column.
# NOTE(review): X_scaled is not consumed by any later visible cell, and the
# selector below is fitted on the unscaled X — confirm which matrix was intended.
# NOTE(review): both transformers are fitted on all rows, i.e. before the
# train/test split performed further down — consider fitting on the training
# part only.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Score the features with the ANOVA F-test and keep the 64 best. The selector
# is fitted exactly once and then only applied; calling fit_transform on the
# already-fitted selector would just repeat the identical fit.
selection = SelectKBest(score_func=f_classif, k=64).fit(X, y.values.ravel())
X_new = selection.transform(X)

In [None]:
# Wrap the selected features in a DataFrame. When X_new is a plain array the
# columns receive default integer labels, so the original feature names are
# not carried over.
X_new = pd.DataFrame(X_new)

In [None]:
def optimize_hyperparams(model, parameters, x_train, y_train, nfolds=10, n_jobs=4):
    """Grid-search ``parameters`` for ``model`` and return the refitted best estimator.

    Parameters
    ----------
    model : estimator handed to ``GridSearchCV``.
    parameters : dict (or list of dicts) describing the grid to explore.
    x_train, y_train : training features and labels passed to ``grid.fit``.
    nfolds : int, default 10
        Number of stratified folds used for cross-validation.
    n_jobs : int, default 4
        Number of parallel workers handed to ``GridSearchCV``.

    Returns
    -------
    The ``best_estimator_`` of the search, refitted on the full training data
    (``refit=True``).

    Notes
    -----
    The best cross-validated score is printed under the label "Accuracy"; no
    ``scoring`` is passed, so the value is whatever the estimator's default
    scorer reports.
    """
    cross_val = StratifiedKFold(nfolds)
    grid = GridSearchCV(model, parameters, cv=cross_val, refit=True, verbose=1, n_jobs=n_jobs)
    grid.fit(x_train, y_train)
    print(f'Accuracy : {grid.best_score_} with params {grid.best_estimator_}')
    return grid.best_estimator_

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# Note that the split is taken from the full feature frame X, not from X_scaled or X_new.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1337)

In [None]:
# Grid-search a random forest on the training split.
# random_state pins the forest's bootstrap and feature sub-sampling so that
# re-running this cell yields the same search result; 1337 matches the seed
# already used for the train/test split.
rf_clf = RandomForestClassifier(criterion='gini', random_state=1337)
print(X_train.shape, y_train.shape)
rf_params = {
    'n_estimators': [5, 15, 25],
    'max_depth': [5, 15, 25],
    'max_features': [16, 32, 64],
    'max_leaf_nodes': [5, 10],
    'bootstrap': [True]
}
# GridSearchCV expects a 1-D label vector, hence the ravel().
best_rf = optimize_hyperparams(rf_clf, rf_params, X_train, y_train.values.ravel())

In [None]:
# Hyper-parameter search library used by the tuning cells below.
import optuna

In [None]:
def objective(trial):
    """Optuna objective for tuning an XGBoost classifier.

    Draws one set of hyper-parameters from ``trial``, builds an
    ``xgb.XGBClassifier`` with them and returns the mean of its 5-fold
    ``cross_val_score`` on the module-level ``X_train`` / ``y_train``.
    """
    features = X_train
    target = y_train.values.ravel()

    # The dict is filled top to bottom, so the suggestions are requested in a
    # fixed order: depth, leaves, estimators, learning rate, gamma.
    sampled = {
        'max_depth': trial.suggest_int('xgb_max_depth', 2, 64, log=True),
        'max_leaves': trial.suggest_int('xgb_max_leaves', 5, 20),
        'n_estimators': trial.suggest_int('xgb_n_estimators', 100, 200),
        'learning_rate': trial.suggest_float('xgb_learning_rate', 0.001, 0.5),
        'gamma': trial.suggest_float('xgb_gamma', 1, 9),
    }

    candidate = xgb.XGBClassifier(**sampled)
    fold_scores = cross_val_score(candidate, features, target, cv=5)
    return fold_scores.mean()

In [None]:
# Seed the sampler so the sequence of suggested hyper-parameters — and with it
# the reported best trial — is repeatable across runs. TPESampler is Optuna's
# default sampler for single-objective studies; 1337 matches the seed used
# elsewhere in this notebook.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=1337),
)
study.optimize(objective, n_trials=100)

# Report the best trial found by the search.
trial = study.best_trial
print(f'best score = {trial.value}')
print('Best params: ')
for key, value in trial.params.items():
    print(f'{key} {value}')

In [None]:
# Plot how much each tuned hyper-parameter contributed to the objective value across the study's trials.
optuna.visualization.plot_param_importances(study)

In [None]:
# Plot the objective value of every trial in the order the study ran them.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Preview the first training labels only instead of rendering the whole frame.
y_train.head()

In [None]:
def BuildModel(best_alg, x_train, y_train, x_test, kf, ntrain, ntest, nclass, nfolds):
    """Collect out-of-fold and fold-averaged test probabilities for one model.

    For every split produced by ``kf`` the estimator is refitted on the
    training part of the fold, its ``score`` on the held-out part is printed,
    and its ``predict_proba`` output is stored for the held-out rows. The same
    fitted estimator also predicts ``x_test``; those predictions are averaged
    over the folds.

    Parameters
    ----------
    best_alg : estimator exposing ``fit``, ``score`` and ``predict_proba``.
        The very same object is refitted in each fold, so on return it holds
        the fit of the last fold.
    x_train, y_train : objects supporting positional ``.iloc`` row selection.
    x_test : rows predicted with each fold's model.
    kf : splitter whose ``split`` yields ``(train_idx, holdout_idx)`` pairs.
    ntrain, ntest : number of rows in ``x_train`` / ``x_test``.
    nclass : number of columns returned by ``predict_proba``.
    nfolds : retained for backward compatibility. The test-set average is
        taken over the number of splits ``kf`` actually yields, so a
        mismatching value can no longer skew the averaged probabilities.

    Returns
    -------
    (Xr_train, Xr_test) : two arrays of shape ``(ntrain, nclass)`` and
        ``(ntest, nclass)``.
    """
    Xr_train = np.zeros((ntrain, nclass))
    Xr_test = np.zeros((ntest, nclass))
    tr_ind = np.arange(ntrain)
    folds_seen = 0
    for i, (ttrain, ttest) in enumerate(kf.split(tr_ind)):
        clf = best_alg
        clf.fit(x_train.iloc[ttrain], y_train.iloc[ttrain])
        sc = clf.score(x_train.iloc[ttest], y_train.iloc[ttest])
        print(f'{i} accuracy {sc:.4f}')
        Xr_train[ttest] = clf.predict_proba(x_train.iloc[ttest])
        # Accumulate raw predictions; they are normalised once after the loop.
        Xr_test += clf.predict_proba(x_test)
        folds_seen += 1

    # Divide by the real fold count rather than trusting the nfolds argument.
    if folds_seen:
        Xr_test /= folds_seen

    return Xr_train, Xr_test


def train_and_test_model(model, x_train, y_train, x_test, y_test, kf, ntrain, ntest, nclass, nfolds, labels):
    """Cross-validate ``model``, pick an F1-optimal cut-off and report results.

    ``BuildModel`` supplies out-of-fold probabilities for the training rows
    and fold-averaged probabilities for the test rows. The probability in
    column 1 is thresholded at 100 evenly spaced cut-offs between 0.01 and
    0.9; the F1 score on the training labels is plotted against the cut-off,
    and the cut-off with the highest F1 is used to print reports for the
    training and the test predictions via ``show_accuracy``.
    """
    oof_proba, test_proba = BuildModel(
        model, x_train, y_train, x_test, kf, ntrain, ntest, nclass, nfolds
    )
    train_pos = oof_proba[:, 1]
    test_pos = test_proba[:, 1]

    thresholds = np.linspace(0.01, 0.9, 100)
    f1_by_threshold = np.array(
        [f1_score(y_train, train_pos > cutoff) for cutoff in thresholds]
    )

    plt.figure(figsize=(12, 8))
    plt.plot(thresholds, f1_by_threshold, linewidth=4)
    plt.ylabel("F1 score", fontsize=18)
    plt.xlabel("Threshold", fontsize=18)

    # argmax returns the first maximum, i.e. the lowest of any tied cut-offs.
    best_threshold = thresholds[f1_by_threshold.argmax()]
    for proba, truth in ((train_pos, y_train), (test_pos, y_test)):
        show_accuracy(proba, truth, labels, best_threshold, nclass)
    

def show_accuracy(Xr, y, labels, best, nclass):
    """Threshold scores at ``best`` and print the resulting evaluation.

    Every entry of ``Xr`` strictly greater than ``best`` becomes class 1, all
    others class 0. The predictions, a classification report (4 digits, using
    ``labels`` as class names) and the confusion matrix over classes
    ``0 .. nclass - 1`` are printed. Nothing is returned.
    """
    pred = [1 if score > best else 0 for score in Xr]
    print(f'pred = {pred}')

    report = classification_report(y, pred, target_names=labels, digits=4, zero_division=True)
    print(report)

    matrix = confusion_matrix(y, pred, labels=range(nclass))
    print(matrix)


In [None]:
# Shared settings for the out-of-fold evaluation cells below.
NFOLDS = 10
nclass = 2
labels = ['Team1_win', 'team2_win']

# Row counts of the two splits, needed to size the prediction arrays.
ntrain, ntest = X_train.shape[0], X_test.shape[0]

# Shuffled K-fold splitter with a fixed seed so the folds are repeatable.
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=1337)

In [None]:
# Histogram of the held-out labels, to eyeball how the classes are distributed.
y_test.hist()

In [None]:
# Evaluate an XGBoost classifier with fixed hyper-parameters
# (presumably taken from the Optuna study above — confirm).
xgb_clf = xgb.XGBClassifier(
    n_estimators=158,
    max_depth=9,
    max_leaves=14,
    learning_rate=0.03,
    gamma=7.3,
)
train_and_test_model(
    xgb_clf, X_train, y_train, X_test, y_test,
    kf, ntrain, ntest, nclass, NFOLDS, labels,
)