In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the Kaggle input mount and echo the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
# Install a blanket 'ignore' filter: no warning of any category is shown from here on.
# NOTE(review): this also hides any convergence or failed-fit warnings that later
# modelling cells may emit -- consider narrowing the filter while debugging.
warnings.filterwarnings('ignore')

In [None]:
# Widen pandas' display limits (columns, console width, rows) to 500 each.
for option_name in ('display.max_columns', 'display.width', 'display.max_rows'):
    pd.set_option(option_name, 500)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from datetime import datetime, timedelta

from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import RandomizedSearchCV
# from sklearn.feature_selection import RFECV

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import classification_report
from sklearn.metrics import log_loss

In [None]:
# Read the competition's training CSV into train_df and display it.
train_df = pd.read_csv('/kaggle/input/the-soccer-fever/train.csv')
train_df

In [None]:
# Read the competition's test CSV into test_df and display it.
test_df = pd.read_csv('/kaggle/input/the-soccer-fever/test.csv')
test_df

In [None]:
def details(df):
    """Summarise every column of ``df`` in a single table.

    The returned DataFrame is indexed by the column names of ``df`` and has
    four columns:

    * 'Missing Values'            -- number of null entries,
    * '% of Total Missing Values' -- null entries as a percentage of the row count,
    * 'Data_Type'                 -- the column's dtype,
    * 'Unique values'             -- result of ``df.nunique()`` for the column.
    """
    null_counts = df.isnull().sum()
    per_column_stats = [
        null_counts,
        100 * (null_counts / len(df)),
        df.dtypes,
        df.nunique(),
    ]
    headers = ['Missing Values', '% of Total Missing Values', 'Data_Type', 'Unique values']
    # `keys` labels the concatenated Series directly, so no rename step is needed.
    return pd.concat(per_column_stats, axis=1, keys=headers)

In [None]:
# Per-column null counts, null percentages, dtypes and unique counts for the training frame.
details(train_df)

In [None]:
# Per-column null counts, null percentages, dtypes and unique counts for the test frame.
details(test_df)

In [None]:
# Fraction of missing entries per training column (null count / row count).
df_percent_null = train_df.isnull().sum() / len(train_df)
# Columns that are null in more than half of the rows are removed from train_df in place.
df_column_del = df_percent_null[df_percent_null.gt(0.50)].index
train_df.drop(columns=df_column_del, inplace=True)
train_df.shape

In [None]:
# Fraction of missing entries per test column (null count / row count).
df_percent_null = test_df.isnull().sum() / len(test_df)
# Columns that are null in more than half of the rows are removed from test_df in place.
# NOTE(review): the threshold is evaluated on test_df independently of train_df, so the
# two frames are not guaranteed to lose the same columns.
df_column_del = df_percent_null[df_percent_null.gt(0.50)].index
test_df.drop(columns=df_column_del, inplace=True)
test_df.shape

In [None]:
# Pie chart of how often each Outcome value occurs in the training frame.
outcome_counts = train_df['Outcome'].value_counts()
outcome_counts.plot.pie(figsize=(6, 6), autopct='%1.2f%%')
plt.show()

In [None]:
# Bar chart: number of training rows per season.
train_df['season'].value_counts().plot(kind='bar')
plt.show()

In [None]:
# Bar chart: number of test rows per season.
test_df['season'].value_counts().plot(kind='bar')
plt.show()

In [None]:
# Convert the "date" column of both frames to pandas datetimes, then re-inspect train_df.
for frame in (train_df, test_df):
    frame["date"] = pd.to_datetime(frame["date"])
details(train_df)

In [None]:
# Column summary of the test frame after the "date" conversion.
details(test_df)

In [None]:
# Names of the object-dtype columns in train_df, followed by a view of just those columns.
cols_obj_train = train_df.select_dtypes(include='object').columns
train_df.loc[:, cols_obj_train]

In [None]:
# Names of the numeric columns in train_df, followed by a view of just those columns.
cols_num_train = train_df.select_dtypes(include='number').columns
train_df.loc[:, cols_num_train]

In [None]:
# Display the training frame in its current state.
train_df

In [None]:
# Row counts per season, split by Outcome value.
sns.countplot(data=train_df, x="season", hue='Outcome')
plt.show()

In [None]:
# Bar chart: number of training rows per league_id (large canvas for the many bars).
league_id_counts = train_df['league_id'].value_counts()
league_id_counts.plot(kind='bar', figsize=(30, 15))
plt.show()

In [None]:
# Bar chart: number of training rows per league (large canvas for the many bars).
league_counts = train_df['league'].value_counts()
league_counts.plot(kind='bar', figsize=(30, 15))
plt.show()

In [None]:
# One 20-bin histogram per listed column, each shown as its own figure.
cols = ['SPI1', 'SPI2', 'proj_score1', 'proj_score2']
for column_name in cols:
    train_df[column_name].plot(kind='hist', bins=20, alpha=0.8)
    plt.show()

In [None]:
# Separate the target from the predictors. `columns=` is used because passing the axis
# positionally (`drop(labels, 1)`) is rejected by pandas >= 2.0, where every argument of
# DataFrame.drop except `labels` is keyword-only.
X = train_df.drop(columns=['Outcome'])
y = train_df['Outcome']
# Independent copy, so later edits to X_test never touch test_df.
X_test = test_df.copy()
# 70 % train / 30 % validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.7, random_state=23)

In [None]:
# Model features for the training split: every non-object column except the
# season / date / league_id columns, which are excluded by name.
cols_num_X_train = [
    column
    for column, dtype in X_train.dtypes.items()
    if dtype != 'object' and column not in ('season', 'date', 'league_id')
]
X_train[cols_num_X_train]

In [None]:
# Model features for the validation split: every non-object column except the
# season / date / league_id columns, which are excluded by name.
cols_num_X_val = [
    column
    for column, dtype in X_val.dtypes.items()
    if dtype != 'object' and column not in ('season', 'date', 'league_id')
]
X_val[cols_num_X_val]

In [None]:
# Model features for the test frame: every non-object column except the
# season / date / league_id columns, which are excluded by name.
cols_num_X_test = [
    column
    for column, dtype in X_test.dtypes.items()
    if dtype != 'object' and column not in ('season', 'date', 'league_id')
]
X_test[cols_num_X_test]

In [None]:
# Annotated correlation heatmap of the selected training features.
feature_corr = X_train[cols_num_X_train].corr()
plt.figure(figsize=(15, 10))
sns.heatmap(feature_corr, annot=True, cmap="GnBu", fmt='.2f')
plt.show()

In [None]:
# Power-transform the numeric features. The transformer is fitted on the training split
# only, and the arrays it RETURNS are written back into the frames. Previously the return
# values were discarded (and copy=False only acted on a temporary column slice), so every
# model downstream was trained and scored on the raw, untransformed columns.
# NOTE: this cell mutates the frames in place -- run it once per kernel session.
pt = PowerTransformer()
X_train[cols_num_X_train] = pt.fit_transform(X_train[cols_num_X_train])
X_val[cols_num_X_val] = pt.transform(X_val[cols_num_X_val])
X_test[cols_num_X_test] = pt.transform(X_test[cols_num_X_test])
# The final submission model is refitted on the full frame X and scored on X_test, so X
# must be moved into the same transformed space as X_test.
X[cols_num_X_train] = pt.transform(X[cols_num_X_train])

In [None]:
# Result accumulators appended to by model_fit / model_fit_evaluation.
# Each call adds exactly one entry to every list, so position i across all
# lists describes the same run.
model_list = []
# ROC-AUC
AUCROC_train, AUCROC_val = [], []
# Precision
Precision_train, Precision_val = [], []
# Recall
Recall_train, Recall_val = [], []
# Accuracy
Acc_train, Acc_val = [], []
# Log-loss
LogLoss_train, LogLoss_val = [], []

In [None]:
# --- Logistic regression -------------------------------------------------------------
model_LR = LogisticRegression()
# The search space contains penalty='l1', which LogisticRegression's default 'lbfgs'
# solver does not support; those candidates previously failed to fit and only produced
# NaN scores. Pinning 'liblinear' inside the grid makes both penalties valid while
# leaving the untuned baseline estimator above unchanged.
# NOTE(review): liblinear suits binary targets, which the binary precision/recall and
# `[:, 1]` probability handling elsewhere in this notebook already assume.
params_LR = {
    'C': np.logspace(-1, 5, 10),
    'class_weight': [None, 'balanced'],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
}

# --- Decision tree -------------------------------------------------------------------
model_DT = DecisionTreeClassifier(random_state=23)
params_DT = {
    'max_depth': [10, 20, 50, 100, 200],
    'min_samples_leaf': [10, 20, 50, 100, 200],
    'min_samples_split' : [10, 20, 50, 100, 200],
    'criterion': ["gini", "entropy"]
}

# --- Random forest (out-of-bag scoring enabled) --------------------------------------
model_RF = RandomForestClassifier(oob_score = True, random_state=23)
params_RF = {
    'n_estimators': [10, 20, 50, 100, 200],
    'max_depth': [10, 20, 50, 100, 200],
    'min_samples_leaf': [10, 20, 50, 100, 200],
    'min_samples_split' : [10, 20, 50, 100, 200],
    'criterion': ["gini", "entropy"]
}

In [None]:
def model_fit(model, X_train, y_train, X_val, y_val, algo=None):
    """Fit ``model`` with its current settings and report/record its metrics.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict`` and ``predict_proba``
        Fitted in place on ``X_train`` / ``y_train``. (The previous body ignored this
        argument and always used the global ``model_LR``, so every "baseline" run was
        really logistic regression.)
    X_train, y_train : training features and labels.
    X_val, y_val : validation features and labels.
    algo : str, optional
        Label appended to ``model_list`` for this run.

    Side effects
    ------------
    Prints confusion matrices, classification reports, ROC-AUC, ROC-curve displays and
    log-loss for both splits, and appends one entry to each module-level result list
    (``model_list``, ``AUCROC_*``, ``Precision_*``, ``Recall_*``, ``Acc_*``,
    ``LogLoss_*``). Returns ``None``.

    Notes
    -----
    ROC-AUC and log-loss use the second probability column (``[:, 1]``), so a binary
    target is assumed.
    NOTE(review): ``plot_roc_curve`` was removed in scikit-learn 1.2; on newer versions
    ``RocCurveDisplay.from_estimator`` is the replacement -- confirm the installed version.
    """

    model.fit(X_train, y_train)

    y_train_prob = model.predict_proba(X_train)
    y_train_pred = model.predict(X_train)
    y_val_prob = model.predict_proba(X_val)
    y_val_pred = model.predict(X_val)

    # All metrics are computed before anything is printed.
    matrix_train = confusion_matrix(y_train, y_train_pred)
    matrix_val = confusion_matrix(y_val, y_val_pred)
    report_train = classification_report(y_train, y_train_pred)
    report_val = classification_report(y_val, y_val_pred)
    auc_train = roc_auc_score(y_train, y_train_prob[:,1])
    auc_val = roc_auc_score(y_val, y_val_prob[:,1])
    logloss_train = log_loss(y_train, y_train_prob[:,1])
    logloss_val = log_loss(y_val, y_val_prob[:,1])

    print('Confusion Matrix for train')
    print('='*60)
    print(matrix_train,"\n")
    print('Confusion Matrix for val')
    print('='*60)
    print(matrix_val,"\n")
    print('Classification Report for train')
    print('='*60)
    print(report_train,"\n")
    print('Classification Report for val')
    print('='*60)
    print(report_val,"\n")
    print('AUC-ROC for train')
    print('='*60)
    print(auc_train,'\n')
    print('AUC-ROC for val')
    print('='*60)
    print(auc_val,'\n')
    print('Roc-Auc-Curve for Train set')
    print('='*60)
    print(plot_roc_curve(model, X_train, y_train),'\n')
    print('Roc-Auc-Curve for Val set')
    print('='*60)
    print(plot_roc_curve(model, X_val, y_val),'\n')
    print('Log-Loss for Train set')
    print('='*60)
    print(logloss_train,'\n')
    print('Log-Loss for Val set')
    print('='*60)
    print(logloss_val,'\n')

    # Record this run in the shared comparison lists (one entry per list).
    model_list.append(algo)
    AUCROC_train.append(auc_train)
    AUCROC_val.append(auc_val)
    Precision_train.append(precision_score(y_train, y_train_pred))
    Precision_val.append(precision_score(y_val, y_val_pred))
    Recall_train.append(recall_score(y_train, y_train_pred))
    Recall_val.append(recall_score(y_val, y_val_pred))
    Acc_train.append(accuracy_score(y_train, y_train_pred))
    Acc_val.append(accuracy_score(y_val, y_val_pred))
    LogLoss_train.append(logloss_train)
    LogLoss_val.append(logloss_val)

In [None]:
def model_fit_evaluation(model, params, X_train, y_train, X_val, y_val, algo=None):
    """Tune ``model`` with a randomized search, then report/record the best estimator.

    A ``RandomizedSearchCV`` (5-fold, scored by negative log-loss, seed 23, all cores)
    is fitted on the training data over ``params``. Its ``best_estimator_`` is then
    evaluated on the training and validation data: confusion matrices, classification
    reports, ROC-AUC, ROC-curve displays and log-loss are printed, and one entry is
    appended to each module-level result list (``model_list``, ``AUCROC_*``,
    ``Precision_*``, ``Recall_*``, ``Acc_*``, ``LogLoss_*``). ``algo`` is the label
    stored in ``model_list``. Returns ``None``.

    ROC-AUC and log-loss use the second probability column (``[:, 1]``).
    """
    search = RandomizedSearchCV(
        model,
        params,
        cv=5,
        scoring='neg_log_loss',
        n_jobs=-1,
        verbose=1,
        random_state=23,
    )
    search.fit(X_train, y_train)
    tuned = search.best_estimator_

    print('\n')
    print('best estimator : ', tuned)
    print('best parameters: ', search.best_params_)
    print('best score: ', search.best_score_)
    print('\n')

    rule = '=' * 60

    def announce(title, value):
        # Heading, separator line, then the value followed by a blank line.
        print(title)
        print(rule)
        print(value, '\n')

    proba_train = tuned.predict_proba(X_train)
    labels_train = tuned.predict(X_train)
    proba_val = tuned.predict_proba(X_val)
    labels_val = tuned.predict(X_val)

    # Every metric is computed up front, before any section is printed.
    cm_train = confusion_matrix(y_train, labels_train)
    cm_val = confusion_matrix(y_val, labels_val)
    rep_train = classification_report(y_train, labels_train)
    rep_val = classification_report(y_val, labels_val)
    auc_train = roc_auc_score(y_train, proba_train[:, 1])
    auc_val = roc_auc_score(y_val, proba_val[:, 1])
    logloss_train = log_loss(y_train, proba_train[:, 1])
    logloss_val = log_loss(y_val, proba_val[:, 1])

    announce('Confusion Matrix for train', cm_train)
    announce('Confusion Matrix for val', cm_val)
    announce('Classification Report for train', rep_train)
    announce('Classification Report for val', rep_val)
    announce('AUC-ROC for train', auc_train)
    announce('AUC-ROC for val', auc_val)

    # The ROC displays are built only after their heading has been printed.
    print('Roc-Auc-Curve for Train set')
    print(rule)
    print(plot_roc_curve(tuned, X_train, y_train), '\n')
    print('Roc-Auc-Curve for Val set')
    print(rule)
    print(plot_roc_curve(tuned, X_val, y_val), '\n')

    announce('Log-Loss for Train set', logloss_train)
    announce('Log-Loss for Val set', logloss_val)

    # Record this run in the shared comparison lists (one entry per list).
    model_list.append(algo)
    AUCROC_train.append(auc_train)
    AUCROC_val.append(auc_val)
    Precision_train.append(precision_score(y_train, labels_train))
    Precision_val.append(precision_score(y_val, labels_val))
    Recall_train.append(recall_score(y_train, labels_train))
    Recall_val.append(recall_score(y_val, labels_val))
    Acc_train.append(accuracy_score(y_train, labels_train))
    Acc_val.append(accuracy_score(y_val, labels_val))
    LogLoss_train.append(logloss_train)
    LogLoss_val.append(logloss_val)

In [None]:
# Untuned run: hand model_LR to model_fit and log it under the label below.
model_fit(
    model_LR,
    X_train[cols_num_X_train],
    y_train,
    X_val[cols_num_X_val],
    y_val,
    algo='Logistic Regression without Hyperparameters',
)

In [None]:
# Randomized search over params_LR for model_LR, logged under the label below.
model_fit_evaluation(
    model_LR,
    params_LR,
    X_train[cols_num_X_train],
    y_train,
    X_val[cols_num_X_val],
    y_val,
    algo='Logistic Regression with Hyperparameter tuning',
)

In [None]:
# Untuned run: hand model_DT to model_fit and log it under the label below.
model_fit(
    model_DT,
    X_train[cols_num_X_train],
    y_train,
    X_val[cols_num_X_val],
    y_val,
    algo='Decision Tree without Hyperparameters',
)

In [None]:
# Randomized search over params_DT for model_DT, logged under the label below.
model_fit_evaluation(
    model_DT,
    params_DT,
    X_train[cols_num_X_train],
    y_train,
    X_val[cols_num_X_val],
    y_val,
    algo='Decision Tree with Hyperparameter tuning',
)

In [None]:
# Untuned run: hand model_RF to model_fit and log it under the label below.
model_fit(
    model_RF,
    X_train[cols_num_X_train],
    y_train,
    X_val[cols_num_X_val],
    y_val,
    algo='Random Forest without Hyperparameters',
)

In [None]:
# Randomized search over params_RF for model_RF, logged under the label below.
model_fit_evaluation(
    model_RF,
    params_RF,
    X_train[cols_num_X_train],
    y_train,
    X_val[cols_num_X_val],
    y_val,
    algo='Random Forest with Hyperparameter tuning',
)

In [None]:
# Comparison table: one row per recorded run, one column per accumulated metric list.
eval_df = pd.DataFrame(
    {
        'model': model_list,
        'train_AUC': AUCROC_train,
        'val_AUC': AUCROC_val,
        'train_precision': Precision_train,
        'val_precision': Precision_val,
        'train_recall': Recall_train,
        'val_recall': Recall_val,
        'train_accu': Acc_train,
        'val_accu': Acc_val,
        'logloss_train': LogLoss_train,
        'logloss_val': LogLoss_val,
    }
)
eval_df

In [None]:
# Model features of the full (unsplit) training frame: every non-object column except
# the season / date / league_id columns, which are excluded by name.
X_num_train = [
    column
    for column, dtype in X.dtypes.items()
    if dtype != 'object' and column not in ('season', 'date', 'league_id')
]
X[X_num_train]

In [None]:
# Display the feature columns selected from the test frame.
X_test[cols_num_X_test]

In [None]:
# Shapes of the full training frame, its target, and the selected test features.
X.shape, y.shape, X_test[cols_num_X_test].shape

In [None]:
# Final model: repeat the logistic-regression randomized search on the FULL training
# frame (no hold-out split), then predict the test set with the best estimator.
rcv = RandomizedSearchCV(model_LR, params_LR, cv=5, scoring='neg_log_loss', n_jobs=-1, verbose=1, random_state=23)
rcv.fit(X[X_num_train], y)

rcv_best = rcv.best_estimator_
print('\n')
print('best estimator : ', rcv_best)
print('best parameters: ', rcv.best_params_)
print('best score: ', rcv.best_score_)
print('\n')

y_train_prob = rcv_best.predict_proba(X[X_num_train])
y_train_pred = rcv_best.predict(X[X_num_train])
# Score the test set on exactly the columns (and column order) the model was fitted on.
# The previous code indexed X_test with cols_num_X_test, a list derived separately from
# the test frame, which is not guaranteed to match the fitted feature list.
y_test_prob = rcv_best.predict_proba(X_test[X_num_train])
y_test_pred = rcv_best.predict(X_test[X_num_train])

# Only training metrics exist here: the test frame carries no labels to score against.
matrix_train = confusion_matrix(y, y_train_pred)
report_train = classification_report(y, y_train_pred)
auc_train = roc_auc_score(y, y_train_prob[:,1])
logloss_train = log_loss(y, y_train_prob[:,1])

print('Confusion Matrix for train')
print('='*60)
print(matrix_train,"\n")
print('Classification Report for train')
print('='*60)
print(report_train,"\n")
print('AUC-ROC for train')
print('='*60)
print(auc_train,'\n')
print('Roc-Auc-Curve for Train set')
print('='*60)
print(plot_roc_curve(rcv_best, X[X_num_train], y),'\n')
print('Log-Loss for Train set')
print('='*60)
print(logloss_train,'\n')

In [None]:
# Hard class predictions for the test set. X_test is indexed with X_num_train -- the
# feature list rcv_best was fitted on -- rather than the separately derived
# cols_num_X_test, so the columns and their order always match the fitted model.
y_test_pred = rcv_best.predict(X_test[X_num_train])
y_test_pred

In [None]:
# Wrap the predicted classes in a one-column frame and show how often each class occurs.
submission_df = pd.DataFrame({"Outcome": y_test_pred})
submission_df.value_counts()

In [None]:
# Replace the class labels in the submission with the predicted probability of the
# second class (column 1 of predict_proba). X_test is indexed with X_num_train -- the
# feature list rcv_best was fitted on -- rather than the separately derived
# cols_num_X_test, so the columns and their order always match the fitted model.
preds = rcv_best.predict_proba(X_test[X_num_train])
submission_df['Outcome'] = preds[:, 1]

In [None]:
# Write the submission frame to 'my_submission_file.csv' without the index column.
submission_df.to_csv('my_submission_file.csv', index=False)