In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# _Soccer Fever_

### _Introduction_
Soccer, also known as football, is the most popular game in the world. It’s a religion of its own. If groups of 10 people can stop time and make people watch them in awe and reverence, it’s in this beautiful game. Also, anybody can play soccer: all it needs is 4 poles, a ground and a ball, and you can get started right away.

In fact, Nelson Mandela very effectively used Football as the unifying factor when he was elected President of South Africa post the Apartheid era. The sport just cuts across all discriminating factors.

### _Relevance_
An entire ecosystem revolves around this beautiful sport. Clubs, Merchandise, listed football clubs, fan clubs and a group of rivals who can just get into a fight based on the outcome of the game.  The amount of currency involved in this game is just phenomenal. It impacts millions of people who depend on it for their livelihood and recreation.

### _Criticality_
We live in ambiguity and always need some information to just make a decision. Decisions are made based on possible outcomes. Win/ Loss/ Pass / Fail etc.

The below problem statement is a classic study for decision-making and understanding the odds stacked against a particular situation.

<b> Train </b>
Dataset: 7443*21
Columns: 21
Target Column: Outcome
Evaluation Metric: Log Loss

<b> Test </b>
Dataset: 4008*20
Columns: 20
Submission Format :

Dataset: 4008*1( Column Name - ‘Outcome’)
Skills
Multi-Class Classification
Optimizing Log Loss

## _Load Packages and Data_

In [None]:
#data manipulation
import pandas as pd
import numpy as np
#system operations and python version
import os
import sys
assert sys.version_info >= (3,5)
#visualizations
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Show every column when a DataFrame is displayed.
pd.options.display.max_columns = None
# Uniform figure size and axis/tick label sizes for all plots.
from pylab import rcParams
rcParams.update({
    'figure.figsize': (12, 5),
    'axes.labelsize': 12,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})
# Silence warnings (an empty message pattern matches every warning message).
import warnings
warnings.filterwarnings(action='ignore', message='')

In [None]:
# Read the training and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path, delimiter=',', engine='python')
    for csv_path in (
        '/kaggle/input/soccer-dataset/train.csv',
        '/kaggle/input/soccer-dataset/test.csv',
    )
)

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Preview the last rows of the training data.
train.tail()

- _As per the data description, Outcome is the target label_


In [None]:
# Compare the (rows, columns) shape of the training and test data.
train.shape, test.shape

In [None]:
# Column names, dtypes and non-null counts of the training data.
train.info()

In [None]:
# Summary statistics of the numerical features, one feature per row.
train.describe().T

In [None]:
# Collect the names of the object-dtype (categorical) columns and print them.
cat_features = list(train.select_dtypes(include='object').columns)
print(f'Categorical features \n {cat_features}')

In [None]:
# Number of distinct seasons present in the training data.
train['season'].nunique()

In [None]:
# Number of rows per season.
train['season'].value_counts()

In [None]:
# For every categorical feature, report how many distinct categories it has
# and how often each category occurs (most frequent first).
for col in cat_features:
    category_counts = train[col].value_counts().sort_values(ascending=False)
    print(f'Feature Name {col}')
    print('--------------------------------')
    print(f'Number of unique categories {train[col].nunique()}')
    print('Unique categories & their counts')
    print(category_counts)
    print('\n')

- _There are 3 seasons from 2019 to 2021_
- _There are 657 unique teams in both Team1 and Team2_
- _There are 39 unique leagues and United Soccer League and Major League Soccer have the highest counts_ 

In [None]:
# Check the class balance of the target label.
# The column is passed by keyword: a bare positional Series is not reliably
# interpreted as the x variable across seaborn versions.
sns.countplot(x='Outcome', data=train)
plt.title('Countplot of Outcome')
plt.show()

In [None]:
# Exact number of rows per Outcome class.
train['Outcome'].value_counts()

- _Clearly the data is not balanced. This will have to be accounted for during model training. In logistic regression this can be addressed via the class-weight hyper-parameter. Another approach is to split the dataset for model training in a stratified manner._

In [None]:
# Count the null values in every column, largest count first.
train.isnull().sum().sort_values(ascending=False)

In [None]:
# Draw the null-value mask of the training data as a heatmap.
fig, ax = plt.subplots(figsize=(15, 7))
sns.heatmap(train.isna(), cbar=False, yticklabels=False, cmap='summer', ax=ax)
ax.set_title('Null Values Visualization')
plt.show()

- _The yellow patches reflect the null values. There are way too many null values in some of the features_
- _Imputing these features with more than 50% of their values as null would be a huge assumption and risk to the model. Secondly, there is no metadata provided with the dataset to determine what these features actually reflect and how they can influence the Outcome. This is a pity but a hard reality of this hackathon._

## _Convert features to right data type_
- _There are date features as object_
- _season should be a categorical feature which can be later encoded for model training_
- _league id is numerical and can be converted to categorical. However, in all likelihood this feature will not turn out to be important for model training. The model might learn patterns specific to these ids and may not generalize well to unseen data._
- _league id however, is a unique identifier and in production it would be a helpful feature to determine that the same data remains in the train and dev set even when the dataset is updated with new data. With the help of these unique league ids, hash codes can be generated using which the same data observation can be ensured to be in the train and the dev set for multiple iterations in the future. For the current problem, this feature is not useful_

In [None]:
# Parse the date column (day-first parsing) and store season as a string label.
train = train.assign(
    date=pd.to_datetime(train['date'], dayfirst=True),
    season=train['season'].astype(str),
)

In [None]:
# Re-inspect the column dtypes and non-null counts.
train.info()

In [None]:
# Preview the first two rows of the training data.
train.head(2)

## _Exploratory Data Analysis_

In [None]:
'''
First & foremost, create a copy of the train data
'''
# `soccer` is an independent copy, so changes made to it do not alter `train`.
soccer = train.copy()

In [None]:
# Plot a histogram for every numeric (float or int) feature.
num_features = list(soccer.select_dtypes(include=[float, int]).columns)

soccer_num = soccer[num_features]
soccer_num.hist(bins=30, figsize=(20, 15))
plt.show()

In [None]:
# Correlation heatmap of the numeric features.
# The numeric/bool columns are selected explicitly: recent pandas versions no
# longer drop non-numeric columns (e.g. team and league names) silently in
# DataFrame.corr() and raise instead.
numeric_corr = soccer.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(numeric_corr)
plt.title('Correlation Heatmap')
plt.show()

- _Some multi-collinearity between the features is visible, for example score1 with adj_score1 and score2 with adj_score2._

In [None]:
# Pairwise scatter plots of the numerical features which do not have null values.
from pandas.plotting import scatter_matrix

attributes = ['SPI1', 'SPI2', 'proj_score1', 'proj_score2']
scatter_matrix(soccer.loc[:, attributes], figsize=(12, 8))
plt.show()

#### _Analysis from the scatter plots_
- _SPI1 and SPI2 are highly correlated with each other. Both these features are positively correlated_
- _proj_score1 and proj_score2 are highly negatively correlated_
- _There is also visible correlation between SPI1 and proj_score1 as well as SPI2 and proj_score2_
- _SPI2 and proj_score2 seems to have a non-linear relationship as the scatter plot is denser till the SPI2 values of 50 and then becomes lighter and higher as SPI2 values increases_

_One way to address the multi-collinearity would be by dimensionality reduction using PCA or even t-SNE for non linear relationship_

In [None]:
# Scatter plot of SPI1 against SPI2 with a regression fit per Outcome class
# (different marker and colour for each class).
sns.lmplot(x='SPI1',y='SPI2',data=soccer,hue='Outcome',palette='Set1',markers=['x','o'])
plt.title('SPI1 vs SPI2 Scatter Plot')
plt.grid()
plt.xlabel('SPI1')
plt.ylabel('SPI2')
plt.show()

In [None]:
# Scatter plot of proj_score2 against SPI2 with a regression fit per Outcome class.
# lmplot is a figure-level function that creates its own figure, so the size is
# set through height/aspect rather than a preceding plt.figure() call, which
# only produced an additional empty figure.
sns.lmplot(x='proj_score2', y='SPI2', data=soccer, hue='Outcome', palette='Set1',
           markers=['x', 'o'], height=7, aspect=15 / 7)
plt.title('proj_score2 vs SPI2 Scatter Plot')
plt.grid()
plt.xlabel('proj_score2')
plt.ylabel('SPI2')
plt.show()

In [None]:
# Regression plot of proj_score2 against SPI2, with jitter applied to the x values.
ax = sns.regplot(data=soccer, x='proj_score2', y='SPI2', x_jitter=0.2)
ax.set_title('proj_score2 vs SPI2 Regression Plot')
ax.grid()
ax.set_xlabel('proj_score2')
ax.set_ylabel('SPI2')
plt.show()

- _This shows that proj_score2 and SPI2 have significant linear correlation_

In [None]:
# KDE of SPI1 split by Outcome, stacked to fill the full height so the bands
# show the relative share of each class.
kde_fill_kwargs = {
    "kind": "kde",
    "height": 6,
    "multiple": "fill",
    "clip": (0, None),
    "palette": "ch:rot=-.25,hue=1,light=.75",
}
sns.displot(data=soccer, x="SPI1", hue="Outcome", **kde_fill_kwargs)
plt.title('Plot of Distribution of SPI1')
plt.show()

- _As the value of SPI1 increases, there are more outcomes of 1 versus 0_

In [None]:
# KDE of SPI2 split by Outcome, stacked to fill the full height so the bands
# show the relative share of each class.
kde_fill_kwargs = {
    "kind": "kde",
    "height": 6,
    "multiple": "fill",
    "clip": (0, None),
    "palette": "ch:rot=-.25,hue=1,light=.75",
}
sns.displot(data=soccer, x="SPI2", hue="Outcome", **kde_fill_kwargs)
plt.title('Plot of Distribution of SPI2')
plt.show()

- _As the value of SPI2 increases, there appear to be more outcomes of 0. However, over the distribution of SPI2, the outcomes 0 and 1 appear to be almost equal_

In [None]:
# KDE of proj_score1 split by Outcome, stacked to fill the full height so the
# bands show the relative share of each class.
kde_fill_kwargs = {
    "kind": "kde",
    "height": 6,
    "multiple": "fill",
    "clip": (0, None),
    "palette": "ch:rot=-.25,hue=1,light=.75",
}
sns.displot(data=soccer, x="proj_score1", hue="Outcome", **kde_fill_kwargs)
plt.title('Plot of Distribution of proj_score1')
plt.show()

- _Similar to the SPI1 feature, there are more outcomes of value 1 as the value of proj_score1 increases. As a reminder, there is no feature description or metadata with which to explain the logic behind this relationship._

In [None]:
# KDE of proj_score2 split by Outcome, stacked to fill the full height so the
# bands show the relative share of each class.
kde_fill_kwargs = {
    "kind": "kde",
    "height": 6,
    "multiple": "fill",
    "clip": (0, None),
    "palette": "ch:rot=-.25,hue=1,light=.75",
}
sns.displot(data=soccer, x="proj_score2", hue="Outcome", **kde_fill_kwargs)
plt.title('Plot of Distribution of proj_score2')
plt.show()

- _Similar to the SPI2 feature, there are more outcomes of value 0 as the value of proj_score2 increases. As a reminder, there is no feature description or metadata with which to explain the logic behind this relationship._

## _Summary of the EDA_
- _There are numerous values with null in some of the features. Instead of imputing without understanding their significance and role, it is better to simply drop them from further analysis and from building a predictive model._
- _SPI1 SPI2, proj score1 and proj score2 are some of the features which depict multi-collinearity and also influence the outcome label in a big way as indicated by the displots. The data is imbalanced with more 1's than 0 outcomes and this gives us a clue of which feature would be more suitable to have a better predictability of one of these two outcomes_




## _Statistical Tests_

In [None]:
# Normality test (scipy.stats.normaltest) for the SPI and projected-score
# features, judged at the 0.05 significance level.
from scipy.stats import normaltest

features = ['SPI1', 'SPI2', 'proj_score1', 'proj_score2']
for feature in features:
    data = soccer[feature]
    stat, p = normaltest(data)
    verdict = 'is' if p > 0.05 else 'is NOT'
    print('Feature {} {} normally distributed'.format(feature, verdict))

## _Statistical Correlations_

In [None]:
# Pearson correlation test between SPI1 and SPI2, judged at the 0.05 level.
from scipy.stats import pearsonr

data1, data2 = soccer['SPI1'], soccer['SPI2']

stat, p = pearsonr(data1, data2)
print('stat={:.3f}, p={:.3f}'.format(stat, p))
message = (
    'Feature SPI1 and SPI2 are independent and have no correlation'
    if p > 0.05
    else 'Feature SPI1 and SPI2 are dependent and are correlated'
)
print(message)

- _Similarly, we can statistically check the correlations between the other features. In fact, they are all correlated_

## _Feature Engineering and Data Preparation_
### _Ways to handle the multi-collinearity_
- _Drop one of the multi-collinear feature_
- _Create  new feature combining the multi-collinear features_
- _Let the features be present and handle them as part of dimensionality reduction techniques_


In [None]:
# Drop the features which have null values; these all have more than 64% missing values.
high_null_columns = [
    'importance1', 'importance2',
    'score1', 'score2',
    'xg1', 'xg2',
    'nsxg1', 'nsxg2',
    'adj_score1', 'adj_score2',
]
soccer.drop(columns=high_null_columns, inplace=True)
soccer.head()
             

In [None]:
# Null count per remaining column.
soccer.isna().sum()

In [None]:
'''
Create a new feature as the linear sum of SPI1 and SPI2
'''
# Element-wise sum of the two SPI columns, stored as a single combined feature.
soccer['SPI'] = soccer['SPI1'] + soccer['SPI2']
# The analogous combined proj_score feature is left disabled:
#soccer['proj_score'] = soccer['proj_score1'] + soccer['proj_score2']

In [None]:
# Drop the redundant feature columns: the individual SPI and proj_score columns.
redundant_columns = ['SPI1', 'SPI2', 'proj_score1', 'proj_score2']
soccer.drop(columns=redundant_columns, inplace=True)
soccer.head(3)

In [None]:
# Replace the date with its month and weekday; the year is taken care of by
# the season feature.
import datetime as dt
date_col = soccer['date']
soccer['month'] = date_col.dt.month
soccer['weekday'] = date_col.dt.weekday
soccer.drop(columns='date', inplace=True)
soccer.head(2)

In [None]:
# Column names, dtypes and non-null counts after feature engineering.
soccer.info()

In [None]:
# Row counts per league_id split by Outcome, drawn as transparent bars with
# coloured outlines on a wide figure.
plt.figure(figsize=(25,5))
ax = sns.countplot(x="league_id", data=soccer,
                   facecolor=(0, 0, 0, 0),
                   linewidth=2,
                   edgecolor=sns.color_palette("dark", 3),hue='Outcome')
plt.title('Countplot of league_id')
plt.show()

In [None]:
# Split the data into train and dev sets (80/20).
from sklearn.model_selection import train_test_split

seed = 41
test_size = 0.2
y = soccer['Outcome']
X = soccer.drop(columns='Outcome')

# Stratify on the target so both splits keep the same Outcome proportions.
X_train, X_dev, y_train, y_dev = train_test_split(
    X, y, test_size=test_size, random_state=seed, stratify=y
)

In [None]:
# Shapes of the feature and label splits after the train/dev split.
X_train.shape,X_dev.shape,y_train.shape,y_dev.shape

In [None]:
# Separate the non-object (numerical) columns from the object (categorical)
# columns in each split.
X_train_num, X_train_cat = (
    X_train.select_dtypes(exclude='object'),
    X_train.select_dtypes(include='object'),
)
X_dev_num, X_dev_cat = (
    X_dev.select_dtypes(exclude='object'),
    X_dev.select_dtypes(include='object'),
)

In [None]:
# Numerical preprocessing: power transform followed by standard scaling,
# fitted on the training split.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler

numeric_steps = [
    ('pow_transform', PowerTransformer()),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

X_train_num_tr = num_pipeline.fit_transform(X_train_num)



In [None]:
# Display the transformed numerical training values.
X_train_num_tr

In [None]:
# One-hot encode the categorical features as a dense array; categories not seen
# during fitting are ignored at transform time (handle_unknown='ignore').
from sklearn.preprocessing import OneHotEncoder

# The dense-output flag was renamed from `sparse` to `sparse_output` in newer
# scikit-learn releases and the old name was later removed, so try the new
# spelling first and fall back to the old one for older installations.
try:
    cat_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    cat_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

cat_pipeline = Pipeline([('cat_encoder', cat_encoder)])
X_train_cat_tr = cat_pipeline.fit_transform(X_train_cat)

In [None]:
# Shape of the transformed numerical training matrix.
X_train_num_tr.shape

In [None]:
# Join the transformed numerical and categorical arrays column-wise to form
# the model input.
X_train_tr = np.concatenate([X_train_num_tr, X_train_cat_tr], axis=1)

In [None]:
# Apply the already-fitted pipelines to the dev set (transform only, no refit).
X_dev_num_tr, X_dev_cat_tr = (
    num_pipeline.transform(X_dev_num),
    cat_pipeline.transform(X_dev_cat),
)
X_dev_tr = np.concatenate([X_dev_num_tr, X_dev_cat_tr], axis=1)

In [None]:
# Shape of the transformed dev matrix.
X_dev_tr.shape

In [None]:
# The labels need no transformation; they are only converted to NumPy arrays.
y_train_tr, y_dev_tr = (np.array(labels) for labels in (y_train, y_dev))

## _Modeling_

In [None]:
#import predictive models
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Train the baseline random forest; class 0 is given five times the weight of class 1.
weights = {0: 5.0, 1: 1.0}
rf_params = dict(
    n_estimators=200,
    random_state=42,
    max_features=0.2,
    class_weight=weights,
)
clf = RandomForestClassifier(**rf_params)
clf.fit(X_train_tr, y_train_tr)

In [None]:
# Class predictions of the fitted model on the train and dev sets.
predictions_train, predictions_dev = (
    clf.predict(features_tr) for features_tr in (X_train_tr, X_dev_tr)
)

In [None]:
## evaluate the models
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report
# plot_confusion_matrix is not available in recent scikit-learn releases; guard
# the import so this cell does not abort the notebook there. The name stays
# defined (as None) in that case.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None

In [None]:
# Report accuracy on the train and dev sets.
for split_name, y_true, y_pred in (
    ('train', y_train_tr, predictions_train),
    ('dev', y_dev_tr, predictions_dev),
):
    print(f'Accuracy on the {split_name} set {accuracy_score(y_true, y_pred)}')

In [None]:
# Classification report (per-class metrics) on the dev set.
dev_report = classification_report(y_dev_tr, predictions_dev)
print(dev_report)

In [None]:
# Prepare the test data for predictions by repeating the preparation steps
# applied to the training data. This is done linearly here; a better approach
# is a shared data-preparation function.

# convert to the right data type
test['date'] = pd.to_datetime(test['date'], dayfirst=True)
test['season'] = test['season'].astype(str)

# drop the features with more than 65% null values
test.drop(columns=['importance1', 'importance2', 'score1', 'score2', 'xg1', 'xg2',
                   'nsxg1', 'nsxg2', 'adj_score1', 'adj_score2'], inplace=True)

# create the new feature & drop the redundant features
test['SPI'] = test['SPI1'] + test['SPI2']
test.drop(columns=['SPI1', 'SPI2', 'proj_score1', 'proj_score2'], inplace=True)

# replace the date with month and weekday
test['month'] = test['date'].dt.month
test['weekday'] = test['date'].dt.weekday
test.drop(columns='date', inplace=True)

# Select exactly the columns (in the same order) that the pipelines were fitted
# on, instead of re-deriving them by dtype: this keeps the test matrix aligned
# with the training matrix even if the test file has extra or reordered columns.
X_test_num = test[X_train_num.columns]
X_test_cat = test[X_train_cat.columns]

# apply the data transformation using the fitted pipelines (transform only)
X_test_num_tr = num_pipeline.transform(X_test_num)
X_test_cat_tr = cat_pipeline.transform(X_test_cat)
X_test_tr = np.hstack((X_test_num_tr, X_test_cat_tr))

In [None]:
# Refit the model on the entire training set (train + dev) before the final
# predictions on the test set, then report accuracy and log loss on that same data.
X_train_full = np.concatenate([X_train_tr, X_dev_tr], axis=0)
y_train_full = np.concatenate([y_train_tr, y_dev_tr])
clf.fit(X_train_full, y_train_full)

predictions_train_full = clf.predict(X_train_full)
train_full_accuracy = accuracy_score(y_train_full, predictions_train_full)
print(f'Accuracy = {train_full_accuracy}')

predictions_train_full_proba = clf.predict_proba(X_train_full)
train_full_log_loss = log_loss(y_train_full, predictions_train_full_proba[:, 1])
print(f'log loss = {train_full_log_loss}')

In [None]:
# Class predictions of the fitted classifier on the prepared test data.
predictions_test = clf.predict(X_test_tr)
# Per-class predicted probabilities for the same rows.
predictions_test_proba = clf.predict_proba(X_test_tr)

In [None]:
# Write the second predict_proba column (class 1, given the 0/1 labels used in
# the class weights) as the submission file.
submission_df = pd.DataFrame({'Outcome': predictions_test_proba[:, 1]})
submission_df.to_csv('soccer_26Aug_1.csv', index=False)

## _Further Improvements_
- _Performance can be improved using hyper-parameter search and cross validation. The metric of interest is log_loss_



## _Deep Learning Model_

In [None]:
#import tensorflow
import tensorflow as tf

In [None]:
# Fully connected binary classifier: six ReLU hidden layers (1024 down to 32
# units) with dropout after the second and fourth, and one sigmoid output unit.
activation = tf.keras.layers.Activation(tf.nn.relu)
Dense, Dropout = tf.keras.layers.Dense, tf.keras.layers.Dropout

model = tf.keras.models.Sequential([
    Dense(units=1024, input_dim=X_train_full.shape[1], activation='relu'),
    Dense(units=512, activation=activation),
    Dropout(0.5),
    Dense(units=256, activation=activation),
    Dense(units=128, activation=activation),
    Dropout(0.5),
    Dense(units=64, activation=activation),
    Dense(units=32, activation=activation),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy loss with the Adam optimizer; accuracy is tracked as a metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Stop training once val_loss has not improved for 50 epochs and restore the
# weights of the best epoch.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss',patience=50,restore_best_weights=True)

In [None]:
# Fit the network for up to 500 epochs in batches of 512, holding out 20% of
# the data for validation; the early-stopping callback may end training sooner.
history = model.fit(X_train_full,y_train_full,epochs=500,validation_split=0.2,callbacks=[early_stop],verbose=2,
                   batch_size=512)

In [None]:
# Outputs of the neural network on the full training data.
# NOTE(review): this rebinds `predictions_train`, which earlier held the random
# forest's train-set class predictions; log_loss is imported here but not
# called in this cell.
from sklearn.metrics import log_loss
predictions_train = model.predict(X_train_full)

In [None]:
# Display the network's outputs on the training data.
predictions_train

In [None]:
# Predict with the neural network on the prepared test data and write the
# outputs as the submission file.
predictions_test_dnn = model.predict(X_test_tr)
submission_df = pd.DataFrame(data=predictions_test_dnn, columns=['Outcome'])
submission_df.to_csv(path_or_buf='soccer_26Aug_2_dnn.csv', index=False)