## DS LAB EVALUATION
### NAME: SHUBHAM GOEL
### ROLL NO: 101903748
### GROUP: 3COE28

In [28]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [29]:
import cudf as cd
import cupy as cp
from cuml.metrics import confusion_matrix,entropy,roc_auc_score,mean_squared_log_error
from cuml.decomposition import PCA
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import seaborn as sns

In [30]:
# Load the competition's train and test CSVs with cuDF.
train_csv = '/kaggle/input/ucl-ai-society-football-match-prediction/Kaggle Data/train.csv'
test_csv = '/kaggle/input/ucl-ai-society-football-match-prediction/Kaggle Data/test.csv'
df_train, df_test = (cd.read_csv(path) for path in (train_csv, test_csv))
df_train  # shown as the cell output

#### Dropping the id column

In [31]:
# Remove the 'id' column from both frames, keeping a copy of the test ids
# (they are needed again when the submission frame is built).
ids = df_test['id'].copy()
df_train = df_train.drop(columns=['id'])
df_test = df_test.drop(columns=['id'])
df_train

#### Splitting the target label column

In [32]:
# Separate the 'Outcome' target from the feature columns.
target_col = 'Outcome'
y = df_train[target_col].copy()
df_train = df_train.drop(columns=[target_col])

#### Combining training and testing for further preprocessing

In [33]:
# Stack the training rows on top of the test rows so both receive identical
# preprocessing; the training rows therefore come first in `df`.
frames = [df_train, df_test]
df = cd.concat(frames)
df

#### Encoding the categorical columns

In [34]:
from cuml.preprocessing.LabelEncoder import LabelEncoder

# Integer-encode every column except the last one, using a fresh encoder per column.
# NOTE(review): the last column is skipped — presumably it is not categorical; confirm against the data.
for col in df.columns[:-1]:
    df[col] = LabelEncoder().fit_transform(df[col])
df

#### Converting all columns (including bool) to float32 for modelling

In [35]:
# Cast every column of the frame to float32 in a single call.
df = df.astype('float32')

#### Standard-scaling the data: replacing each value with its z-score so that all features share a similar scale

In [36]:
# Standardise every column to zero mean / unit standard deviation (z-score).
# The statistics are taken over the combined train + test frame `df`.
# Vectorised column arithmetic replaces the per-element lambda that was routed
# through `Series.applymap`.
for col in df.columns:
    col_mean = df[col].mean()
    col_std = df[col].std()
    # A constant column has std == 0; divide by 1 instead so it becomes all
    # zeros rather than NaN.
    scale = col_std if col_std != 0 else 1.0
    df[col] = (df[col] - col_mean) / scale
df = df.astype('float32')

#### Splitting the train and test data

In [37]:
# Undo the earlier concatenation: `df` was built as train rows followed by test
# rows, so the first len(df_train) rows are the training features and the rest
# are the test features. Deriving the boundary from the data avoids relying on
# a hard-coded row count.
n_train = len(df_train)
X = df.iloc[:n_train, :]
df_test = df.iloc[n_train:, :]

#### Checking the class imbalance in the target labels

In [38]:
# Inverse class frequency (total rows / rows per class) as a plain dict keyed by label.
label_counts = y.value_counts()
inverse_freq = len(y) / label_counts
class_weights = dict(inverse_freq.to_pandas())
class_weights

In [39]:
# Inspect the array type returned by `.values` (the oversampling cell calls `.get()` on it).
type(X.values)

#### Class imbalance is handled by oversampling the minority class using SMOTE (Synthetic Minority Oversampling Technique)

In [40]:
# Oversample the minority class with SMOTE (imbalanced-learn). The feature and
# label arrays are pulled out with `.values.get()` before resampling and the
# results are wrapped back into cuDF objects afterwards.
# A fixed seed makes the synthetic samples reproducible across runs.
# NOTE(review): resampling happens before the train/validation split, so
# synthetic points derived from validation rows can end up in the training
# split and make validation metrics optimistic.
oversample = SMOTE(random_state=42)
cols = X.columns
X, y = oversample.fit_resample(X.values.get(), y.values.get())
X = cd.DataFrame(X, columns=cols)
y = cd.Series(y)

In [41]:
# Shapes of the feature frame and label series after oversampling.
X.shape,y.shape

#### Checking the correlation between features

In [42]:
# Pairwise feature correlations, computed with pandas and drawn as an annotated heatmap.
corr_matrix = X.to_pandas().corr()
sns.heatmap(corr_matrix, annot=True)

#### Splitting training data into training and validation

In [43]:
from cuml.model_selection import train_test_split

# Hold out 30% of the rows for validation, with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

#### Getting an initial idea by fitting the data with a Random Forest Classifier model and calculating relevant metrics

In [44]:
from cuml.ensemble import RandomForestClassifier

# Baseline: a random forest with default hyper-parameters.
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Keep the validation score as the last expression so the notebook displays it
# (previously it was computed mid-cell and the result was discarded).
clf.score(X_test, y_test)

In [45]:
def metrics(y_test,y_pred):
    """Print evaluation metrics comparing true labels with predicted labels.

    Both inputs are converted to CuPy arrays first. ROC AUC is computed on the
    arrays as converted; the remaining metrics use integer-cast copies.

    Args:
        y_test: True labels (anything ``cp.array`` accepts).
        y_pred: Predicted labels (anything ``cp.array`` accepts).

    Returns:
        None. All results are printed.
    """
    truth = cp.array(y_test)
    preds = cp.array(y_pred)
    truth_int = truth.astype('int')
    preds_int = preds.astype('int')

    print('roc_auc_score: ', roc_auc_score(truth, preds))
    print('mean_squared_log_error: ', mean_squared_log_error(truth_int, preds_int))
    print('Confusion Matrix:\n', confusion_matrix(truth_int, preds_int))
    # classification_report is imported from scikit-learn, so it is given the
    # arrays returned by `.get()` rather than the CuPy arrays themselves.
    report = classification_report(truth_int.get(), preds_int.get())
    print('Precision, Recall, F1 score, Accuracy:\n', report)

In [46]:
# Report the metrics for the random-forest predictions on the validation split.
metrics(y_test,y_pred)

In [47]:
# Shape of the validation labels.
y_test.shape

#### Improving the performance by using a hyperparameter-tuned XGBoost classifier (Gradient Boosting)

In [48]:
import xgboost as xgb

In [49]:
# Booster configuration. Numeric settings are passed as strings, as before;
# only `max_depth` is given as an int.
params = {
    # learning task
    'objective': 'binary:logistic',
    'eval_metric': 'error',
    # tree construction
    'tree_method': 'exact',
    'max_depth': 10,
    'min_child_weight': '0',
    'max_delta_step': '10',
    # step size and row sampling
    'eta': '0.3',
    'subsample': '1',
    # regularisation ('gamma' is not set here)
    'lambda': '1',
    'alpha': '0',
    # class balance
    'scale_pos_weight': '1',
}
num_rounds = 150

# Wrap the train / validation splits in DMatrix objects and fit the booster.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
bst = xgb.train(params, dtrain, num_boost_round=num_rounds)


In [50]:
# Booster outputs for the validation split, rounded to the nearest integer to get class labels.
y_pred = cp.around(bst.predict(dtest))

In [51]:
# Report the same metrics for the rounded XGBoost predictions on the validation split.
metrics(y_test,y_pred)

#### Using the same params as the previous model, training the final submission model on the whole training + validation data

In [52]:
%%time
bst = xgb.train(params, xgb.DMatrix(X,label=y), num_rounds)
dfinal=xgb.DMatrix(df_test)
final=cp.around(bst.predict(dfinal))

In [53]:
# Assemble the submission frame: the saved test ids next to the predictions,
# with the 'Outcome' column cast to int64.
sub = cd.DataFrame()
sub['id'] = ids
sub['Outcome'] = final
sub = sub.astype({'Outcome': 'int64'})
sub

In [54]:
# Write the submission frame to 'submission.csv' without the index column.
sub.to_csv('submission.csv',index=False)