In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

In [None]:
import numpy as np

import pandas as pd
import datatable as dt


#Import Libraries for train test split
from sklearn.model_selection import train_test_split

# Import XGBoost module
from xgboost import XGBClassifier

# Confusion matrix to evaluate performance
from sklearn.metrics import confusion_matrix, accuracy_score

# AUC score to evaluate performance
from sklearn.metrics import roc_auc_score

In [None]:
# Display configuration: lift the row and column truncation limits so that
# displayed frames are shown in full (None means "no limit").
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [None]:
# Read the competition CSV files with datatable, convert each to a pandas
# DataFrame and use the 'id' column as the index.
DATA_DIR = '../input/tabular-playground-series-oct-2021'

train_full = dt.fread(DATA_DIR + '/train.csv').to_pandas().set_index('id')
test_full = dt.fread(DATA_DIR + '/test.csv').to_pandas().set_index('id')
# Fix: this frame used to be read from test.csv, which made it a second copy
# of test_full instead of the sample submission file.
sample_submission = dt.fread(DATA_DIR + '/sample_submission.csv').to_pandas().set_index('id')

In [None]:
# Preview the first five rows of the training data.
train_full.head(n=5)

In [None]:
# Print the (rows, columns) shape of the train and test datasets.
print('Shape of train data: {} \nShape of test data: {}'.format(train_full.shape, test_full.shape))

In [None]:
# Print pandas' summary of the training frame (column dtypes and memory usage).
train_full.info()

In [None]:
# Missing-value summary: total number of NaN cells in each dataset,
# shown as a small two-row table.
missing_counts = {
    'train': train_full.isna().sum().sum(),
    'test': test_full.isna().sum().sum(),
}

pd.DataFrame({
    'data_set': list(missing_counts.keys()),
    'missing_values': list(missing_counts.values()),
})

In [None]:
# Group the columns by their dtype in the training frame.
column_dtypes = train_full.dtypes
is_float_col = (column_dtypes == 'float64').to_numpy()
is_bool_col = (column_dtypes == 'bool').to_numpy()

# Continuous features (original author's check: len(num_cols) = 240).
num_cols = column_dtypes.index[is_float_col].to_list()
# Boolean features without the label column
# (original author's check: len(binary_cols) = 45 excl target).
binary_cols = column_dtypes.index[is_bool_col].drop('target').to_list()

In [None]:
# Reduce memory usage by changing datatypes: float features become float32,
# boolean features become short integers. Both frames share the same feature
# columns, so the same conversion is applied to each.
for frame in (train_full, test_full):
    frame[num_cols] = frame[num_cols].astype(np.float32)
    frame[binary_cols] = frame[binary_cols].astype(np.short)

# The label column exists only in the training frame.
train_full['target'] = train_full['target'].astype(np.short)

In [None]:
# Report the total memory footprint of each frame in megabytes
# (deep=True also counts the contents of object columns).
for message, frame in (
    ('Memory (train): {:.2f} MB', train_full),
    ('Memory (test) : {:.2f} MB', test_full),
):
    memory_usage = frame.memory_usage(deep=True) / 1024 ** 2
    print(message.format(memory_usage.sum()))

In [None]:
# Work on a reproducible 1% random sample of the training data; the original
# note says the dataset consists of 10^6 rows, i.e. about 10,000 sampled rows.
SAMPLE_FRACTION = 0.01
SAMPLE_SEED = 42

train_sample = train_full.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)
train_sample.head()

In [None]:
# Separate the sample into the feature matrix (X) and the target vector (y).
# The target is taken to be the last column of the frame.
target_column = train_sample.columns[-1]

y = train_sample[target_column]  # Target
X = train_sample.drop(columns=target_column)  # Features

In [None]:
# Hold out 20% of the sampled rows as a validation set (fixed seed for reproducibility).
VALID_FRACTION = 0.2
SPLIT_SEED = 0

X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=VALID_FRACTION,
    random_state=SPLIT_SEED,
)

In [None]:
# Fit an XGBoost classifier with default hyperparameters (fixed seed)
# on the training split.
classifier = XGBClassifier(random_state=1)
classifier.fit(X=X_train, y=y_train)

# Making prediction on validation set (predicted class labels).
y_pred = classifier.predict(X_valid)

In [None]:
# Evaluate the class predictions on the validation set: print the confusion
# matrix, then display the accuracy as the cell output.
cm = confusion_matrix(y_true=y_valid, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_valid, y_pred=y_pred)

In [None]:
# Predict class probabilities on the validation set and keep the second
# column (index 1) of the returned probability matrix.
valid_proba = classifier.predict_proba(X_valid)
y_pred_prob = valid_proba[:, 1]

In [None]:
# ROC AUC of the predicted probabilities against the validation labels.
auc_score = roc_auc_score(y_true=y_valid, y_score=y_pred_prob)
auc_score

In [None]:
# Predict class probabilities for the full test set and keep the second
# column (index 1) as the submission score.
test_proba = classifier.predict_proba(test_full)
preds_test_proba = test_proba[:, 1]
preds_test_proba

In [None]:
# Assemble the submission table (test ids plus predicted probabilities)
# and write it to disk without the DataFrame's own index.
submission_columns = {
    'id': test_full.index,
    'target': preds_test_proba,
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)