# Extreme Gradient Boosting model
The aim of this notebook is to produce a baseline model for initial discovery.

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Print the full path of every file found under the Kaggle input directory
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Import modules for model analysis
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

# Import xgb modules
import xgboost as xgb

In [None]:
# Read the competition train and test CSVs.
# index_col=0 makes the first CSV column the DataFrame index instead of a feature column.
train = pd.read_csv('../input/tabular-playground-series-oct-2021/train.csv',index_col=0)
test  = pd.read_csv('../input/tabular-playground-series-oct-2021/test.csv', index_col=0)

# Preview the first rows of the training frame
train.head()

In [None]:
train.shape

In [None]:
# Check the memory consumed by the DataFrame
train.info(memory_usage='deep')

In [None]:
# Lets understand the range of values prior to improving memory performance
train.describe().T

In [None]:
# Review cardinality distribution by feature
train.nunique().value_counts()
# Looks like 46 features could be converted to categorical

> NOTE: check for collinearity between the categorical features (excluding the target)

In [None]:
# Check for the max value by feature
train.max().value_counts()
# It appears that scaling will not be required

In [None]:
# Take a random sample of the training dataset for model development analysis.
# frac=0.25 keeps 25% of the rows; random_state=42 fixes the seed so the same rows are drawn on every run.
train_data      = train.sample(frac=0.25, random_state=42)

In [None]:
# Drop the original train dataset to conserve space within the environment
del train

In [None]:
# Lets reduce the memory usage of the features
# First - check the integer values and downcast
def int_downcast(df):
    """Shrink every int64 column of ``df`` to the smallest integer dtype that holds its values.

    The columns are reassigned on ``df`` itself (the frame is modified in place);
    the same frame is returned so the call can be chained or displayed.
    """
    wide_int_names = df.select_dtypes(include=['int64']).columns
    for name in wide_int_names:
        df[name] = pd.to_numeric(df[name], downcast='integer')
    return df

int_downcast(train_data)
int_downcast(test)
train_data.memory_usage(deep=True) * 1e-6

In [None]:
train_data.dtypes

In [None]:
# Second - check the float values and downcast. Method will have to be applied to the train and test DataFrames
def float_downcast(df):
    """Shrink every float64 column of ``df`` to a smaller float dtype where pandas allows it.

    The columns are reassigned on ``df`` itself (the frame is modified in place);
    the same frame is returned so the call can be chained or displayed.
    """
    wide_float_names = df.select_dtypes(include=['float64']).columns
    for name in wide_float_names:
        df[name] = pd.to_numeric(df[name], downcast='float')
    return df

float_downcast(train_data)
float_downcast(test)

In [None]:
# Review the memory usage by DataFrame
train_data.info(memory_usage='deep')
test.info(memory_usage='deep')

The train dataset has now halved in size, which eases the memory pressure. We can now keep these downcast datasets in memory and use them going forward.

In [None]:
# Check for missing values.
# isnull().sum() counts the nulls in each column; value_counts() then shows how many columns
# share each null count, so a single entry keyed 0 means no column has any missing value.
print(f'Train df has missing value: {train_data.isnull().sum().value_counts()}')
print(f'Test df has missing value: {test.isnull().sum().value_counts()}')

# Feature analysis

In [None]:
# Correlation matrix (optional heatmap, left disabled)
# corr = train_data.corr()
# # Mask the upper triangle
# mask = np.triu(np.ones_like(corr, dtype=bool))
# Add the mask to the heatmap
# sns.heatmap(corr, mask=mask, center=0, linewidths=1, annot=True, fmt=".2f")
# plt.show()

# Absolute pairwise correlations for every column, target included
corr_matrix = train_data.corr().abs()

# Redundancy between features must be judged on feature-to-feature correlations only.
# Leaving 'target' in the matrix would mark a feature for removal simply because it
# correlates with the label, which is exactly the kind of feature we want to keep.
feature_corr = corr_matrix.drop(index='target', columns='target', errors='ignore')

# Hide the diagonal and upper triangle so each feature pair is inspected once
mask = np.triu(np.ones_like(feature_corr, dtype=bool))
tri_df = feature_corr.mask(mask)

# List column names of highly correlated features (|r| > 0.25); NaN cells compare as False
to_drop = tri_df.columns[(tri_df > 0.25).any()].tolist()
to_drop

In [None]:
corr_matrix['f22']

### Multicollinearity

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Review the multicollinearity for the features
# X_feat = train_data.drop('target', axis=1)
# vif_data = pd.DataFrame()
# vif_data['feature'] = X_feat.columns
# vif_data['VIF'] = [variance_inflation_factor(X_feat.values, i) for i in range(len(X_feat.columns))]

# Model Analysis

In [None]:
# Features and label
X = train_data.drop('target', axis=1)
y = train_data['target']

In [None]:
# Split the sampled data into a training part and a hold-out part for the models below.
# test_size=0.3 holds out 30% of the rows, stratify=y keeps the target class proportions
# the same in both parts, and random_state=6 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=6, stratify=y)

# Baseline model

In [None]:
# Baseline classifier: 10 boosting rounds with the settings below
xg_cl = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=10,
    seed=123,
    use_label_encoder=False,
    eval_metric='auc',
    tree_method='gpu_hist',
)

# Train on the training split
xg_cl.fit(X_train, y_train)

# Predicted class labels for the hold-out split
preds = xg_cl.predict(X_test)

# Accuracy = share of hold-out rows whose predicted label matches the true label
accuracy = float((preds == y_test).sum()) / len(y_test)
print("accuracy: %f" % (accuracy))

In [None]:
# Let's use the native boosting API and its built-in cross-validation

# Wrap the two splits and the competition test frame in DMatrix objects
d_train = xgb.DMatrix(data=X_train, label=y_train)
d_test = xgb.DMatrix(data=X_test, label=y_test)
xgd_test = xgb.DMatrix(data=test)

# Booster parameters. NOTE: the objective has to be provided explicitly
params = dict(
    objective="binary:logistic",
    max_depth=3,
    eval_metric='auc',
    tree_method='gpu_hist',
)

# 3-fold cross-validation over 10 boosting rounds, scored with AUC
cv_results = xgb.cv(
    params=params,
    dtrain=d_train,
    num_boost_round=10,
    nfold=3,
    metrics="auc",
    as_pandas=True,
    seed=123,
)

# Per-round results table
print(cv_results)

# Mean test-fold AUC after the final boosting round
print(cv_results["test-auc-mean"].iloc[-1])

In [None]:
# Review the train method
params = {
    "objective": "binary:logistic", 
    "max_depth": 3,
    "eval_metric": 'auc', 
    "tree_method": 'gpu_hist'
}

# train - verbose_eval option switches off the log outputs.
# Early stopping monitors the LAST entry of `evals` (the 'test' set here) and stops once its
# AUC has not improved for 100 consecutive rounds.
# NOTE: the same hold-out set is used both to pick the stopping round and to report the AUC
# below, so that AUC is an optimistic estimate.
xgb_clf = xgb.train(
    params,
    d_train,
    num_boost_round=5000,
    evals=[(d_train, 'train'), (d_test, 'test')],
    early_stopping_rounds=100,
    verbose_eval=0
)

# xgb.train returns the booster from the last round, not the best one, so a plain predict()
# would also use the rounds trained after the best score. Limit prediction to the rounds
# up to and including best_iteration.
best_rounds = (0, xgb_clf.best_iteration + 1)

# predict
y_pred = xgb_clf.predict(d_test, iteration_range=best_rounds)
# Compute and print metrics
print(f"AUC : {roc_auc_score(y_test, y_pred)}")

# Dimensionality Reduction

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# 1. First model - Lasso regressor with cross-validated alpha
from sklearn.linear_model import LassoCV

# Fit on the training split; fit() returns the fitted estimator itself
lcv = LassoCV().fit(X_train, y_train)
print('Optimal alpha = {0:.3f}'.format(lcv.alpha_))

# R squared on the hold-out split
r_squared = lcv.score(X_test, y_test)
print('The model explains {0:.1%} of the test set variance'.format(r_squared))

# True for every feature whose coefficient was not shrunk to zero
lcv_mask = lcv.coef_ != 0
print('{} features out of {} selected'.format(lcv_mask.sum(), len(lcv_mask)))

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Recursive feature elimination around a LogisticRegression, keeping 3 features
rfe = RFE(
    estimator=LogisticRegression(),
    n_features_to_select=3,
    verbose=1,
)

# Run the elimination on the training split
rfe.fit(X_train, y_train)

# Ranking per feature (high = dropped early on)
feature_ranking = dict(zip(X.columns, rfe.ranking_))
print(feature_ranking)

# Names of the features that survived the elimination
print(X.columns[rfe.support_])

# Accuracy of the reduced model on the hold-out split
acc = accuracy_score(y_test, rfe.predict(X_test))
print("{0:.1%} accuracy on test set.".format(acc)) 

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score

In [None]:
# Fit the random forest model to the training data
# rf = RandomForestClassifier(random_state=0)
# rf.fit(X_train, y_train)

In [None]:
# Calculate the accuracy
# acc = accuracy_score(y_test, rf.predict(X_test))

# # Print the importances per feature
# print(dict(zip(X.columns, rf.feature_importances_.round(2))))

# # Print accuracy
# print("{0:.1%} accuracy on test set.".format(acc))

In [None]:
# Create a mask for features importances above the threshold
# mask = rf.feature_importances_ > 0.01

# # Apply the mask to the feature dataset X
# reduced_X = X.loc[:, mask]

# # prints out the selected column names
# print(reduced_X.columns)

In [None]:
# from sklearn.feature_selection import RFE
# from sklearn.linear_model import LogisticRegression

# # Create the RFE with a LogisticRegression estimator and 3 features to select
# rfe = RFE(estimator=LogisticRegression(), n_features_to_select=3, verbose=1)

# # Fits the eliminator to the data
# rfe.fit(X_train, y_train)

# # Print the features and their ranking (high = dropped early on)
# print(dict(zip(X.columns, rfe.ranking_)))

# # Print the features that are not eliminated
# print(X.columns[rfe.support_])

# # Calculates the test set accuracy
# acc = accuracy_score(y_test, rfe.predict(X_test))
# print("{0:.1%} accuracy on test set.".format(acc)) 

# Make submission

In [None]:
def submission_sample(model, df_test, model_name,
                      sample_path='../input/tabular-playground-series-oct-2021/sample_submission.csv'):
    """Write a submission CSV from a model's predictions.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(df_test)``; its output is stored as the ``target`` column.
    df_test : object
        Test data in whatever form ``model.predict`` accepts.
    model_name : str
        Suffix for the output file, written as ``submission_<model_name>.csv`` in the
        current working directory.
    sample_path : str, optional
        Location of the sample-submission CSV used as the template. Defaults to the
        competition file, so existing three-argument calls behave as before.

    Returns
    -------
    None
        The result of ``DataFrame.to_csv`` when writing to a path.

    Notes
    -----
    Predictions are assigned by position, so they must be in the same row order as the
    template file.
    """
    sample = pd.read_csv(sample_path)
    sample['target'] = model.predict(df_test)
    return sample.to_csv(f'submission_{model_name}.csv', index=False)

In [None]:
# Baseline submission - original code versions
submission_sample(xgb_clf, xgd_test, 'xgb_base')