# Tabular Playground Series - Nov 2021

In [None]:
# import required libraries
import numpy as np
import pandas as pd

# visualizations 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, plot_roc_curve

In [None]:
# raise the column display cap to 999 so wide frames are shown without truncation
pd.set_option('display.max_columns', 999)

In [None]:
# ignore warnings: install a catch-all filter so no warning is printed from here on
import warnings
warnings.simplefilter('ignore')

## Import both train and test data

In [None]:
# read the competition's train and test CSV files, in that order
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-nov-2021/train.csv',
        '../input/tabular-playground-series-nov-2021/test.csv',
    )
)

In [None]:
# preview the first rows of the training frame
train_data.head()

In [None]:
# preview the first rows of the test frame
test_data.head()

In [None]:
# (rows, columns) of both frames first ...
for frame in (train_data, test_data):
    print(frame.shape)
# ... then the per-column summaries; info() prints its own report and returns
# None, so each print() call also emits a trailing "None"
for frame in (train_data, test_data):
    print(frame.info())

In [None]:
def reduce_memory_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Integer columns are narrowed to int8/int16/int32 and float columns to
    float16/float32 when the column's minimum and maximum both lie inside the
    target dtype's range (limits included). A column is never widened, and
    columns whose dtype is not one of the listed numeric dtypes are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are replaced in place.
    verbose : bool, default True
        When true, print the resulting memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Only the value *range* is checked for floats. Values stored as
    float16/float32 are rounded to what those formats can represent.
    """
    numerics = ("int8", "int16", "int32", "int64", "float16", "float32", "float64")
    # candidate target dtypes, narrowest first
    int_candidates = (np.int8, np.int16, np.int32)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if str(col_type) not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == "int":
            candidates, limits = int_candidates, np.iinfo
        else:
            candidates, limits = float_candidates, np.finfo

        for candidate in candidates:
            # stop once the candidates are no narrower than the current dtype
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            bounds = limits(candidate)
            # inclusive comparison: values equal to the dtype's limits still fit.
            # NaN min/max (empty or all-NaN column) fail every comparison, so
            # such a column keeps its dtype.
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # guard the percentage against a zero-byte starting footprint
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

In [None]:
# downcast both frames (train first, then test); each call prints its own report
train_data, test_data = (
    reduce_memory_usage(frame) for frame in (train_data, test_data)
)

### Let's check train data

In [None]:
# percentage of rows in each target class
train_data['target'].value_counts(normalize=True).mul(100)

**The dataset is balanced.**

In [None]:
# missing-value count per column, largest first
train_data.isnull().sum().sort_values(ascending=False)

- There are no null values in the training data.

In [None]:
# summary statistics for every column, regardless of dtype
train_data.describe(include='all')

## Train test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# label column, and every remaining column except the row id as features
y = train_data['target']
X = train_data.drop(columns=['id', 'target'])

# hold out 30% of the rows; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# feature / label shapes of the training split, then of the held-out split
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, labels.shape)

## Feature Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

# all feature columns are scaled
num_cols = X_train.columns

# learn each column's min/max from the training split only ...
scaler = MinMaxScaler().fit(X_train[num_cols])

# ... and apply that same mapping to both splits
X_train[num_cols] = scaler.transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

In [None]:
# summary statistics of the scaled training features
X_train.describe()

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# n_jobs is not passed: scikit-learn ignores it for the 'liblinear' solver
logreg = LogisticRegression(random_state=42, solver='liblinear')

logreg.fit(X_train, y_train)
# mean accuracy on the training split; the method has to be called, since a
# bare `logreg.score` only displays the bound method object
logreg.score(X_train, y_train)

In [None]:
# evaluating function

def evaluation_final(mod, x_test, y_test, y_test_pred):
    """Print classification metrics for a fitted binary classifier and plot its ROC curve.

    Parameters
    ----------
    mod : fitted estimator
        Must expose ``predict_proba``; column 1 of its output is used as the
        positive-class score.
    x_test : array-like
        Features to evaluate on.
    y_test : array-like
        True labels for ``x_test``.
    y_test_pred : array-like
        Hard class predictions for ``x_test``.

    Returns
    -------
    None
        The report is printed and the ROC curve is shown.
    """
    print('Evaluation Report on Test set:')
    print(confusion_matrix(y_test, y_test_pred))
    print(classification_report(y_test, y_test_pred))
    print('Accuracy of Test data:', accuracy_score(y_test, y_test_pred))

    # score the positive class once and reuse it for both the curve and the AUC
    positive_proba = mod.predict_proba(x_test)[:, 1]
    auc_value = roc_auc_score(y_test, positive_proba)

    # plot roc_curve for test
    # built from metrics.roc_curve instead of plot_roc_curve, which scikit-learn
    # deprecated in 1.0 and removed in 1.2
    fpr, tpr, _ = metrics.roc_curve(y_test, positive_proba)
    plt.figure()
    plt.plot(fpr, tpr, label='{} (AUC = {:.2f})'.format(type(mod).__name__, auc_value))
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc='lower right')
    plt.show()
    # roc_auc_score
    print('roc_auc_score:', auc_value)


In [None]:
# hard class predictions of the logistic-regression model on the held-out split
y_test_pred_log = logreg.predict(X_test)
# print the metric report and show the ROC curve for those predictions
evaluation_final(logreg, X_test, y_test, y_test_pred_log)

## PCA

In [None]:
# from sklearn.decomposition import PCA
# pca = PCA()

# # fit X_train
# pca.fit(X_train)

In [None]:
# # plot variance explained ratio
# plt.figure(figsize=[8,6])
# plt.bar(range(1,len(pca.explained_variance_ratio_)+1), pca.explained_variance_ratio_)
# plt.show()

In [None]:
# var_cumu = np.cumsum(pca.explained_variance_ratio_)

# # Making a scree plot
# fig = plt.figure(figsize=[12,7])
# plt.plot(var_cumu)
# plt.xlabel('no of principal components')
# plt.ylabel('explained variance - cumulative')
# plt.show()

In [None]:
# np.cumsum(np.round((pca.explained_variance_ratio_*100),2))

**98% of variance is explained by 60 components**

In [None]:
# pca_60 = PCA(n_components=60)

# # fit_transform
# X_train_60 = pca_60.fit_transform(X_train)
# print("Shape of X_train_60:", X_train_60.shape)

# X_test_60 = pca_60.transform(X_test)  # reuse the PCA fitted on X_train; do not refit on the test split
# print("Shape of X_test_60:", X_test_60.shape)

## XGBoost

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

### Along with Hyperparameter Tuning

In [None]:
# # function for getting optimum hyperparameter tuning 
# def tune_hyperparameter(parameters,X_train,y_train,n_folds = 5):
    
#     xgb_model = XGBClassifier(random_state=42, n_jobs=-1, tree_method='gpu_hist')
    
#     xgb_model_cv = GridSearchCV(estimator=xgb_model,
#                                        param_grid=parameters,
#                                        n_jobs=-1,
#                                        cv=n_folds,
#                                        scoring='roc_auc',
#                                        verbose=1,
#                                        refit=True)


#     xgb_model_cv.fit(X_train, y_train)
#     scores = xgb_model_cv.cv_results_

#     for key in parameters.keys():
#         hyperparameters = key
#         break

#     # plotting accuracies for parameters
#     plt.figure(figsize=(16,5))
#     # plt.plot(scores["param_"+hyperparameters], scores["mean_train_score"], label="training accuracy")
#     plt.plot(scores["param_"+hyperparameters], scores["mean_test_score"], label="test accuracy")
#     plt.xlabel(hyperparameters)
#     plt.ylabel("ROC_AUC_SCORE")
#     plt.legend()
#     plt.show()

In [None]:
# # learning_rate
# params = {'learning_rate': [0.05, 0.1, 0.2, 0.3, 0.5]}
# tune_hyperparameter(parameters=params, X_train=X_train_60, y_train=y_train)

The ROC AUC score is highest at learning_rate=0.2.

In [None]:
# learning_rate=0.2

In [None]:
# # fit the model with n_estimators parameters
# params = {'n_estimators': [300, 500, 800, 1000, 1300]}
# tune_hyperparameter(parameters=params, X_train=X_train_60, y_train=y_train)

The ROC AUC score is higher at lower values of n_estimators.

In [None]:
# # fit the model with n_estimators parameters again
# params = {'n_estimators': [50, 100, 150, 200]}
# tune_hyperparameter(parameters=params, X_train=X_train_60, y_train=y_train)

**Let's take n_estimators=100 for the final model**

**Building an Optimized model with new Hyperparameters**

In [None]:
# base estimator for the search; 'gpu_hist' selects XGBoost's GPU histogram tree builder
xgb_model = XGBClassifier(random_state=42, n_jobs=-1, tree_method='gpu_hist')

# 6 x 4 x 4 = 96 candidate combinations.
# learning_rate uses 0.15, the step between 0.1 and 0.2; XGBoost documents the
# valid range of learning_rate (eta) as [0, 1].
parameters = {'learning_rate': [0.025, 0.05, 0.1, 0.15, 0.2, 0.25],
              'n_estimators': [100, 200, 400, 500],
              'min_child_weight': [50, 100, 200, 300]}
#              'min_sample_leaf': [10, 30, 50]}
#              'max_depth':[20, 40, 50, 60]}

# exhaustive 4-fold search scored by ROC AUC; refit=True retrains the best
# combination on the full data passed to fit()
xgb_model_cv = GridSearchCV(estimator=xgb_model,
                            param_grid=parameters,
                            n_jobs=-1,
                            cv=4,
                            scoring='roc_auc',
                            verbose=1,
                            refit=True)

In [None]:
%%time
# run the grid search on the scaled training split (the cell magic above times the whole cell)
xgb_model_cv.fit(X_train, y_train)

In [None]:
# plot roc_auc scores: one mean cross-validated score per candidate in cv_results_
mean_auc = xgb_model_cv.cv_results_['mean_test_score']
candidate_no = range(1, len(mean_auc) + 1)

plt.figure(figsize=[10,6])
plt.plot(candidate_no, mean_auc)
plt.xlabel('Fits', fontsize=12)
plt.ylabel('ROC_AUC Score', fontsize=12)
plt.show()

In [None]:
# estimator refitted by the grid search with the best-scoring parameter combination
final_best_xgb_model = xgb_model_cv.best_estimator_
# display its configuration
final_best_xgb_model

In [None]:
# hard class predictions of the tuned XGBoost model on the held-out split
y_actual_pred_xgb = final_best_xgb_model.predict(X_test)
# print the metric report and show the ROC curve for those predictions
evaluation_final(final_best_xgb_model, X_test, y_test, y_actual_pred_xgb)

## Prediction on Actual Test Data

In [None]:
# start the submission frame from the test ids; the prediction column is added later
sub_df = test_data[['id']].copy()
sub_df.head()

In [None]:
# perform scaling on everything except the id column
new_test_data = test_data.drop(columns='id')
new_test_num_cols = new_test_data.columns

# transform only: reuse the scaler fitted earlier rather than refitting it here
new_test_data[new_test_num_cols] = scaler.transform(
    new_test_data[new_test_num_cols]
)
new_test_data.describe()

### PCA on Actual Test Data

In [None]:
# # fit_transform PCA
# new_test_data_60 = pca_60.transform(new_test_data)  # reuse the PCA fitted on X_train; do not refit here
# print(new_test_data_60.shape)

In [None]:
# class probabilities for the scaled test rows; column 1 is kept as the submitted score
class_proba = final_best_xgb_model.predict_proba(new_test_data)
y_actual_test_pred_final_proba = class_proba[:, 1]

# attach the scores to the submission frame and preview it
sub_df['target'] = y_actual_test_pred_final_proba
sub_df.head()

In [None]:
# submission: write the id/target frame to submission.csv without the index column
sub_df.to_csv('submission.csv', index=False)
# confirmation message once the file has been written
print('Output file generated!!')