In [54]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Importing Libraries**

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import RepeatedStratifiedKFold

In [3]:
# importing SimpleImputer for handling missing value
from sklearn.impute import SimpleImputer
# importing MissingIndicator for handling missing value
from sklearn.impute import MissingIndicator
# importing StandardScaler for standardization
from sklearn.preprocessing import StandardScaler
# importing OneHotEncoder for encoding categorical variables
from sklearn.preprocessing import OneHotEncoder

# **Dataset Load**

In [4]:
# Read at most the first 100,000 rows of the training data CSV.
traindf = pd.read_csv(
    '../input/amex-default-prediction/train_data.csv',
    nrows=100000,
)

In [5]:
# Show the (rows, columns) shape of the training rows that were read.
print('shape of train dataset is : ', traindf.shape)

In [6]:
# Number of missing values in each training column.
traindf.isna().sum()

In [7]:
# Read the label file; it is joined to the training rows on customer_ID further down.
train_labeldf = pd.read_csv(
    '../input/amex-default-prediction/train_labels.csv',
)

In [8]:
# Read at most the first 100,000 rows of the test data, indexed by customer_ID.
# NOTE(review): because of the row limit only part of the test file is scored later — confirm
# this is intended for the final submission.
testdf = pd.read_csv(
    '../input/amex-default-prediction/test_data.csv',
    index_col='customer_ID',
    nrows=100000,
)

# **Data Preprocessing**

In [9]:
# Count the distinct customer_ID values among the training rows.
print("count of unique customers:",traindf.customer_ID.nunique())

In [11]:
# Attach the labels to every training row; the inner join keeps only customers present in both frames.
traindf = traindf.merge(train_labeldf, on="customer_ID", how="inner")

In [12]:
# Remove the customer_ID and S_2 columns from the training frame.
traindf = traindf.drop(columns=['customer_ID', 'S_2'])

In [14]:
# Shape of the training frame at this point (labels merged, customer_ID and S_2 dropped above).
print("shape of new data frame : ",traindf.shape)

In [15]:
# Remove the S_2 column from the test frame (customer_ID stays as the index).
testdf = testdf.drop(columns=['S_2'])

In [17]:
# Display the B_30 column (one of the columns cast to string further down).
traindf.loc[:, 'B_30']

In [18]:
# Print the dtype of every column in the training frame.
for col, col_dtype in traindf.dtypes.items():
    print('Type of the column {} is {}'.format(col, col_dtype))

In [19]:
# Missing-value count per column of the current training frame.
traindf.isna().sum()

In [22]:
#drop variables with missing values >=75% in the train dataframe
# Percentage of missing values per column, computed for the whole frame at once.
missing_pct = traindf.isnull().sum() / len(traindf) * 100
# Columns with at least 75% missing values are removed from the training frame.
sparse_cols = list(missing_pct[missing_pct >= 75].index)
for col in sparse_cols:
    print("Dropping column", col)
traindf = traindf.drop(columns=sparse_cols)
i = len(sparse_cols)

print("Total number of columns dropped in train dataframe", i)

In [23]:
# Keep the test features aligned with the training features.
# The drop list is taken from the training frame rather than recomputed from the test
# sample's own missing-value ratios: the model pipeline is fitted on the training columns,
# so a column dropped only from the test frame would be missing at prediction time.
cols_to_drop = [col for col in testdf.columns if col not in traindf.columns]
for col in cols_to_drop:
    print("Dropping column", col)
testdf = testdf.drop(columns=cols_to_drop)
i = len(cols_to_drop)

print("Total number of columns dropped in test dataframe", i)

In [24]:
# Columns the notebook treats as categorical by casting them to strings.
CATEGORICAL_CAST_COLUMNS = ["D_114", "D_116", "D_117", "D_120", "D_126", "D_68", "B_30", "B_38"]


def _cast_to_str_keep_missing(frame, columns):
    """Return a copy of ``frame`` with ``columns`` cast to strings, keeping NaN as NaN.

    A plain ``astype('str')`` turns missing values into the literal string ``'nan'``,
    which the categorical ``SimpleImputer(missing_values=np.nan)`` cannot recognise as
    missing. Here only the non-missing entries are converted.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame to convert; it is not modified.
    columns : list of str
        Column names to cast. Names absent from ``frame`` (for example because the
        column was dropped earlier) are skipped instead of raising ``KeyError``.

    Returns
    -------
    pd.DataFrame
        Converted copy of ``frame``.
    """
    result = frame.copy()
    for col in columns:
        if col not in result.columns:
            continue
        values = result[col]
        # Convert everything, then restore NaN wherever the original value was missing.
        result[col] = values.astype('str').where(values.notna(), np.nan)
    return result


traindf = _cast_to_str_keep_missing(traindf, CATEGORICAL_CAST_COLUMNS)
testdf = _cast_to_str_keep_missing(testdf, CATEGORICAL_CAST_COLUMNS)

In [25]:
# Separate the label column from the feature columns.
y = traindf.loc[:, 'target']
X = traindf.drop(columns=['target'])

In [26]:
# Shapes of the feature frame and the target series built from traindf.
print("Shape of X", X.shape)
print("Shape of y", y.shape)

In [27]:
# define categorical variables (columns)
categorical = list(X.select_dtypes('object').columns)
print(f"Categorical variables (columns) are: {categorical}")
# define numerical variables (columns)
numerical = list(X.select_dtypes('number').columns)
print(f"Numerical variables (columns) are: {numerical}")

# **Handle Categorical Variable (Column) & Handle Numerical Variable (Column)**

In [29]:
# define categorical pipeline
CategoricalPipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent', missing_values=np.nan)),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False)),
    ('scaler', StandardScaler())
])
print(CategoricalPipeline)
NumericalPipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent', missing_values=np.nan)),
    ('scaler', StandardScaler())
])
print(NumericalPipeline)

In [30]:
from sklearn.compose import ColumnTransformer

In [32]:
# Route each group of columns through its own preprocessing pipeline.
column_transformers = [
    ('cat', CategoricalPipeline, categorical),
    ('num', NumericalPipeline, numerical),
]
preprocess = ColumnTransformer(transformers=column_transformers)

In [33]:
# Hold out 25% of the rows for validation, keeping the class ratio of y in both parts.
# NOTE(review): the split is made row by row; if a customer contributes several rows, the same
# customer can appear on both sides of the split — confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

In [34]:
# Shapes of the four parts returned by train_test_split.
print("Shape of X_train", X_train.shape)
print("Shape of X_test", X_test.shape)
print("Shape of y_train", y_train.shape)
print("Shape of y_test", y_test.shape)

# **Model Building/Evaluation**

In [35]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Return the mean of the normalized weighted Gini and the top-4% capture rate.

    Parameters
    ----------
    y_true : pd.DataFrame
        Frame with a ``target`` column.
    y_pred : pd.DataFrame
        Frame with a ``prediction`` column. It is joined to ``y_true`` column-wise with
        ``pd.concat``, so rows are paired by index label, not by position.

    Returns
    -------
    float
        ``0.5 * (normalized weighted Gini + top-four-percent capture rate)``.
    """

    def _ranked_with_weights(labels: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        # Join labels and scores, order by descending score, and weight rows:
        # target == 0 rows count 20, every other row counts 1.
        ranked = pd.concat([labels, scores], axis='columns')
        ranked = ranked.sort_values('prediction', ascending=False)
        ranked['weight'] = ranked['target'].map(lambda t: 20 if t == 0 else 1)
        return ranked

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Share of all positives found among the highest-scored rows whose
        # cumulative weight stays within 4% of the total weight.
        ranked = _ranked_with_weights(y_true, y_pred)
        weight_budget = int(0.04 * ranked['weight'].sum())
        within_budget = ranked['weight'].cumsum() <= weight_budget
        is_positive = ranked['target'] == 1
        return (is_positive & within_budget).sum() / is_positive.sum()

    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Weighted sum of the gap between the cumulative share of positives
        # found and the cumulative share of weight.
        ranked = _ranked_with_weights(y_true, y_pred)
        weights = ranked['weight']
        random_share = (weights / weights.sum()).cumsum()
        weighted_positives = ranked['target'] * weights
        lorentz = weighted_positives.cumsum() / weighted_positives.sum()
        return ((lorentz - random_share) * weights).sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Scale by the Gini obtained when the labels themselves are used as the prediction.
        perfect_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, perfect_pred)

    gini_part = normalized_weighted_gini(y_true, y_pred)
    capture_part = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (gini_part + capture_part)

In [36]:
def model_score(model_name):
    """Fit the global ``pipe`` on the training split and print its scores.

    Prints ``pipe.score`` on the training and validation splits, followed by the
    Amex metric on both splits.

    Parameters
    ----------
    model_name : str
        Label used only in the printed header; the estimator that is fitted is
        whatever the global ``pipe`` currently holds.

    Notes
    -----
    Relies on the notebook globals ``pipe``, ``X_train``, ``X_test``, ``y_train``,
    ``y_test`` and on ``amex_metric``.
    """
    print("Training and Evalution using", model_name)
    pipe.fit(X_train, y_train)
    print("model training score: %.3f" % pipe.score(X_train, y_train))
    print("model validation score: %.3f" % pipe.score(X_test, y_test))

    def _amex(features, labels):
        # amex_metric joins its two frames on the index. The labels still carry the
        # shuffled index from train_test_split while fresh predictions are numbered
        # 0..n-1, so both frames are put on the same positional index before scoring.
        truth = pd.DataFrame(labels).reset_index(drop=True)
        # The metric ranks rows by prediction, so score with the probability of the
        # second class (assumed to be target == 1) instead of hard 0/1 labels.
        scores = pd.DataFrame({'prediction': pipe.predict_proba(features)[:, 1]})
        return amex_metric(truth, scores)

    print("Amex Evaluation Metric - Training: %.3f" % _amex(X_train, y_train))
    print("Amex Evaluation Metric - Validation: %.3f" % _amex(X_test, y_test))

In [53]:
from sklearn.model_selection import cross_validate

In [37]:
def model_cross_validation_score(model_name):
    """Cross-validate the global ``pipe`` on ``X``/``y`` and print mean scores.

    Uses a repeated stratified 3-fold split (2 repeats, seed 42) and reports the
    mean accuracy, precision, recall, f1 and ROC AUC for the training and
    validation folds.

    Parameters
    ----------
    model_name : str
        Label used only in the printed header.
    """
    print("Training and Evaluation with Cross Validation using",model_name)
    # classification metrics computed for every fold
    metric_names = ['accuracy', 'precision', 'recall','f1','roc_auc']
    # repeated stratified folds act as the cross validator
    splitter = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=42)
    # request train as well as validation scores for each fold
    results = cross_validate(
        pipe,
        X,
        y,
        scoring=metric_names,
        cv=splitter,
        n_jobs=-1,
        return_train_score=True,
        return_estimator=True,
    )

    def _fold_means(prefix):
        # Mean over all folds of each metric, in the order listed in metric_names.
        return [np.mean(results[prefix + '_' + metric]) for metric in metric_names]

    print('Training Score: Accuracy: {:.2f}, Precision: {:.2f}, Recall: {:.2f},f1-score: {:.2f}, ROC AUC: {:.2f}'.format(*_fold_means('train')))
    print('Validation Score: Accuracy: {:.2f}, Precision: {:.2f}, Recall: {:.2f},f1-score: {:.2f}, ROC AUC: {:.2f}'.format(*_fold_means('test')))

In [38]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

In [39]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler 
from sklearn.feature_selection import SelectFromModel 

In [44]:
# Full modelling pipeline: preprocessing, resampling, feature selection,
# dimensionality reduction, and the final estimator (replaced later via set_params).
steps = [
        ('preprocess', preprocess),
        ('over_sampler',SMOTE(random_state = 42)),
        # Seeded like every other stochastic step so that repeated runs give the same result.
        ('under_sampler',RandomUnderSampler(random_state = 42)),
        ('feature_selection', SelectFromModel(RandomForestClassifier(n_estimators = 10, random_state = 42, n_jobs = -1))),
        ('dimension_reduction', PCA(n_components='mle',random_state = 42)),
        ('model_estimator', RandomForestClassifier(random_state = 42))
    ]
# verbose=True makes the pipeline report each step as it is fitted.
pipe = Pipeline(steps, verbose=True)

In [48]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

**Using RandomForestClassifier**

In [45]:
# Fit the pipeline and print its scores. The string is only a display label:
# model_score fits whatever estimator the global `pipe` currently holds.
model_score("RandomForestClassifier")

In [55]:
# Cross-validate the global `pipe` on the full X, y and print mean train/validation metrics.
# NOTE(review): the label is only printed; the estimator evaluated is whatever `pipe`
# holds when this cell runs — confirm the cell order matches the heading.
model_cross_validation_score("RandomForestClassifier")

**Using XGBClassifier**

In [46]:
# Replace the final pipeline step with an XGBClassifier built with default arguments;
# the earlier steps are left as configured.
pipe.set_params(model_estimator=XGBClassifier())
# model_score refits the global `pipe` and prints its training and validation scores
model_score("XGBClassifier")

In [56]:
# Cross-validate the global `pipe` on the full X, y; the string is only a display label.
model_cross_validation_score("XGBClassifier")

**Using LGBMClassifier**

In [57]:
# Replace the final pipeline step with an LGBMClassifier built with default arguments;
# the earlier steps are left as configured.
pipe.set_params(model_estimator=LGBMClassifier())
# model_score refits the global `pipe` and prints its training and validation scores
model_score("LGBMClassifier")

In [58]:
# Cross-validate the global `pipe` on the full X, y; the string is only a display label.
model_cross_validation_score("LGBMClassifier")

# **Prediction/Sample Submission file**

In [59]:
# Move customer_ID from the index back into a regular column so it can be grouped on.
testdf_new = testdf.reset_index(drop=False)

In [60]:
# Keep only the last row of each customer (in the frame's existing row order).
X_test_predict = testdf_new.groupby(by='customer_ID').tail(n=1)

In [61]:
# (rows, columns) of the per-customer frame; one row per customer_ID after tail(1).
X_test_predict.shape

In [62]:
# Use customer_ID as the index again so that only feature columns remain.
X_test_predict = X_test_predict.set_index('customer_ID')

In [63]:
# Score each customer with the probability of the second class (assumed to be target == 1).
# The value is written to a column named 'prediction', which the notebook's metric ranks,
# so a continuous score is used instead of hard 0/1 labels.
y_test_pred = pipe.predict_proba(X_test_predict)[:, 1]

In [64]:
# Build the submission table (one row per customer) and write it to submission.csv.
submission_columns = {
    'customer_ID': X_test_predict.index,
    'prediction': y_test_pred,
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', header=True, index=False)