In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory lazily and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<div style="background-color:#3197BB; text-align:center; vertical-align: middle; padding:10px 0; margin-top:4px">
<center><h1>Default prediction study on American Express data</h1></center>
<h3>Description of column values</h3></div>
<h3>
D_* = Delinquency variables<br>
S_* = Spend variables<br>
P_* = Payment variables<br>
B_* = Balance variables<br>
R_* = Risk variables</h3>


## Import libraries

In [None]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
import seaborn as sns
from sklearn.compose import make_column_transformer


from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split

from sklearn.metrics import r2_score, classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve, precision_recall_curve, average_precision_score

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score


import warnings
warnings.simplefilter('ignore')
import gc
import subprocess


In [None]:
# Load the training set from the parquet file under the relative Kaggle input directory.
train_data= pd.read_parquet("../input/amex-parquet/train_data.parquet")

## Loading the data and reducing memory usage

In [None]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to shrink its memory footprint.

    * object / string columns are converted to ``category``;
    * integer columns are cast to the narrowest signed integer type that holds
      their observed range (bounds are inclusive);
    * float columns are cast to the narrowest float type whose range covers the
      observed min/max;
    * every other dtype (datetime, bool, existing categoricals, nullable
      extension dtypes, ...) is left untouched, as are empty / all-NaN columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.
    use_float16 : bool, default True
        Allow ``float16`` as a target type. The check is on value *range* only:
        float16 keeps roughly three significant decimal digits, so pass
        ``False`` when that precision loss is not acceptable.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float32, np.float64)
    if use_float16:
        float_types = (np.float16,) + float_types

    for col in df.columns:
        col_type = df[col].dtype

        if pd.api.types.is_object_dtype(col_type) or pd.api.types.is_string_dtype(col_type):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns are range-checked. Comparing the
        # min/max of e.g. a datetime column against float limits would raise, and
        # casting bool/unsigned columns to float16 would waste memory or lose data.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isna(c_min) or pd.isna(c_max):
            # Empty or all-NaN column: there is no range to base a decision on.
            continue

        if col_type.kind in 'iu':
            # Plain Python ints sidestep signed/unsigned comparison pitfalls.
            c_min, c_max = int(c_min), int(c_max)
            candidates = [(t, np.iinfo(t)) for t in int_types]
        else:
            candidates = [(t, np.finfo(t)) for t in float_types]

        for target_type, limits in candidates:
            if limits.min <= c_min and c_max <= limits.max:
                # Never widen (or merely re-label) a column that is already compact.
                if np.dtype(target_type).itemsize < col_type.itemsize:
                    df[col] = df[col].astype(target_type)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(reduction))

    return df



In [None]:
# Downcast the training frame column by column; the helper prints before/after memory usage.
train_data= reduce_mem_usage(train_data)

In [None]:
# Run a garbage-collection pass after the column-by-column dtype conversion above.
gc.collect()

In [None]:
# Summarise the columns, dtypes and memory footprint of the downcast frame.
train_data.info()

### Manage column types and NaN values

In [None]:
############### Column renames and dtype adjustments ###########################
# 'S_2' is renamed to 'Date' and parsed as a datetime.
train_data = train_data.rename(columns={'S_2': 'Date'})
train_data['Date'] = pd.to_datetime(train_data['Date'])
# 'B_31' becomes float16 and 'target' becomes categorical.
for column, dtype in (('B_31', 'float16'), ('target', 'category')):
    train_data[column] = train_data[column].astype(dtype)

In [None]:
########################## Handle missing values: categorical columns ##########################
# Columns treated as categorical features throughout the notebook.
categorical_columns = [
    'B_30', 'B_38',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
    'D_63', 'D_64', 'D_66', 'D_68',
]
train_data[categorical_columns] = train_data[categorical_columns].astype('category')
# Fill gaps with each column's most frequent value; `imp` stays available as the fitted imputer.
imp = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
imp = imp.fit(train_data[categorical_columns])
imputed_categoricals = imp.transform(train_data[categorical_columns])
train_data[categorical_columns] = imputed_categoricals
print('Categorical missing values done')


In [None]:
# Run a garbage-collection pass after the categorical imputation step.
gc.collect()

In [None]:
# Every float16 column is treated as a numeric feature; missing entries are replaced by 0.
numeric = ['float16']
numerical_columns = list(train_data.select_dtypes(include=numeric).columns)
# Iterate over the column names directly rather than over a sliced copy of the frame.
for numeric_col in numerical_columns:
    train_data[numeric_col] = train_data[numeric_col].fillna(0)
print('numerical values missing done')

In [None]:
# Run a garbage-collection pass after the numeric fill step.
gc.collect()

## A few other changes

In [None]:
#########################################################################################
########### keep each customer's most recent statement, ordered by date ###############
# Rank rows by Date *before* picking one per customer, so "last" means the most recent
# statement instead of whichever row happens to come last in the file. A stable sort
# keeps file order on ties. GroupBy.tail is used instead of GroupBy.nth because nth
# changed from a reducer to a filter in pandas 2.0, which alters the result's shape and
# index between versions.
# Assumes train_data has a unique index (true for the default RangeIndex).
latest_rows = (
    train_data[['customer_ID', 'Date']]
    .sort_values('Date', kind='mergesort')
    .groupby('customer_ID', observed=True)
    .tail(1)
    .index
)
train_data = train_data.loc[latest_rows].reset_index(drop=True)
train_data = train_data.sort_values(by='Date', ascending=True)

In [None]:
# Preview the first rows of the reduced, date-sorted frame.
train_data.head()

In [None]:
# Run a garbage-collection pass now that the frame has been replaced by its per-customer reduction.
gc.collect()

In [None]:
# Show the distinct values present in each categorical column.
for column_name in categorical_columns:
    distinct_values = train_data[column_name].unique()
    print(column_name, " : ", distinct_values)

## About the target

In [None]:
############################## About the target ####################################

# Absolute class counts first, then each class's share of all rows.
target_column = train_data['target']
val_target = target_column.value_counts()
print("target distribution: \n", val_target)
ratio_target = val_target / len(target_column)
non_default_rate = round(ratio_target[0], 2)
default_rate = round(ratio_target[1], 2)
print(f"Rate of non default values:  {non_default_rate}\n Rate of default values: {default_rate}")

In [None]:
# Pie chart of the class balance.
count_df = train_data['target'].value_counts()
# Build the wedge labels from the counted classes so they cannot get out of step with
# the value_counts ordering (which is by frequency, not by label).
list_value = [bool(int(label)) for label in count_df.index]
# One explicit figure/axes pair: the previous plt.figure(...) call left an empty extra
# figure behind, because Series.plot(figsize=...) opens a figure of its own.
fig, ax = plt.subplots(figsize=(12, 12))
count_df.plot(kind='pie', ax=ax, labels=list_value, autopct='%1.1f%%', cmap="Set2", fontsize=14, legend=False)
ax.set_title("Target values")
plt.show()

## Evaluate baseline model performance

In [None]:
#############################################################################
# Build the feature matrix and the target vector.

# drop=True discards the old index instead of inserting it as a stray 'index' column,
# and errors='ignore' lets the cell be re-run after 'Date' has already been removed.
train_data = train_data.reset_index(drop=True)
train_data = train_data.drop(columns='Date', errors='ignore')
X = train_data[numerical_columns + categorical_columns]
y = train_data['target']

In [None]:

################################## Train / test split ##################################
# Stratified 70/30 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

In [None]:
from sklearn.compose import make_column_selector
# dtype_include / dtype_exclude take dtypes, not column names: passing the column lists
# would make the selector raise as soon as it is called on a frame.
# Numeric features are the numeric-dtype columns; everything else is categorical.
numerical_features = make_column_selector(dtype_include=np.number)
categorical_features = make_column_selector(dtype_exclude=np.number)

In [None]:

# Preprocessing: standardise numeric features, one-hot encode categorical ones.
numerical_pipeline = make_pipeline(StandardScaler())
# handle_unknown='ignore' keeps predict() from raising if the evaluation split contains
# a category level that was absent from the training split.
categorical_pipeline = make_pipeline(OneHotEncoder(handle_unknown='ignore'))

preprocessor = make_column_transformer((numerical_pipeline, numerical_columns),
                                       (categorical_pipeline, categorical_columns))
# SGD shuffles the data each epoch; a fixed seed (the same one used for the split)
# makes the fitted model and every metric below reproducible across runs.
model = make_pipeline(preprocessor, SGDClassifier(max_iter=1000, tol=1e-3, random_state=0))
model.fit(X_train, y_train)

In [None]:
# Score of the fitted pipeline on the *training* split, i.e. a measure of fit rather than of generalisation.
# NOTE(review): X_test / y_test may have been intended here — confirm.
model.score(X_train, y_train)

In [None]:
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement and draws the same plot.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(model, X_test, y_test)

In [None]:
# Class predictions of the fitted pipeline on the held-out split.
y_predicted = model.predict(X_test)

In [None]:
# Text classification report followed by the raw confusion matrix for the held-out split.
report_text = classification_report(y_test, y_predicted)
print('Classification report:\n', report_text)
conf_mat = confusion_matrix(y_pred=y_predicted, y_true=y_test)
print('Confusion matrix:\n', conf_mat)

In [None]:
# Feature matrix for the logistic regression: all categorical and numeric columns
# except D_63 and D_64 (presumably excluded because they are not numeric — confirm).
feature_columns = categorical_columns + numerical_columns
X = train_data[feature_columns].drop(columns=['D_63', 'D_64'])
y = train_data['target']

In [None]:
# Same stratified 70/30 split (seed 0) as before, applied to the new feature matrix.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

## The ROC curve

In [None]:
# "No skill" baseline: a constant score of 0 for every test row.
ns_probs = [0] * len(y_test)

# The default max_iter=100 is often too low for lbfgs on a wide, unscaled feature
# matrix, and convergence warnings are globally silenced in this notebook, so an
# under-fitted model would go unnoticed. Give the solver more room to converge.
model = LogisticRegression(solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)

In [None]:
# Second column of predict_proba, used as the logistic-regression score.
lr_probs = model.predict_proba(X_test)[:, 1]
# ROC AUC of the constant baseline ("Fault") and of the logistic regression ("True").
ns_auc = roc_auc_score(y_test, ns_probs)
lr_auc = roc_auc_score(y_test, lr_probs)
print('Fault: ROC AUC=%.3f' % ns_auc)
print('True: ROC AUC=%.3f' % lr_auc)

In [None]:
# ROC curves: constant baseline ("Fault") against the logistic regression ("True").
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)

fig, ax = plt.subplots()
ax.plot(ns_fpr, ns_tpr, linestyle='--', label='Fault')
ax.plot(lr_fpr, lr_tpr, marker='.', label='True')
# Axis labels and legend
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend()
# Render the figure
plt.show()

In [None]:
##############################################################################