# Installation + Import

In [1]:
!python3 -m pip install --upgrade pip

Defaulting to user installation because normal site-packages is not writeable


In [2]:
!python3 -m pip install numpy
!python3 -m pip install pandas
!python3 -m pip install scikit-learn==1.0.2 #version copatible with imblearn library
!python3 -m pip install xgboost
!python3 -m pip install imblearn

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn==1.0.2
  Downloading scikit_learn-1.0.2-cp38-cp38-macosx_10_13_x86_64.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m524.2 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.3.0
    Uninstalling scikit-learn-1.3.0:
      Successfully uninstalled scikit-learn-1.3.0
Successfully installed scikit-learn-1.0.2
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc

# Data Preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

# Model
import xgboost as xgb 
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE 

# Training Data Preprocessing
Read in the training data and perform preprocessing

In [None]:
# Load the training data from its feather file.
Train_df = pd.read_feather(path='amexfeather/train_data.ftr')

In [None]:
# Keep one row per customer: the last row of each customer_ID group, in the
# frame's existing row order.
# NOTE(review): presumably rows are ordered chronologically per customer, so
# this is the most recent record - confirm against the data.
Train_df = (
    Train_df
    .groupby('customer_ID')
    .tail(n=1)
)

## Dropping Null Columns
Drop all columns with null ratio greater than 0.7

In [None]:
# Fraction of missing values per column, listed from most to least sparse.
null_counts = Train_df.isna().sum()
Null_Check = pd.DataFrame(
    {
        'Columns': null_counts.index,
        'Null Ratio': null_counts.to_numpy() / len(Train_df),
    }
)
Null_Check = Null_Check.sort_values(by='Null Ratio', ascending=False)

In [None]:
# For each threshold 0.0, 0.1, ..., 1.0 report how many columns have a null
# ratio strictly above it.
for threshold in np.round(np.linspace(0, 1, 11), 1):
    columns_above = (Null_Check['Null Ratio'] > threshold).sum()
    print(threshold, columns_above)

# Columns whose null ratio exceeds 0.7 are marked for removal.
too_sparse = Null_Check['Null Ratio'] > 0.7
Drop_Columns = Null_Check.loc[too_sparse, 'Columns']

In [None]:
# Remove the sparse columns selected above from the training frame.
Train_df = Train_df.drop(labels=Drop_Columns, axis='columns')

## PCA
Perform PCA on data with number of components equal to 6

In [None]:
# Prepare for PCA: keep the identifier and label aside; the 6 principal
# components computed below are appended to them.
Master_df = Train_df[['customer_ID', 'target']].reset_index(drop=True)

# Categorical features: impute missing values, then one-hot encode.
PCA_Cat = Train_df.select_dtypes(include='category').reset_index(drop=True)

for col in PCA_Cat.columns:
    # A median (quantile 0.5) is not meaningful for a category column, and
    # fillna(..., inplace=True) on a column selection may not write back to the
    # frame. Impute with the most frequent category (always a valid category)
    # and assign the result back explicitly.
    most_frequent = PCA_Cat[col].mode(dropna=True)
    if not most_frequent.empty:  # column with no observed value: nothing to impute with
        PCA_Cat[col] = PCA_Cat[col].fillna(most_frequent.iloc[0])

PCA_Cat = pd.get_dummies(PCA_Cat, drop_first=True)

# Numeric features: only float16 columns are selected. They are widened to
# float64 before taking means, then missing values are replaced by the column mean.
PCA_Numeric = (
    Train_df
    .select_dtypes(include=['float16'])
    .reset_index(drop=True)
    .astype('float64')
)
PCA_Numeric = PCA_Numeric.fillna(PCA_Numeric.mean())

# Standardize so every numeric feature has zero mean and unit variance.
PCA_Numeric = pd.DataFrame(StandardScaler().fit_transform(PCA_Numeric),
                           columns=PCA_Numeric.columns)

# Concat encoded categorical and standardized numeric features side by side.
PCA_df = pd.concat([PCA_Cat, PCA_Numeric], axis=1)

# PCA: PCA_Model stays in scope after this cell, so the fitted projection
# remains available to later cells.
PCA_Model = PCA(n_components=6, random_state=0)
Temp = pd.DataFrame(PCA_Model.fit_transform(PCA_df))
Master_df = pd.concat([Master_df.iloc[:, :2], Temp], axis=1)

## SMOTE Oversampling
Resample so that the counts of target=0 and target=1 are the same 

In [None]:
# Oversample with SMOTE (fixed seed). Features are every column after the first
# two of Master_df (customer_ID, target); the label is the target column.
sm = SMOTE(random_state=42)
X, y = sm.fit_resample(
    X=Master_df.iloc[:, 2:],
    y=Master_df['target'],
)

# Amex Competition Metric
Define the method that the competition uses to score predictions

In [None]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Score predictions as the mean of two rank-based measures.

    Args:
        y_true: Frame with a ``target`` column (rows with target 0 get weight 20,
            all other rows weight 1).
        y_pred: Frame with a ``prediction`` column, joined column-wise to ``y_true``.

    Returns:
        ``0.5 * (G + D)`` where ``G`` is the normalized weighted Gini and ``D`` is
        the share of positives captured within the top 4% of total weight.
    """

    def _ranked(truth: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        # Join labels with scores, order by descending score, attach row weights.
        ranked = pd.concat([truth, scores], axis='columns')
        ranked = ranked.sort_values('prediction', ascending=False)
        ranked['weight'] = ranked['target'].map(lambda label: 20 if label == 0 else 1)
        return ranked

    def _captured_in_top_four_percent(truth: pd.DataFrame, scores: pd.DataFrame) -> float:
        # Share of all positives found before cumulative weight passes 4% of the total.
        ranked = _ranked(truth, scores)
        cutoff = int(0.04 * ranked['weight'].sum())
        inside_cutoff = ranked['weight'].cumsum() <= cutoff
        is_positive = ranked['target'] == 1
        return (inside_cutoff & is_positive).sum() / is_positive.sum()

    def _weighted_gini(truth: pd.DataFrame, scores: pd.DataFrame) -> float:
        # Weighted sum of the gap between the cumulative-positives curve and the
        # cumulative-weight ("random") curve.
        ranked = _ranked(truth, scores)
        weights = ranked['weight']
        weighted_positives = ranked['target'] * weights
        random_curve = (weights / weights.sum()).cumsum()
        lorentz_curve = weighted_positives.cumsum() / weighted_positives.sum()
        return ((lorentz_curve - random_curve) * weights).sum()

    def _normalized_weighted_gini(truth: pd.DataFrame, scores: pd.DataFrame) -> float:
        # Divide by the Gini obtained when the labels themselves are used as scores.
        labels_as_scores = truth.rename(columns={'target': 'prediction'})
        return _weighted_gini(truth, scores) / _weighted_gini(truth, labels_as_scores)

    gini_part = _normalized_weighted_gini(y_true, y_pred)
    capture_part = _captured_in_top_four_percent(y_true, y_pred)

    return 0.5 * (gini_part + capture_part)

# Model
Define the XGB model using the best parameters determined by the EDA notebook and fit it to the training data

In [None]:
# Hyper-parameters passed to the classifier unchanged.
# NOTE(review): 'gpu_hist' presumably requires a GPU-enabled XGBoost build, and
# 'rmsle' looks like a regression-style metric for a binary objective - confirm
# both are intended.
best_parameters = {
    'max_depth': 12,
    'min_child_weight': 7,
    'eta': 0.1,
    'objective': 'binary:logistic',
    'tree_method': 'gpu_hist',
    'eval_metric': 'rmsle',
}

# Build the classifier, then fit it on the resampled features and labels.
classifier = xgb.XGBClassifier(verbosity=1, n_jobs=-1, **best_parameters)
XGB_Model = classifier.fit(X, y)
        


In [None]:
# Release the training-stage frames before the test data is loaded, then force
# a garbage-collection pass.
del Train_df
del PCA_Numeric, PCA_Cat, PCA_df
del Master_df, Temp
gc.collect()

# Test Data Preprocessing
Read in the test data and perform preprocessing

In [None]:
# Load the test data from its feather file.
Test_df = pd.read_feather(path='amexfeather/test_data.ftr')

In [None]:
# Keep one row per customer: the last row of each customer_ID group, in the
# frame's existing row order (same reduction as applied to the training data).
Test_df = (
    Test_df
    .groupby('customer_ID')
    .tail(n=1)
)

## Dropping Null Columns

In [None]:
# Drop the same sparse columns that were removed from the training frame.
Test_df = Test_df.drop(labels=Drop_Columns, axis='columns')

## PCA

In [None]:
# Prepare the test features and project them with the PCA fitted on the
# training data. Fitting a new PCA here would produce components in a different
# basis from the one the classifier was trained on.
Master_df = Test_df[['customer_ID']].reset_index(drop=True)

# Categorical features: impute missing values, then one-hot encode.
PCA_Cat = Test_df.select_dtypes(include='category').reset_index(drop=True)

for col in PCA_Cat.columns:
    # A median (quantile 0.5) is not meaningful for a category column, and
    # fillna(..., inplace=True) on a column selection may not write back to the
    # frame. Impute with the most frequent category and assign back explicitly.
    most_frequent = PCA_Cat[col].mode(dropna=True)
    if not most_frequent.empty:  # column with no observed value: nothing to impute with
        PCA_Cat[col] = PCA_Cat[col].fillna(most_frequent.iloc[0])

PCA_Cat = pd.get_dummies(PCA_Cat, drop_first=True)

# Numeric features: float16 columns widened to float64, missing values replaced
# by the column mean, then standardized.
# NOTE(review): imputation and scaling statistics are still computed from the
# test set because the training-time values are not kept in scope; ideally the
# training statistics would be reused here as well.
PCA_Numeric = (
    Test_df
    .select_dtypes(include=['float16'])
    .reset_index(drop=True)
    .astype('float64')
)
PCA_Numeric = PCA_Numeric.fillna(PCA_Numeric.mean())

PCA_Numeric = pd.DataFrame(StandardScaler().fit_transform(PCA_Numeric),
                           columns=PCA_Numeric.columns)

# Concat encoded categorical and standardized numeric features side by side.
PCA_df = pd.concat([PCA_Cat, PCA_Numeric], axis=1)

# Align columns with what PCA_Model saw during fitting: one-hot levels absent
# from the test set become all-zero columns, unseen extra levels are dropped,
# and the column order matches training.
train_feature_names = getattr(PCA_Model, 'feature_names_in_', None)
if train_feature_names is not None:
    PCA_df = PCA_df.reindex(columns=train_feature_names, fill_value=0)

# PCA: transform only - PCA_Model is the instance already fitted on the training data.
Temp = pd.DataFrame(PCA_Model.transform(PCA_df))
Master_df = pd.concat([Master_df[['customer_ID']], Temp], axis=1)

# Prediction
Use XGB model to predict probabilities for each customer in the test dataset

In [None]:
# Score every test customer: column 1 of predict_proba, computed from all
# Master_df columns after customer_ID.
prediction = Master_df[['customer_ID']].assign(
    prediction=XGB_Model.predict_proba(Master_df.iloc[:, 1:])[:, 1]
)

# Free the test-stage intermediates and force a garbage-collection pass.
del PCA_Numeric, PCA_Cat, PCA_df
del Master_df, Temp
gc.collect()

In [None]:
# Write the predictions to the submission file, without the index column.
prediction.to_csv(path_or_buf='submission.csv', index=False)