# handling imbalanced column "TARGET"

In [1]:
# Pandas for managing datasets
import numpy as np
import pandas as pd

# math for operating numbers
import math

# Change pd displayg format for float
pd.options.display.float_format = '{:,.4f}'.format

# to show complete output of a cell: eg.
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#     print(df.apply(lambda x:x.unique().size))

# # Matplotlib for additional customization
# from matplotlib import pyplot as plt
# %matplotlib inline

# # Seaborn for plotting and styling
# import seaborn as sns
# #Seaborn set() to set aesthetic parameters in one step.
# sns.set() 

In [2]:
# Read dataset
dataPath = '../../../BDSE12-Group3/datasets/homecdt_eda/application_train.csv'
df = pd.read_csv(dataPath)
df.shape
application_test = pd.read_csv('../../../BDSE12-Group3/datasets/homecdt_eda/application_test.csv')

In [3]:
df.head(3)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# 'TARGET' is imbalanced
df['TARGET'].value_counts()

0    282686
1     24825
Name: TARGET, dtype: int64

In [5]:
# Area Under ROC Curve should be used as the measure
from sklearn.metrics import roc_auc_score

* Up-sample Minority Class
* Down-sample Majority Class
* Change Your Performance Metric
* Penalize Algorithms (Cost-Sensitive Training)
* Use Tree-Based Algorithms
* Create Synthetic Samples (Data Augmentation)
* Combine Minority Classes
* Reframe as Anomaly Detection

### Pre-processing of data

In [6]:
from sklearn import preprocessing
categorical_feats = [
    f for f in df.columns if df[f].dtype == 'object'
]

for col in categorical_feats:
    lb = preprocessing.LabelEncoder()
    lb.fit(list(df[col].values.astype('str')) + list(application_test[col].values.astype('str')))
    df[col] = lb.transform(list(df[col].values.astype('str')))
    application_test[col] = lb.transform(list(application_test[col].values.astype('str')))

In [7]:
df.fillna(0, inplace = True)

### Use Tree-Based Algorithms

In [8]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

In [9]:
# Train Random Forest on imbalanced dataset

# Separate input features (X) and target variable (y)
y = df['TARGET']
X = df.drop(['SK_ID_CURR', 'TARGET'], axis=1)
 
# Train model
model_rf = RandomForestClassifier()
model_rf.fit(X, y)
 


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [10]:
# Predict on training set
pred_rf = model_rf.predict(X)

# Is our model still predicting just one class?
print( np.unique( pred_rf ) )
# [0 1]

[0 1]


In [11]:
# # How's our accuracy?
# print( accuracy_score(y, pred_rf) )
# # 0.9744
 
# What about AUROC?
prob_rf = model_rf.predict_proba(X)
prob_rf = [p[1] for p in prob_rf]
print( roc_auc_score(y, prob_rf) )
# 0.999078798186

1.0


In [12]:
X_test = application_test.drop(['SK_ID_CURR'], axis=1) 

In [13]:
X_test.fillna(0, inplace = True)

In [14]:
test_pred_rf = model_rf.predict(X_test)

In [15]:
type(test_pred_rf)
unique, counts = np.unique(test_pred_rf, return_counts=True)
print (np.asarray((unique, counts)).T)

[[    0 48741]
 [    1     3]]


In [16]:
print(test_pred_rf, np.unique( test_pred_rf ))

[0 0 0 ... 0 0 0] [0 1]


In [17]:
test_pred_rf_df = pd.DataFrame(data=test_pred_rf,columns=['TARGET'])
test_pred_rf_df

Unnamed: 0,TARGET
0,0
1,0
2,0
3,0
4,0
...,...
48739,0
48740,0
48741,0
48742,0


In [18]:
test_pred_rf_df.to_csv('submission.csv', encoding='utf-8', index=False)