# dropna() upsampling using FAMD

In [31]:
### read training data###
import pandas as pd
dataset=pd.read_csv('Simulated_Data_Train.csv').dropna()

In [36]:
## ### upsampling ####
### upsampling
## return upsampled dataset

def upsample(data):
    from sklearn.utils import resample
    df_majority=data[data.Default_ind==0]
    df_minority=data[data.Default_ind==1]
    largenumber=len(df_majority)
    df_minority_upsampled=resample(df_minority,replace=True,n_samples=int(1.2*largenumber),random_state=123)
    df_upsampled=pd.concat([pd.DataFrame(df_majority),pd.DataFrame(df_minority_upsampled)])
    return df_upsampled

In [37]:
data_up=upsample(dataset)

In [38]:
def famdfitdata(dataset,dataset2,num_comp,itera,eng='auto',seed=123):
    """
    dataset1: for training
    dataset2: To be operated
    num_comp(int): number of components
    itera(int): iterations
    eng: 'auto', 'sklearn' or 'fbpca'
    """
    import prince
    famd = prince.FAMD(
        n_components=num_comp,
        n_iter=itera,
        copy=True,
        check_input=True,
        engine=eng,       ## Can be "auto", 'sklearn', 'fbpca'
        random_state=seed)
    famdfit = famd.fit(dataset.drop('Default_ind', axis=1))
    result=famdfit.row_coordinates(dataset2)
    return(result)

In [39]:
train=famdfitdata(data_up,data_up,10,10,)

In [44]:
### read test dataset
test=pd.read_csv('Simulated_Data_Test.csv')
test.dropna(inplace=True)

In [45]:
test1=famdfitdata(data_up,test,10,10,)

In [42]:
from sklearn.linear_model import LogisticRegression
# all parameters not specified are set to their defaults
# default solver is incredibly slow which is why it was changed to 'lbfgs'
logisticRegr = LogisticRegression(solver = 'lbfgs')
logisticRegr.fit(train, data_up.iloc[:,20])
logisticRegr.score(train, data_up.iloc[:,20])


0.7095361515774421

In [46]:
pred_labels = logisticRegr.predict(test1)
from sklearn.metrics import accuracy_score
accuracy_score( pred_labels,test.iloc[:,20])

0.45338472657191037

In [47]:
from sklearn.metrics import confusion_matrix
confusion_matrix(test.iloc[:,20],pred_labels)

array([[1580, 2230],
       [  39,  302]])

# mean upsampling using FAMD

In [54]:
import pandas as pd
dfn=pd.read_csv('Unconditional_mean_impute_Test.csv')
dfn_test=pd.read_csv('Unconditional_mean_impute_Test.csv')

In [61]:
dfn=upsample(dfn)

In [55]:
train1=famdfitdata(dfn,dfn,10,10,)
test2=famdfitdata(dfn,dfn_test,10,10,)

In [58]:
from sklearn.linear_model import LogisticRegression
# all parameters not specified are set to their defaults
# default solver is incredibly slow which is why it was changed to 'lbfgs'
clf2 = LogisticRegression(solver = 'lbfgs')
clf2.fit(train1, dfn.iloc[:,20])
clf2.score(train1, dfn.iloc[:,20])


0.9334

In [59]:
pred_label = clf2.predict(test2)
from sklearn.metrics import accuracy_score
accuracy_score( pred_label,dfn_test.iloc[:,20])

0.9334

In [60]:
from sklearn.metrics import confusion_matrix
confusion_matrix(dfn_test.iloc[:,20],pred_label)

array([[4568,   31],
       [ 302,   99]])

# KNN upsampling using FAMD

In [62]:
import pandas as pd
df1=pd.read_csv('KNN_impute.csv')
df1_test=pd.read_csv('KNN_impute_test.csv')
df1=upsample(df1)

In [63]:
train3=famdfitdata(df1,df1,10,10,)
test3=famdfitdata(df1,df1_test,10,10,)

In [65]:
from sklearn.linear_model import LogisticRegression
# all parameters not specified are set to their defaults
# default solver is incredibly slow which is why it was changed to 'lbfgs'
clf3 = LogisticRegression(solver = 'lbfgs')
clf3.fit(train3, df1['Default_ind'])
clf3.score(train3, df1['Default_ind'])


0.7073068378178228

In [66]:
pred_label = clf3.predict(test3)
from sklearn.metrics import accuracy_score
accuracy_score( pred_label,df1_test['Default_ind'])

0.4442

In [67]:
from sklearn.metrics import confusion_matrix
confusion_matrix(df1_test['Default_ind'],pred_label)

array([[1870, 2729],
       [  50,  351]])

# median upsampling using FAMD

In [73]:
import pandas as pd
df2=pd.read_csv('Unconditional_median_impute.csv')
df2_test=pd.read_csv('Unconditional_median_impute_test.csv')
df2=upsample(df2)

In [74]:
train4=famdfitdata(df2,df2,10,10,)
test4=famdfitdata(df2,df2_test,10,10,)

In [75]:
from sklearn.linear_model import LogisticRegression
# all parameters not specified are set to their defaults
# default solver is incredibly slow which is why it was changed to 'lbfgs'
clf4 = LogisticRegression(solver = 'lbfgs')
clf4.fit(train4, df2['Default_ind'])
clf4.score(train4, df2['Default_ind'])


0.7075536904468033

In [81]:
pred_label = clf4.predict(test4)
from sklearn.metrics import accuracy_score
accuracy_score( pred_label,df2_test['Default_ind'])

0.4432

In [82]:
from sklearn.metrics import confusion_matrix
confusion_matrix(df2_test['Default_ind'],pred_label)

array([[1865, 2734],
       [  50,  351]])

# EM upsampling using FAMD

In [78]:
import pandas as pd
df4=pd.read_csv('Simulated_Data_Train_em.csv')
df4_test=pd.read_csv('Simulated_Data_test_em.csv')
df4=upsample(df4)

In [79]:
train5=famdfitdata(df4,df4,10,10,)
test5=famdfitdata(df4,df4_test,10,10,)

In [80]:
from sklearn.linear_model import LogisticRegression
# all parameters not specified are set to their defaults
# default solver is incredibly slow which is why it was changed to 'lbfgs'
clf5 = LogisticRegression(solver = 'lbfgs')
clf5.fit(train5, df4['Default_ind'])
clf5.score(train5, df4['Default_ind'])


0.7068378178227598

In [83]:
pred_label = clf5.predict(test4)
from sklearn.metrics import accuracy_score
accuracy_score( pred_label,df4_test['Default_ind'])

0.439

In [84]:
from sklearn.metrics import confusion_matrix
confusion_matrix(df4_test['Default_ind'],pred_label)

array([[1845, 2754],
       [  51,  350]])

# Random Forest upsampling using FAMD

In [85]:
import pandas as pd
df6=pd.read_csv('rf.csv')
df6_test=pd.read_csv('rf_Test.csv')
df6=upsample(df6)

In [86]:
train6=famdfitdata(df6,df6,10,10,)
test6=famdfitdata(df6,df6_test,10,10,)

In [87]:
from sklearn.linear_model import LogisticRegression
# all parameters not specified are set to their defaults
# default solver is incredibly slow which is why it was changed to 'lbfgs'
clf6 = LogisticRegression(solver = 'lbfgs')
clf6.fit(train6, df6['Default_ind'])
clf6.score(train6, df6['Default_ind'])


0.7063934830905949

In [88]:
pred_label = clf6.predict(test6)
from sklearn.metrics import accuracy_score
accuracy_score( pred_label,df6_test['Default_ind'])

0.4428

In [89]:
from sklearn.metrics import confusion_matrix
confusion_matrix(df6_test['Default_ind'],pred_label)

array([[1863, 2736],
       [  50,  351]])

# EMB upsampling using FAMD

In [90]:
import pandas as pd
df7=pd.read_csv('EMB_impute.csv')
df7_test=pd.read_csv('EMB_impute_Test.csv')
df7=upsample(df7)


In [91]:
train7=famdfitdata(df7,df7,10,10,)
test7=famdfitdata(df7,df7_test,10,10,)

In [92]:
from sklearn.linear_model import LogisticRegression
# all parameters not specified are set to their defaults
# default solver is incredibly slow which is why it was changed to 'lbfgs'
clf7 = LogisticRegression(solver = 'lbfgs')
clf7.fit(train7, df7['Default_ind'])
clf7.score(train7, df7['Default_ind'])


0.707948654653172

In [93]:
pred_label = clf7.predict(test7)
from sklearn.metrics import accuracy_score
accuracy_score( pred_label,df7_test['Default_ind'])

0.4436

In [94]:
from sklearn.metrics import confusion_matrix
confusion_matrix(df7_test['Default_ind'],pred_label)

array([[1867, 2732],
       [  50,  351]])

# Cart upsampling using FAMD

In [95]:
import pandas as pd
df5=pd.read_csv('cart_impute.csv')
df5_test=pd.read_csv('cart_impute_Test.csv')
df5=upsample(df5)

In [96]:
train8=famdfitdata(df5,df5,10,10,)
test8=famdfitdata(df5,df5_test,10,10,)

In [97]:
from sklearn.linear_model import LogisticRegression
# all parameters not specified are set to their defaults
# default solver is incredibly slow which is why it was changed to 'lbfgs'
clf8 = LogisticRegression(solver = 'lbfgs')
clf8.fit(train8, df5['Default_ind'])
clf8.score(train8, df5['Default_ind'])


0.7058010367810417

In [98]:
pred_label = clf8.predict(test8)
from sklearn.metrics import accuracy_score
accuracy_score( pred_label,df5_test['Default_ind'])

0.4438

In [99]:
from sklearn.metrics import confusion_matrix
confusion_matrix(df5_test['Default_ind'],pred_label)

array([[1868, 2731],
       [  50,  351]])