In [14]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt 
import seaborn as sns

#feature engineering
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer


#model building
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
import lightgbm as lgb
from imblearn.pipeline import Pipeline, make_pipeline
import xgboost as xgb 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.utils.class_weight import compute_sample_weight
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler

#---
import joblib

In [2]:
df = pd.read_csv('../data/test/final_df.csv')
df.head()

Unnamed: 0,uid,applied_contract_type,total_tl,complete_tl,active_tl,amount_overdue_sum,loan_amount_avg,loan_amount_min,loan_amount_max,num_times_delinquent,...,other_loans,enquiry_count,max_enquiry_amount,time_since_last_enquiry,business_loans_enquiries,consumer_loans_enquiries,revolving_loans_enquiries,mortgages_and_real_estate_loans_enquiries,other_loans_enquiries,automobile_loans_enquiries
0,CMO22835242,Cash loans,11,11,5,0.0,939825.278182,59233.5,6525000.0,1,...,0,8,184000,50,3,4,0,0,1,0
1,MRJ34316727,Cash loans,5,5,5,0.0,353898.0,4500.0,972990.0,0,...,0,1,184000,76,0,0,0,0,0,1
2,UAV00534378,Cash loans,5,5,5,0.0,425817.0,179550.0,703125.0,0,...,0,1,188000,69,0,1,0,0,0,0
3,IPQ08190402,Cash loans,5,5,5,0.0,210009.6,60048.0,270000.0,0,...,0,21,192000,44,4,11,0,3,0,3
4,NQN84331006,Cash loans,7,7,5,0.0,621868.68,135000.0,1978024.5,0,...,0,6,170000,45,3,2,0,0,1,0


In [3]:
df.describe()

Unnamed: 0,total_tl,complete_tl,active_tl,amount_overdue_sum,loan_amount_avg,loan_amount_min,loan_amount_max,num_times_delinquent,months_since_recent_delinquency,automobile_loans,...,other_loans,enquiry_count,max_enquiry_amount,time_since_last_enquiry,business_loans_enquiries,consumer_loans_enquiries,revolving_loans_enquiries,mortgages_and_real_estate_loans_enquiries,other_loans_enquiries,automobile_loans_enquiries
count,46127.0,46127.0,46127.0,46127.0,46127.0,46127.0,46127.0,46127.0,46127.0,46127.0,...,46127.0,46127.0,46127.0,46127.0,46127.0,46127.0,46127.0,46127.0,46127.0,46127.0
mean,4.466256,4.466256,3.110586,137.5134,329199.2,113840.6,829410.2,1.247057,0.7529,0.075877,...,0.003165,7.320268,178081.540855,48.068767,1.896265,3.053873,0.458907,0.76142,0.761875,0.387929
std,3.910589,3.910589,1.905162,10643.33,672464.4,511166.8,1630622.0,3.48251,7.498031,0.318952,...,0.062389,7.013023,90278.598058,5.840659,2.165819,3.237506,0.858752,1.106279,1.104543,0.708507
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1039.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,1.0,0.0,68920.78,16762.5,100053.9,0.0,0.0,0.0,...,0.0,2.0,141000.0,45.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,4.0,4.0,4.0,0.0,165583.8,38655.0,324000.0,0.0,0.0,0.0,...,0.0,5.0,179000.0,46.0,1.0,2.0,0.0,0.0,0.0,0.0
75%,7.0,7.0,5.0,0.0,356625.0,89640.0,900000.0,1.0,0.0,0.0,...,0.0,10.0,194000.0,49.0,3.0,4.0,1.0,1.0,1.0,1.0
max,15.0,15.0,5.0,2079510.0,38167200.0,38167200.0,58032000.0,72.0,138.0,5.0,...,4.0,56.0,499962.0,98.0,18.0,30.0,12.0,11.0,10.0,7.0


In [4]:
df.drop(['uid'], inplace=True, axis=1)

In [5]:
df.shape

(46127, 24)

In [6]:
significant_features = ['total_tl','complete_tl','active_tl','amount_overdue_sum','loan_amount_avg','loan_amount_min','loan_amount_max','num_times_delinquent','months_since_recent_delinquency','automobile_loans','consumer_loans','mortgages_and_real_estate_loans','max_enquiry_amount','time_since_last_enquiry','revolving_loans_enquiries','automobile_loans_enquiries']

In [7]:
df_significant = df[significant_features]

In [9]:
df_significant.shape

(46127, 16)

In [11]:
df_significant['applied_contract_type'] = df['applied_contract_type']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_significant['applied_contract_type'] = df['applied_contract_type']


In [12]:
df_significant.shape

(46127, 17)

In [13]:
num_columns = list(df_significant.select_dtypes(exclude='object').columns)
cat_columns = list(df_significant.select_dtypes(include='object').columns)

In [20]:
loaded_pipeline = joblib.load('../voting_clf_pipeline.pkl')

In [21]:
predictions = loaded_pipeline.predict(df_significant)

In [22]:
print(predictions)

[1 1 1 ... 1 1 1]


In [25]:
for i in predictions:
    print(i)
    

1
1
1
1
1
0
1
0
1
1
1
1
1
1
0
1
1
1
0
1
0
1
0
1
1
1
1
1
1
1
0
1
1
1
1
1
1
1
1
0
1
0
1
1
1
1
1
1
1
1
1
0
1
1
1
1
1
0
1
1
1
1
1
1
1
1
1
1
1
1
0
1
0
0
1
1
0
1
1
1
0
1
1
1
1
1
0
1
0
1
1
0
1
1
1
1
1
1
0
1
0
1
1
0
1
1
1
1
1
1
0
1
0
0
1
0
1
1
1
1
1
0
1
1
1
1
1
1
1
1
0
0
1
1
1
0
1
1
1
0
1
1
0
1
0
1
1
1
0
1
1
1
1
0
1
1
1
1
0
1
1
1
1
1
0
1
1
1
1
1
0
1
0
1
1
1
1
1
1
1
1
1
1
1
0
1
1
1
1
1
1
1
1
0
1
1
0
0
0
0
1
1
1
1
1
1
1
0
1
1
1
1
1
0
0
0
1
1
1
1
1
0
1
1
0
1
1
1
1
1
1
1
1
1
1
0
1
1
1
0
0
1
1
0
1
0
0
1
1
1
0
1
0
1
1
1
1
1
1
0
0
0
1
1
0
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
0
1
1
1
1
1
1
1
1
1
1
0
1
1
1
1
1
0
0
1
0
0
1
1
1
1
1
0
1
1
1
1
1
1
0
1
1
1
1
1
1
1
0
0
1
1
1
0
1
1
1
1
1
1
1
1
0
1
1
1
0
1
0
1
1
0
1
0
0
1
1
0
0
1
1
1
1
1
1
0
1
1
1
0
1
0
1
0
1
1
1
1
1
1
1
0
0
1
1
1
1
1
1
1
0
1
1
0
1
1
1
1
1
0
1
0
1
1
1
1
1
1
1
0
1
1
1
1
0
1
1
1
1
0
0
1
1
0
1
0
1
1
0
1
1
1
1
1
0
1
1
1
1
0
0
1
0
1
0
1
0
1
1
1
1
1
0
1
1
1
1
1
1
1
0
1
1
1
1
1
1
1
1
1
0
1
1
0
0
1
1
1
0
1
1
0
1
1
1
1
1
1
1
1
1
1
1
1
1
0
1
1
1
1
1
1
1
1
0


In [27]:
pred_file_path = '../data/final_submission/sample_submission.csv'

df = pd.read_csv(pred_file_path)
df['pred'] = predictions
df.to_csv(pred_file_path, index=False)