In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, auc

from datetime import datetime
from lightgbm import LGBMClassifier 
from xgboost import XGBClassifier

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [4]:
train_data = pd.read_csv('train.csv',index_col = 'Id')
test_data = pd.read_csv('test.csv', index_col = 'id')

In [5]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 252000 entries, 1 to 252000
Data columns (total 12 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   income               252000 non-null  int64 
 1   age                  252000 non-null  int64 
 2   experience           252000 non-null  int64 
 3   married              252000 non-null  object
 4   house_ownership      252000 non-null  object
 5   car_ownership        252000 non-null  object
 6   profession           252000 non-null  object
 7   city                 252000 non-null  object
 8   state                252000 non-null  object
 9   current_job_years    252000 non-null  int64 
 10  current_house_years  252000 non-null  int64 
 11  risk_flag            252000 non-null  int64 
dtypes: int64(6), object(6)
memory usage: 25.0+ MB


In [6]:
train_data.describe()

Unnamed: 0,income,age,experience,current_job_years,current_house_years,risk_flag
count,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0
mean,4997117.0,49.954071,10.084437,6.333877,11.997794,0.123
std,2878311.0,17.063863,6.00259,3.647053,1.399037,0.328438
min,10310.0,21.0,0.0,0.0,10.0,0.0
25%,2503015.0,35.0,5.0,3.0,11.0,0.0
50%,5000694.0,50.0,10.0,6.0,12.0,0.0
75%,7477502.0,65.0,15.0,9.0,13.0,0.0
max,9999938.0,79.0,20.0,14.0,14.0,1.0


In [7]:
cat_features = train_data.select_dtypes(['object']).columns
cat_features

Index(['married', 'house_ownership', 'car_ownership', 'profession', 'city',
       'state'],
      dtype='object')

In [8]:
for col in cat_features:
    print(train_data[col].nunique())

2
3
2
51
317
29


In [9]:
for col in cat_features:
    print(train_data[col].value_counts())

single     226272
married     25728
Name: married, dtype: int64
rented          231898
owned            12918
norent_noown      7184
Name: house_ownership, dtype: int64
no     176000
yes     76000
Name: car_ownership, dtype: int64
Physician                     5957
Statistician                  5806
Web_designer                  5397
Psychologist                  5390
Computer_hardware_engineer    5372
Drafter                       5359
Magistrate                    5357
Fashion_Designer              5304
Air_traffic_controller        5281
Comedian                      5259
Industrial_Engineer           5250
Mechanical_engineer           5217
Chemical_engineer             5205
Technical_writer              5195
Hotel_Manager                 5178
Financial_Analyst             5167
Graphic_Designer              5166
Flight_attendant              5128
Biomedical_Engineer           5127
Secretary                     5061
Software_Developer            5053
Petroleum_Engineer            5041

In [10]:
le = LabelEncoder()
for col in cat_features:
    train_data[col] = le.fit_transform(train_data[col])
    test_data[col] = le.fit_transform(test_data[col])

In [11]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 252000 entries, 1 to 252000
Data columns (total 12 columns):
 #   Column               Non-Null Count   Dtype
---  ------               --------------   -----
 0   income               252000 non-null  int64
 1   age                  252000 non-null  int64
 2   experience           252000 non-null  int64
 3   married              252000 non-null  int64
 4   house_ownership      252000 non-null  int64
 5   car_ownership        252000 non-null  int64
 6   profession           252000 non-null  int64
 7   city                 252000 non-null  int64
 8   state                252000 non-null  int64
 9   current_job_years    252000 non-null  int64
 10  current_house_years  252000 non-null  int64
 11  risk_flag            252000 non-null  int64
dtypes: int64(12)
memory usage: 25.0 MB


In [12]:
sc = StandardScaler()
col = 'income'
train_data[col] = sc.fit_transform(train_data[col].values.reshape(-1,1))
test_data[col] = sc.transform(test_data[col].values.reshape(-1,1))

In [13]:
sc = StandardScaler()
col = 'age'
train_data[col] = sc.fit_transform(train_data[col].values.reshape(-1,1))
test_data[col] = sc.transform(test_data[col].values.reshape(-1,1))

In [14]:
sc = StandardScaler()
col = 'experience'
train_data[col] = sc.fit_transform(train_data[col].values.reshape(-1,1))
test_data[col] = sc.transform(test_data[col].values.reshape(-1,1))

In [15]:
sc = StandardScaler()
col = 'current_job_years'
train_data[col] = sc.fit_transform(train_data[col].values.reshape(-1,1))
test_data[col] = sc.transform(test_data[col].values.reshape(-1,1))

In [17]:
sc = StandardScaler()
col = 'current_house_years'
train_data[col] = sc.fit_transform(train_data[col].values.reshape(-1,1))
test_data[col] = sc.transform(test_data[col].values.reshape(-1,1))

In [19]:
from sklearn.ensemble import RandomForestClassifier

In [20]:
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=3696, verbose=0,
                       warm_start=False)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=3696, verbose=0,
                       warm_start=False)

In [21]:
y = train_data.loc[:,'risk_flag']
X = train_data.drop('risk_flag', axis = 1)
X

Unnamed: 0_level_0,income,age,experience,married,house_ownership,car_ownership,profession,city,state,current_job_years,current_house_years
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,-1.283145,-1.579603,-1.180232,1,2,0,33,251,13,-0.914131,0.716356
2,0.895457,-0.583343,-0.014067,1,2,0,43,227,14,0.731036,0.716356
3,-0.349269,0.940347,-1.013637,0,2,0,47,8,12,-0.639936,-1.427981
4,0.437526,-0.524740,-1.346827,1,2,1,43,54,17,-1.188325,0.001577
5,0.268128,-0.173119,0.152528,1,2,0,11,296,22,-0.914131,1.431135
...,...,...,...,...,...,...,...,...,...,...,...
251996,1.097092,-0.407533,0.485719,1,2,0,45,162,28,-0.091547,-0.713202
251997,-0.748199,-1.403792,-0.014067,1,2,0,3,251,13,-0.091547,-0.713202
251998,-0.164913,-0.231722,-0.513852,1,2,0,17,144,14,0.182647,0.001577
251999,0.524618,-0.290326,-1.680018,1,2,0,27,233,18,-1.736714,-1.427981


In [22]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 42)

In [23]:
model = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=4058, verbose=0,
                       warm_start=False)
clf = model.fit(x_train, y_train)

In [24]:
roc_auc_score(y_train, clf.predict(x_train))

0.8795748754879298

In [25]:
roc_auc_score(y_test,clf.predict_proba(x_test)[:,1])

0.9362374980337279

In [28]:
y_pred = model.predict(test_data)

In [29]:
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [30]:
sub = pd.DataFrame(y_pred,columns = ['risk_flag'],index = [i+1 for i in range(test_data.shape[0])])

sub.index.name = 'id'
sub.tail()
sub.head()

Unnamed: 0_level_0,risk_flag
id,Unnamed: 1_level_1
1,0
2,0
3,0
4,0
5,0


In [31]:
sub.to_csv('rf.csv')