In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
#Reading training data
train=pd.read_csv('MergedData/merged_data_train.csv')
print(train.shape)

(307511, 199)


In [3]:
#As size of the data is huge, considering only 10% of data
train = train.sample(frac = 0.1, random_state= 1)

print(train.head())
print(train.shape)

        Unnamed: 0  SK_ID_CURR  TARGET NAME_CONTRACT_TYPE CODE_GENDER  \
64282        64282      174545       1         Cash loans           F   
94645        94645      209898       0         Cash loans           M   
306349      306349      454938       0         Cash loans           M   
258314      258314      398930       0         Cash loans           M   
87597        87597      201672       0         Cash loans           M   

       FLAG_OWN_CAR FLAG_OWN_REALTY  CNT_CHILDREN  AMT_INCOME_TOTAL  \
64282             N               Y             0          135000.0   
94645             Y               Y             0          135000.0   
306349            Y               Y             0          112500.0   
258314            Y               N             0          247500.0   
87597             N               Y             0          202500.0   

        AMT_CREDIT_x          ...            CNT_INSTALMENT_MATURE_CUM  \
64282       654498.0          ...                           

In [4]:
#Reading testing data
test=pd.read_csv('MergedData/merged_data_test.csv')
print(test.shape)

(48744, 198)


In [5]:
#Convert categorical variable into dummy/indicator variables 
from sklearn import preprocessing
categorical_feats = [
    f for f in train.columns if train[f].dtype == 'object'
]

for col in categorical_feats:
    lb = preprocessing.LabelEncoder()
    lb.fit(list(train[col].values.astype('str')) + list(test[col].values.astype('str')))
    train[col] = lb.transform(list(train[col].values.astype('str')))
    test[col] = lb.transform(list(test[col].values.astype('str')))

In [6]:
#Fill NA/NaN values using the specified method
train.fillna(-999, inplace = True)

In [7]:
#Extracting features and label
features = train.drop(['SK_ID_CURR', 'TARGET'],axis=1).columns.values
label = "TARGET"

x=train[features]
y=train[label]

print(x.shape)
print(y.size)

(30751, 197)
30751


In [8]:
#Split training and test data
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 1/3, random_state = 0)



In [9]:
#Using RandonForestClassifier to train the model
from sklearn.ensemble import RandomForestClassifier
randonForestAlgo = RandomForestClassifier(n_estimators=20)
randonForestAlgo.fit(X_train,y_train)
predictionAlgo = randonForestAlgo.predict(X_test)
print(predictionAlgo)

[0 0 0 ... 0 0 0]


In [10]:
#Compare predictions between the actual result and the algorithm prediction
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(predictionAlgo, y_test)
print(confusion_matrix)  

[[9401  847]
 [   1    2]]


In [11]:
#Using LogisticRegression to train the model
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression().fit(X_train, y_train)

y_predict = lr.predict(X_test)

from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_predict, y_test)
print(confusion_matrix)

[[9399  849]
 [   3    0]]


In [12]:
#Fill NA/NaN values using the specified method
test.fillna(-999, inplace = True)

In [13]:
#Extracting features and label
testColumns = test.columns.tolist()

testFeatures = [c for c in testColumns if c not in ["SK_ID_CURR"]]
x_test=test[testFeatures]
print(x_test.shape)

(48744, 197)


In [14]:
#predict for the actual test data
testPredict = randonForestAlgo.predict(x_test)

In [15]:
# Concatenate the SK_ID_CURR, actual test data and the predicted target
result = np.column_stack((test["SK_ID_CURR"],x_test, testPredict))
resultDataframe = pd.DataFrame(data=result)

In [16]:
#Save the final data with the predictions
print(resultDataframe.shape)
resultDataframe.to_csv('final_prediction.csv')

(48744, 199)
