In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

In [2]:
train = pd.read_csv('train_u6lujuX_CVtuZ9i.csv')
test = pd.read_csv('test_Y3wMUE5_7gLdaTN.csv')

Loan_ID = test['Loan_ID']

train = train.drop('Loan_ID', axis=1)
test = test.drop('Loan_ID', axis=1)

In [3]:
text_col = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']
obj_col = ['Dependents']

In [4]:
train.shape

(614, 12)

In [5]:
test.shape

(367, 11)

In [6]:
train['Property_Area'].value_counts()

Semiurban    233
Urban        202
Rural        179
Name: Property_Area, dtype: int64

In [7]:
Dependents = train['Dependents']
for idx, value in enumerate(Dependents):
    if value == '3+':
        Dependents[idx] = 3
train['Dependents'] = Dependents

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [8]:
Dependents = test['Dependents']
for idx, value in enumerate(Dependents):
    if value == '3+':
        Dependents[idx] = 3
test['Dependents'] = Dependents

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [9]:
train["Dependents"] = pd.to_numeric(train["Dependents"])

encoder = LabelEncoder()
for col in text_col:
    train[col] = train[col].astype('str') 
    train[col] = encoder.fit_transform(train[col])
    print(train[col].shape)

(614,)
(614,)
(614,)
(614,)
(614,)
(614,)


In [10]:
test["Dependents"] = pd.to_numeric(test["Dependents"])

encoder_test = LabelEncoder()
for col in text_col[0:5]:
    test[col] = test[col].astype('str') 
    test[col] = encoder_test.fit_transform(test[col])
    print(test[col].shape)

(367,)
(367,)
(367,)
(367,)
(367,)


In [11]:
train.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,0,0.0,0,0,5849,0.0,,360.0,1.0,2,1
1,1,1,1.0,0,0,4583,1508.0,128.0,360.0,1.0,0,0
2,1,1,0.0,0,1,3000,0.0,66.0,360.0,1.0,2,1
3,1,1,0.0,1,0,2583,2358.0,120.0,360.0,1.0,2,1
4,1,0,0.0,0,0,6000,0.0,141.0,360.0,1.0,2,1


In [12]:
test.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,1,0.0,0,0,5720,0,110.0,360.0,1.0,2
1,1,1,1.0,0,0,3076,1500,126.0,360.0,1.0,2
2,1,1,2.0,0,0,5000,1800,208.0,360.0,1.0,2
3,1,1,2.0,0,0,2340,2546,100.0,360.0,,2
4,1,0,0.0,1,0,3276,0,78.0,360.0,1.0,2


In [13]:
train['Property_Area'].value_counts()

1    233
2    202
0    179
Name: Property_Area, dtype: int64

In [14]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
Gender               614 non-null int64
Married              614 non-null int64
Dependents           599 non-null float64
Education            614 non-null int64
Self_Employed        614 non-null int64
ApplicantIncome      614 non-null int64
CoapplicantIncome    614 non-null float64
LoanAmount           592 non-null float64
Loan_Amount_Term     600 non-null float64
Credit_History       564 non-null float64
Property_Area        614 non-null int64
Loan_Status          614 non-null int64
dtypes: float64(5), int64(7)
memory usage: 57.6 KB


In [15]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 11 columns):
Gender               367 non-null int64
Married              367 non-null int64
Dependents           357 non-null float64
Education            367 non-null int64
Self_Employed        367 non-null int64
ApplicantIncome      367 non-null int64
CoapplicantIncome    367 non-null int64
LoanAmount           362 non-null float64
Loan_Amount_Term     361 non-null float64
Credit_History       338 non-null float64
Property_Area        367 non-null int64
dtypes: float64(4), int64(7)
memory usage: 31.6 KB


In [16]:
imputer = Imputer(missing_values='NaN', strategy='median', axis=0)
xtrain = imputer.fit_transform(train.iloc[:, :11])
xtrain = pd.DataFrame(xtrain)

test = imputer.transform(test)
test = pd.DataFrame(test)

In [17]:
X_train, y_train = xtrain.iloc[:, :11], train.iloc[:, -1]
X_test = test.iloc[:, :11]
random_state = 42

In [18]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1.0,0.0,0.0,0.0,0.0,5849.0,0.0,128.0,360.0,1.0,2.0
1,1.0,1.0,1.0,0.0,0.0,4583.0,1508.0,128.0,360.0,1.0,0.0
2,1.0,1.0,0.0,0.0,1.0,3000.0,0.0,66.0,360.0,1.0,2.0
3,1.0,1.0,0.0,1.0,0.0,2583.0,2358.0,120.0,360.0,1.0,2.0
4,1.0,0.0,0.0,0.0,0.0,6000.0,0.0,141.0,360.0,1.0,2.0


In [19]:
y_train.head()

0    1
1    0
2    1
3    1
4    1
Name: Loan_Status, dtype: int64

In [20]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 11 columns):
0     367 non-null float64
1     367 non-null float64
2     367 non-null float64
3     367 non-null float64
4     367 non-null float64
5     367 non-null float64
6     367 non-null float64
7     367 non-null float64
8     367 non-null float64
9     367 non-null float64
10    367 non-null float64
dtypes: float64(11)
memory usage: 31.6 KB


In [21]:
print(X_train.shape)
print(X_test.shape)

(614, 11)
(367, 11)


In [22]:
X_train.columns

RangeIndex(start=0, stop=11, step=1)

In [23]:
scaler = StandardScaler()

X_train[[5, 6, 7, 8]] = scaler.fit_transform(X_train[[5, 6, 7, 8]])
X_test[[5, 6, 7, 8]] = scaler.transform(X_test[[5, 6, 7, 8]])

In [24]:
X_train.shape

(614, 11)

In [25]:
X_test.shape

(367, 11)

In [26]:
y_train.shape

(614,)

In [27]:
log_clf = LogisticRegression()
svm_clf = svm.SVC(kernel='linear', probability=True)
tree_clf = tree.DecisionTreeClassifier()
forest_clf = RandomForestClassifier()
lda_clf = LinearDiscriminantAnalysis()
sgd_clf = SGDClassifier()
models = [log_clf, svm_clf, tree_clf, forest_clf, lda_clf, sgd_clf]

In [28]:
for model in models:
    score = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy')
    print(model, score)

(LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), array([0.8       , 0.78536585, 0.81862745]))
(SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False), array([0.8       , 0.79512195, 0.83333333]))
(DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), array([0.71707317, 0.71219512, 0.70098039]))
(RandomForestClas



In [30]:
######### RANDOM FOREST ##############

forest_clf = RandomForestClassifier(n_estimators=100)
cross_val_score(forest_clf, X_train, y_train, cv=3, scoring='accuracy')

array([0.7804878 , 0.78536585, 0.78431373])

In [32]:
# forest_clf.fit(X_train, y_train)
# y_predict = forest_clf.predict(X_test)
# test['Loan_Status']= pd.DataFrame(y_predict)
# test.to_csv('test_new.csv')

In [33]:
######### SVM ############## test accuracy : 77.7777%

svm_clf = svm.SVC(kernel='linear')
cross_val_score(svm_clf, X_train, y_train, cv=3, scoring='accuracy')

array([0.8       , 0.79512195, 0.83333333])

In [34]:
# svm_clf.fit(X_train, y_train)
# y_predict = svm_clf.predict(X_test)

# new = pd.DataFrame(columns=['Loan_ID', 'Loan_Status'])
# new['Loan_ID'] = Loan_ID
# new['Loan_Status']= pd.DataFrame(y_predict)
# new.to_csv('submission.csv', index=False)

In [33]:
######### ADABOOST ##############

bdt_clf = AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=1),
                         algorithm="SAMME",
                         n_estimators=200)
cross_val_score(bdt_clf, X_train, y_train, cv=3, scoring='accuracy')

array([0.8       , 0.79512195, 0.80882353])

In [34]:
######### GRADIENT BOOST ##############
gdt_clf = GradientBoostingClassifier(n_estimators=100)
cross_val_score(gdt_clf, X_train, y_train, cv=3, scoring='accuracy')

array([0.76097561, 0.77560976, 0.7745098 ])

In [35]:
######### Voting Classifier ##############
svm_clf = svm.SVC(kernel='linear', probability=True)
voting_clf = VotingClassifier(
estimators = [('lr', log_clf),
             ('svm', svm_clf),
             ('tree', tree_clf),
             ('forest', forest_clf),
             ('lda', lda_clf)
            ],
voting = 'soft')
cross_val_score(voting_clf, X_train, y_train, cv=3, scoring='accuracy')

  if diff:
  if diff:
  if diff:


array([0.8       , 0.7902439 , 0.81862745])

In [36]:
# voting_clf.fit(X_train, y_train)
# y_predict = voting_clf.predict(X_test)

# new = pd.DataFrame(columns=['Loan_ID', 'Loan_Status'])
# new['Loan_ID'] = Loan_ID
# new['Loan_Status']= pd.DataFrame(y_predict)
# new.to_csv('submission.csv', index=False)

In [37]:
######### XGBOOST ##############

xg = XGBClassifier()
cross_val_score(xg, X_train, y_train, cv=3, scoring='accuracy')

  if diff:
  if diff:
  if diff:


array([0.7902439 , 0.7902439 , 0.78921569])

In [38]:
xg.fit(X_train, y_train)
y_predict = xg.predict(X_test)

new = pd.DataFrame(columns=['Loan_ID', 'Loan_Status'])
new['Loan_ID'] = Loan_ID
new['Loan_Status']= pd.DataFrame(y_predict)
new.to_csv('submission.csv', index=False)

  if diff:


Submission Accuracy : 78.47% using XGBoost