In [105]:
from time import time
import pandas as pd

from sklearn.ensemble import ExtraTreesClassifier

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Feature Extraction with Univariate Statistical Tests (Chi-squared for classification)
import numpy
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

def extract_features(dataframe, k=9):
    """Scores features with a chi-squared test and keeps the k best ones.

    Prints the chi-squared score of every candidate feature followed by
    the first five rows of the reduced feature matrix.

    Keyword arguments:
    dataframe -- Data whose last column is the label (must be convertible
                 to int). The first column is not used as a feature.
    k -- Number of top scoring features to keep (default 9)

    Returns the feature matrix restricted to the k selected columns.
    """
    array = dataframe.values

    # Column 0 is left out of the candidate features and the last column is
    # the label.
    # NOTE(review): column 0 is presumably an identifier column -- confirm
    # against the frame that is actually passed in.
    X = array[:, 1:-1]
    Y = array[:, -1].astype(int)

    # Univariate feature selection
    selector = SelectKBest(score_func=chi2, k=k)
    fit = selector.fit(X, Y)

    # Summarize scores
    numpy.set_printoptions(precision=3)
    print(fit.scores_)

    # Summarize selected features
    features = fit.transform(X)
    print(features[0:5, :])

    return features

def clean_data(input_df):
    """Cleans dataframe data related to loan, modifying it in place.

    Replaces the NA values with default values and encodes string values
    to numeric values. For Ex: M,F gets transformed to 1,2 and any other
    value to 3.

    Keyword arguments:
    input_df -- The loan dataframe created out of file. Must contain the
                columns Gender, Married, Dependents, Education,
                Self_Employed, ApplicantIncome, CoapplicantIncome,
                LoanAmount, Loan_Amount_Term, Credit_History and
                Property_Area.

    Returns None; input_df itself is updated.
    """
    # Fill missing values with default values. These default values are
    # chosen so that bank would reject the loan with these default values.
    # Numeric columns get numeric defaults so that they keep a numeric
    # dtype instead of becoming a mix of numbers and strings.
    defaults = (
        ('Gender', 'M'),
        ('Married', 'No'),
        ('Education', 'Not Graduate'),
        ('Property_Area', 'Rural'),
        ('ApplicantIncome', 0),
        ('CoapplicantIncome', 0),
        ('LoanAmount', 0),
        ('Loan_Amount_Term', 360),
        ('Credit_History', 0),
    )
    for column, default in defaults:
        # Assign the result back: an in-place fillna on input_df[column]
        # works on a temporary and may not reach input_df at all.
        input_df[column] = input_df[column].fillna(default)

    # Transform character values to numeric values. Anything that is not
    # listed (including a missing Self_Employed, which is deliberately not
    # filled above) is encoded as 3.
    category_codes = (
        ('Gender', {'M': 1, 'F': 2}),
        ('Married', {'Yes': 1, 'No': 2}),
        ('Education', {'Graduate': 1, 'Not Graduate': 2}),
        ('Self_Employed', {'Yes': 1, 'No': 2}),
        ('Property_Area', {'Urban': 1, 'Semiurban': 2}),
    )
    for column, codes in category_codes:
        input_df[column] = [codes.get(val, 3) for val in input_df[column]]

    # Clean dependents with values like 3+ etc. Missing values count as 0.
    # Going through str() and float() also accepts a column that was read
    # as numbers (e.g. 2 or 2.0) rather than as text.
    input_df['Dependents'] = [
        0 if pd.isnull(val) else int(float(str(val).replace('+', '')))
        for val in input_df['Dependents']
    ]
    

def pre_process_training_data():
    """Loads the loan training file and splits it for training and testing.

    Reads "train_data.csv", cleans it with clean_data, encodes Loan_Status
    as 1 for 'Y' and 0 for any other value, prints the first rows, removes
    the four leading columns and holds out 10% of the rows using a fixed
    random seed. Every column but the last is a feature; the last column
    is the label.

    Keyword arguments:
    None

    Returns features_train, features_test, labels_train, labels_test
    """
    loans = pd.read_csv("train_data.csv")
    clean_data(loans)

    # Encode the label column: 'Y' -> 1, everything else -> 0
    loans['Loan_Status'] = [int(status == 'Y') for status in loans['Loan_Status']]

    print(loans.head())

    # The four leading columns are not fed to the model
    loans = loans.drop(loans.columns[:4], axis=1)

    # Features are all remaining columns except the last one (the label)
    feature_frame = loans.iloc[:, :-1]
    label_series = loans.iloc[:, -1]

    # Prepare training and test data
    split = train_test_split(
        feature_frame,
        label_series,
        test_size=0.1,
        random_state=42,
    )
    return tuple(split)
 
def train_model(features_train, labels_train, features_test=None, labels_test=None):
    """Trains the random forest model for loans and prints its accuracy.

    The evaluation data is taken from the arguments instead of from module
    level variables, so the function does not depend on the order in which
    notebook cells were run.

    Keyword arguments:
    features_train -- The input features of the training data set
    labels_train -- The label for each of the training observations
    features_test -- Optional held-out features used only for scoring
    labels_test -- Optional labels matching features_test

    When both features_test and labels_test are given a "Validation Score"
    on that data is printed; when neither is given the "Training Score" on
    the training data is printed.

    Returns the fitted classifier.
    Raises ValueError if only one of features_test / labels_test is given.
    """
    if (features_test is None) != (labels_test is None):
        raise ValueError("features_test and labels_test must be given together")

    # Fixed random_state keeps the fitted forest reproducible between runs
    clf = RandomForestClassifier(n_estimators=9, max_features=None,
                                 min_samples_split=7, random_state=0)
    clf = clf.fit(features_train, labels_train)

    if features_test is not None:
        score = clf.score(features_test, labels_test)
        print("Validation Score %.2f %% " % (score*100))
    else:
        score = clf.score(features_train, labels_train)
        print("Training Score %.2f %% " % (score*100))

    return clf

def pre_process_test_data():
    """Pre process the test data

    Reads "test_data.csv", cleans it with clean_data, keeps the first
    column aside as the loan identifiers, removes the four leading
    columns and prints the first rows of what is left.

    Keyword arguments:
    None

    Returns loan_numbers (the first column of the file) and features_test
    (the remaining columns after the four leading ones are dropped).
    """
    df_loan_test_data = pd.read_csv("test_data.csv")

    clean_data(df_loan_test_data)

    # Positional access to the first column. DataFrame.ix no longer exists
    # in current pandas; iloc is the positional equivalent.
    loan_numbers = df_loan_test_data.iloc[:, 0]

    # Drop the same four leading columns that are dropped from the
    # training data so that both frames share the same feature columns
    df_loan_test_data.drop(df_loan_test_data.columns[[0, 1, 2, 3]], axis=1, inplace=True)

    print(df_loan_test_data.head())

    # Get Input values
    features_test = df_loan_test_data

    return loan_numbers, features_test


In [106]:
# ---- Training phase (timed, including the preprocessing) ----
t0 = time()

# Load the training file and hold part of it out
(features_train,
 features_test,
 labels_train,
 labels_test) = pre_process_training_data()

# Fit the classifier
clf = train_model(features_train, labels_train)

print("Training time: %3.3f seconds" % (round(time() - t0, 3)))

# ---- Prediction phase (timed) ----
t0 = time()

# Load the data to predict on; features_test now refers to this data
loan_numbers, features_test = pre_process_test_data()

# Predict a 0/1 loan status for each application
results = clf.predict(features_test)

# Build the output frame, writing the status as 'Y' (for 1) or 'N'
df_loan_results = pd.DataFrame({
    'Application_ID': loan_numbers,
    'Loan_Status': ['Y' if flag == 1 else 'N' for flag in results],
})

# Write the prediction results to output file
df_loan_results.to_csv("sample_output.csv", index=False)

print("Prediction time: %3.3f seconds" % (round(time() - t0, 3)))

   Application_ID  Gender  Married  Dependents  Education  Self_Employed  \
0            1002       1        2           0          1              2   
1            1003       1        1           1          1              2   
2            1005       1        1           0          1              1   
3            1006       1        1           0          2              2   
4            1008       1        2           0          1              2   

   ApplicantIncome  CoapplicantIncome LoanAmount Loan_Amount_Term  \
0             5849                  0          0              360   
1             4583               1508        128              360   
2             3000                  0         66              360   
3             2583               2358        120              360   
4             6000                  0        141              360   

  Credit_History  Property_Area  Loan_Status  
0              1              1            1  
1              1              3   