# Calculating Voting Classifier Accuracy with Classification Models

In [1]:
import pandas as pd

data = pd.read_csv("E:/Data Science/CSV dataset/Loan payments data.csv")
data

Unnamed: 0,Loan_ID,loan_status,Principal,terms,effective_date,due_date,paid_off_time,past_due_days,age,education,Gender
0,xqd20166231,PAIDOFF,1000,30,9/8/2016,10/7/2016,9/14/2016 19:31,,45,High School or Below,male
1,xqd20168902,PAIDOFF,1000,30,9/8/2016,10/7/2016,10/7/2016 9:00,,50,Bechalor,female
2,xqd20160003,PAIDOFF,1000,30,9/8/2016,10/7/2016,9/25/2016 16:58,,33,Bechalor,female
3,xqd20160004,PAIDOFF,1000,15,9/8/2016,9/22/2016,9/22/2016 20:00,,27,college,male
4,xqd20160005,PAIDOFF,1000,30,9/9/2016,10/8/2016,9/23/2016 21:36,,28,college,female
...,...,...,...,...,...,...,...,...,...,...,...
495,xqd20160496,COLLECTION_PAIDOFF,1000,30,9/12/2016,10/11/2016,10/14/2016 19:08,3.0,28,High School or Below,male
496,xqd20160497,COLLECTION_PAIDOFF,1000,15,9/12/2016,9/26/2016,10/10/2016 20:02,14.0,26,High School or Below,male
497,xqd20160498,COLLECTION_PAIDOFF,800,15,9/12/2016,9/26/2016,9/29/2016 11:49,3.0,30,college,male
498,xqd20160499,COLLECTION_PAIDOFF,1000,30,9/12/2016,11/10/2016,11/11/2016 22:40,1.0,38,college,female


In [2]:
# Mapping output variable with binary values

data['loan_status'] = data['loan_status'].map({'PAIDOFF': 1, 'COLLECTION_PAIDOFF': 0, 'COLLECTION': 0})
data

Unnamed: 0,Loan_ID,loan_status,Principal,terms,effective_date,due_date,paid_off_time,past_due_days,age,education,Gender
0,xqd20166231,1,1000,30,9/8/2016,10/7/2016,9/14/2016 19:31,,45,High School or Below,male
1,xqd20168902,1,1000,30,9/8/2016,10/7/2016,10/7/2016 9:00,,50,Bechalor,female
2,xqd20160003,1,1000,30,9/8/2016,10/7/2016,9/25/2016 16:58,,33,Bechalor,female
3,xqd20160004,1,1000,15,9/8/2016,9/22/2016,9/22/2016 20:00,,27,college,male
4,xqd20160005,1,1000,30,9/9/2016,10/8/2016,9/23/2016 21:36,,28,college,female
...,...,...,...,...,...,...,...,...,...,...,...
495,xqd20160496,0,1000,30,9/12/2016,10/11/2016,10/14/2016 19:08,3.0,28,High School or Below,male
496,xqd20160497,0,1000,15,9/12/2016,9/26/2016,10/10/2016 20:02,14.0,26,High School or Below,male
497,xqd20160498,0,800,15,9/12/2016,9/26/2016,9/29/2016 11:49,3.0,30,college,male
498,xqd20160499,0,1000,30,9/12/2016,11/10/2016,11/11/2016 22:40,1.0,38,college,female


# Filling missing values:: 

In [3]:
mean  = data['past_due_days'].mean()
data['past_due_days'].fillna(mean, axis = 0, inplace = True)
data.isnull().sum()

Loan_ID             0
loan_status         0
Principal           0
terms               0
effective_date      0
due_date            0
paid_off_time     100
past_due_days       0
age                 0
education           0
Gender              0
dtype: int64

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Loan_ID         500 non-null    object 
 1   loan_status     500 non-null    int64  
 2   Principal       500 non-null    int64  
 3   terms           500 non-null    int64  
 4   effective_date  500 non-null    object 
 5   due_date        500 non-null    object 
 6   paid_off_time   400 non-null    object 
 7   past_due_days   500 non-null    float64
 8   age             500 non-null    int64  
 9   education       500 non-null    object 
 10  Gender          500 non-null    object 
dtypes: float64(1), int64(4), object(6)
memory usage: 43.1+ KB


In [5]:
data.describe()

Unnamed: 0,loan_status,Principal,terms,past_due_days,age
count,500.0,500.0,500.0,500.0,500.0
mean,0.6,943.2,22.824,36.01,31.116
std,0.490389,115.240274,8.000064,18.55415,6.084784
min,0.0,300.0,7.0,1.0,18.0
25%,0.0,1000.0,15.0,36.01,27.0
50%,1.0,1000.0,30.0,36.01,30.0
75%,1.0,1000.0,30.0,36.01,35.0
max,1.0,1000.0,30.0,76.0,51.0


In [6]:
# Defining our Classification Models :

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier


random_state = 1234
model1 = RandomForestClassifier(n_estimators=250, criterion='gini', n_jobs= -1, random_state = random_state)
model2 = RandomForestClassifier(n_estimators=250, criterion='entropy', n_jobs= -1, random_state = random_state)
model3 = GradientBoostingClassifier()




In [8]:
# Selecting columns with object values

col_list = list(data.select_dtypes(include = 'object'))
col_list

['Loan_ID',
 'effective_date',
 'due_date',
 'paid_off_time',
 'education',
 'Gender']

In [9]:
# Dropping columns with object values

data1 = data.drop(['Loan_ID', 'effective_date', 'due_date', 'paid_off_time', 'education', 'Gender'], axis = 1)
data1

Unnamed: 0,loan_status,Principal,terms,past_due_days,age
0,1,1000,30,36.01,45
1,1,1000,30,36.01,50
2,1,1000,30,36.01,33
3,1,1000,15,36.01,27
4,1,1000,30,36.01,28
...,...,...,...,...,...
495,0,1000,30,3.00,28
496,0,1000,15,14.00,26
497,0,800,15,3.00,30
498,0,1000,30,1.00,38


In [10]:
# Changing categorial values into binary values with get dummies

dummies = pd.get_dummies(data[['education', 'Gender']])

df = pd.concat([data1, dummies], axis = 1)
df

Unnamed: 0,loan_status,Principal,terms,past_due_days,age,education_Bechalor,education_High School or Below,education_Master or Above,education_college,Gender_female,Gender_male
0,1,1000,30,36.01,45,0,1,0,0,0,1
1,1,1000,30,36.01,50,1,0,0,0,1,0
2,1,1000,30,36.01,33,1,0,0,0,1,0
3,1,1000,15,36.01,27,0,0,0,1,0,1
4,1,1000,30,36.01,28,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...
495,0,1000,30,3.00,28,0,1,0,0,0,1
496,0,1000,15,14.00,26,0,1,0,0,0,1
497,0,800,15,3.00,30,0,0,0,1,0,1
498,0,1000,30,1.00,38,0,0,0,1,1,0


In [11]:
Y = data['loan_status']             # Target Variable
Y

0      1
1      1
2      1
3      1
4      1
      ..
495    0
496    0
497    0
498    0
499    0
Name: loan_status, Length: 500, dtype: int64

# Splitting the Data into training and validation data

In [12]:
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(df, Y, test_size = 0.2)

In [13]:
# training our models and fitting values 

model1.fit(x_train, y_train)
model2.fit(x_train, y_train)
model3.fit(x_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [14]:
# Making predictions

pred1 = model1.predict_proba(x_val)
pred2 = model2.predict_proba(x_val)
pred3 = model3.predict_proba(x_val)


In [15]:
# Calculating Loss of various models

from sklearn.metrics import log_loss
x = log_loss(y_val, pred1)
y = log_loss(y_val, pred2)
z = log_loss(y_val, pred3)

x,y,z

(0.008413128540431948, 0.008718514918346562, 0.00028196712634336605)

# Using Voting Classifier using all three models ::

In [16]:
eclf = VotingClassifier(estimators=[('rf1', model1), ('rf2', model2), ('gbc', model3)], voting='soft', weights = [1,1,2])
eclf.fit(x_train, y_train)
y_val_pred = eclf.predict_proba(x_val)
log_loss(y_val, y_val_pred)

0.004376847583545313

In [17]:
# Using mode to store the best values of prediction among three models using same input values

import numpy as np
from scipy.stats import mode
final_pred = []

for i in range(0, len(x_val)):
    final_pred = np.append(final_pred, mode([pred1[i], pred2[i], pred3[i]]))
    

# Using Voting Regression to predict for continuous data :

In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

random_state = 1111

m1 = LinearRegression()
m2 = DecisionTreeRegressor(random_state = random_state)
m3 = RandomForestRegressor(random_state = random_state)

In [19]:
dataset = pd.read_csv("E:/Data Science/CSV dataset/Placement_Data_Full_Class.csv")
dataset

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.00,Others,91.00,Others,Commerce,58.00,Sci&Tech,No,55.0,Mkt&HR,58.80,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.00,Central,68.00,Central,Arts,64.00,Comm&Mgmt,No,75.0,Mkt&Fin,57.80,Placed,250000.0
3,4,M,56.00,Central,52.00,Central,Science,52.00,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.80,Central,73.60,Central,Commerce,73.30,Comm&Mgmt,No,96.8,Mkt&Fin,55.50,Placed,425000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,211,M,80.60,Others,82.00,Others,Commerce,77.60,Comm&Mgmt,No,91.0,Mkt&Fin,74.49,Placed,400000.0
211,212,M,58.00,Others,60.00,Others,Science,72.00,Sci&Tech,No,74.0,Mkt&Fin,53.62,Placed,275000.0
212,213,M,67.00,Others,67.00,Others,Commerce,73.00,Comm&Mgmt,Yes,59.0,Mkt&Fin,69.72,Placed,295000.0
213,214,F,74.00,Others,66.00,Others,Commerce,58.00,Comm&Mgmt,No,70.0,Mkt&HR,60.23,Placed,204000.0


In [20]:
dataset.isnull().sum()

sl_no              0
gender             0
ssc_p              0
ssc_b              0
hsc_p              0
hsc_b              0
hsc_s              0
degree_p           0
degree_t           0
workex             0
etest_p            0
specialisation     0
mba_p              0
status             0
salary            67
dtype: int64

# Filling missing values:: 

In [21]:
mean_val = round(dataset['salary'].mean(), 0)
dataset['salary'].fillna(mean_val, axis = 0, inplace = True)
dataset

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.00,Others,91.00,Others,Commerce,58.00,Sci&Tech,No,55.0,Mkt&HR,58.80,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.00,Central,68.00,Central,Arts,64.00,Comm&Mgmt,No,75.0,Mkt&Fin,57.80,Placed,250000.0
3,4,M,56.00,Central,52.00,Central,Science,52.00,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,288655.0
4,5,M,85.80,Central,73.60,Central,Commerce,73.30,Comm&Mgmt,No,96.8,Mkt&Fin,55.50,Placed,425000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,211,M,80.60,Others,82.00,Others,Commerce,77.60,Comm&Mgmt,No,91.0,Mkt&Fin,74.49,Placed,400000.0
211,212,M,58.00,Others,60.00,Others,Science,72.00,Sci&Tech,No,74.0,Mkt&Fin,53.62,Placed,275000.0
212,213,M,67.00,Others,67.00,Others,Commerce,73.00,Comm&Mgmt,Yes,59.0,Mkt&Fin,69.72,Placed,295000.0
213,214,F,74.00,Others,66.00,Others,Commerce,58.00,Comm&Mgmt,No,70.0,Mkt&HR,60.23,Placed,204000.0


In [22]:
columns = list(dataset.select_dtypes(include = 'object'))
columns

['gender',
 'ssc_b',
 'hsc_b',
 'hsc_s',
 'degree_t',
 'workex',
 'specialisation',
 'status']

In [23]:
# Ceating new dataframe consisting only categorial values

new_df = pd.DataFrame(dataset[columns])
new_df

Unnamed: 0,gender,ssc_b,hsc_b,hsc_s,degree_t,workex,specialisation,status
0,M,Others,Others,Commerce,Sci&Tech,No,Mkt&HR,Placed
1,M,Central,Others,Science,Sci&Tech,Yes,Mkt&Fin,Placed
2,M,Central,Central,Arts,Comm&Mgmt,No,Mkt&Fin,Placed
3,M,Central,Central,Science,Sci&Tech,No,Mkt&HR,Not Placed
4,M,Central,Central,Commerce,Comm&Mgmt,No,Mkt&Fin,Placed
...,...,...,...,...,...,...,...,...
210,M,Others,Others,Commerce,Comm&Mgmt,No,Mkt&Fin,Placed
211,M,Others,Others,Science,Sci&Tech,No,Mkt&Fin,Placed
212,M,Others,Others,Commerce,Comm&Mgmt,Yes,Mkt&Fin,Placed
213,F,Others,Others,Commerce,Comm&Mgmt,No,Mkt&HR,Placed


In [24]:
# Using dummies function to change categorial values into binary values

dummies = pd.get_dummies(new_df)
dummies

Unnamed: 0,gender_F,gender_M,ssc_b_Central,ssc_b_Others,hsc_b_Central,hsc_b_Others,hsc_s_Arts,hsc_s_Commerce,hsc_s_Science,degree_t_Comm&Mgmt,degree_t_Others,degree_t_Sci&Tech,workex_No,workex_Yes,specialisation_Mkt&Fin,specialisation_Mkt&HR,status_Not Placed,status_Placed
0,0,1,0,1,0,1,0,1,0,0,0,1,1,0,0,1,0,1
1,0,1,1,0,0,1,0,0,1,0,0,1,0,1,1,0,0,1
2,0,1,1,0,1,0,1,0,0,1,0,0,1,0,1,0,0,1
3,0,1,1,0,1,0,0,0,1,0,0,1,1,0,0,1,1,0
4,0,1,1,0,1,0,0,1,0,1,0,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,0,1,0,1,0,1,0,1,0,1,0,0,1,0,1,0,0,1
211,0,1,0,1,0,1,0,0,1,0,0,1,1,0,1,0,0,1
212,0,1,0,1,0,1,0,1,0,1,0,0,0,1,1,0,0,1
213,1,0,0,1,0,1,0,1,0,1,0,0,1,0,0,1,0,1


In [25]:
# dropping the categorial values

df_excluded = dataset.drop(columns, axis = 1)
df_excluded

Unnamed: 0,sl_no,ssc_p,hsc_p,degree_p,etest_p,mba_p,salary
0,1,67.00,91.00,58.00,55.0,58.80,270000.0
1,2,79.33,78.33,77.48,86.5,66.28,200000.0
2,3,65.00,68.00,64.00,75.0,57.80,250000.0
3,4,56.00,52.00,52.00,66.0,59.43,288655.0
4,5,85.80,73.60,73.30,96.8,55.50,425000.0
...,...,...,...,...,...,...,...
210,211,80.60,82.00,77.60,91.0,74.49,400000.0
211,212,58.00,60.00,72.00,74.0,53.62,275000.0
212,213,67.00,67.00,73.00,59.0,69.72,295000.0
213,214,74.00,66.00,58.00,70.0,60.23,204000.0


In [26]:
# concatenating two created datset to be used for evaluation

final = pd.concat([df_excluded, dummies], axis = 1)
final

Unnamed: 0,sl_no,ssc_p,hsc_p,degree_p,etest_p,mba_p,salary,gender_F,gender_M,ssc_b_Central,...,hsc_s_Science,degree_t_Comm&Mgmt,degree_t_Others,degree_t_Sci&Tech,workex_No,workex_Yes,specialisation_Mkt&Fin,specialisation_Mkt&HR,status_Not Placed,status_Placed
0,1,67.00,91.00,58.00,55.0,58.80,270000.0,0,1,0,...,0,0,0,1,1,0,0,1,0,1
1,2,79.33,78.33,77.48,86.5,66.28,200000.0,0,1,1,...,1,0,0,1,0,1,1,0,0,1
2,3,65.00,68.00,64.00,75.0,57.80,250000.0,0,1,1,...,0,1,0,0,1,0,1,0,0,1
3,4,56.00,52.00,52.00,66.0,59.43,288655.0,0,1,1,...,1,0,0,1,1,0,0,1,1,0
4,5,85.80,73.60,73.30,96.8,55.50,425000.0,0,1,1,...,0,1,0,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,211,80.60,82.00,77.60,91.0,74.49,400000.0,0,1,0,...,0,1,0,0,1,0,1,0,0,1
211,212,58.00,60.00,72.00,74.0,53.62,275000.0,0,1,0,...,1,0,0,1,1,0,1,0,0,1
212,213,67.00,67.00,73.00,59.0,69.72,295000.0,0,1,0,...,0,1,0,0,0,1,1,0,0,1
213,214,74.00,66.00,58.00,70.0,60.23,204000.0,1,0,0,...,0,1,0,0,1,0,0,1,0,1


In [27]:
A = final.drop(['salary'], axis = 1)
B = final['salary']

In [28]:
A            # Predictor Variable

Unnamed: 0,sl_no,ssc_p,hsc_p,degree_p,etest_p,mba_p,gender_F,gender_M,ssc_b_Central,ssc_b_Others,...,hsc_s_Science,degree_t_Comm&Mgmt,degree_t_Others,degree_t_Sci&Tech,workex_No,workex_Yes,specialisation_Mkt&Fin,specialisation_Mkt&HR,status_Not Placed,status_Placed
0,1,67.00,91.00,58.00,55.0,58.80,0,1,0,1,...,0,0,0,1,1,0,0,1,0,1
1,2,79.33,78.33,77.48,86.5,66.28,0,1,1,0,...,1,0,0,1,0,1,1,0,0,1
2,3,65.00,68.00,64.00,75.0,57.80,0,1,1,0,...,0,1,0,0,1,0,1,0,0,1
3,4,56.00,52.00,52.00,66.0,59.43,0,1,1,0,...,1,0,0,1,1,0,0,1,1,0
4,5,85.80,73.60,73.30,96.8,55.50,0,1,1,0,...,0,1,0,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,211,80.60,82.00,77.60,91.0,74.49,0,1,0,1,...,0,1,0,0,1,0,1,0,0,1
211,212,58.00,60.00,72.00,74.0,53.62,0,1,0,1,...,1,0,0,1,1,0,1,0,0,1
212,213,67.00,67.00,73.00,59.0,69.72,0,1,0,1,...,0,1,0,0,0,1,1,0,0,1
213,214,74.00,66.00,58.00,70.0,60.23,1,0,0,1,...,0,1,0,0,1,0,0,1,0,1


In [29]:
B                   #Target variable

0      270000.0
1      200000.0
2      250000.0
3      288655.0
4      425000.0
         ...   
210    400000.0
211    275000.0
212    295000.0
213    204000.0
214    288655.0
Name: salary, Length: 215, dtype: float64

In [30]:
a_train, a_val, b_train, b_val  = train_test_split(A, B, test_size = 0.2, random_state = random_state)


In [31]:
m1.fit(a_train, b_train)
m2.fit(a_train, b_train)
m3.fit(a_train, b_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=1111, verbose=0, warm_start=False)

In [32]:
predict1 = m1.predict(a_val)
predict2 = m2.predict(a_val)
predict3 = m3.predict(a_val)

In [33]:
# Calculating accuracy of our regression models

from sklearn.metrics import r2_score

a = r2_score(b_val, predict1)
b = r2_score(b_val, predict2)
c = r2_score(b_val, predict3)
a, b, c

(-0.12435243810604057, -0.5732721219724719, -0.049625803243785294)

# Using Voting Regressor to predict continuous values

In [34]:
from sklearn.ensemble import VotingRegressor

vc = VotingRegressor([('lr', m1), ('dt', m2), ('rt', m3)])
vc

VotingRegressor(estimators=[('lr',
                             LinearRegression(copy_X=True, fit_intercept=True,
                                              n_jobs=None, normalize=False)),
                            ('dt',
                             DecisionTreeRegressor(ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features=None,
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   p

In [35]:
# training new model and fitting the values

vc.fit(a_train, b_train)
prediction = vc.predict(a_val)
d = r2_score(b_val, prediction)
d

-0.13546343899725666

In [36]:
a

-0.12435243810604057

In [37]:
b

-0.5732721219724719

In [38]:
c

-0.049625803243785294

In [39]:
d

-0.13546343899725666

# Calculating Root Mean Squared Error to check for accuaracy of models

In [40]:
from sklearn.metrics import mean_squared_error

e = np.sqrt(mean_squared_error(b_val, predict1))
f = np.sqrt(mean_squared_error(b_val, predict2))
g = np.sqrt(mean_squared_error(b_val, predict3))
h = np.sqrt(mean_squared_error(b_val, prediction))

In [41]:
e

81282.84632525464

In [42]:
f

96150.06755430634

In [43]:
g

78535.30223495112

In [44]:
h

81683.4829416218

* It seems that Random Regressor model outperforms the other models and hence make the best prediction here. 