In [1]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('suv_data.csv')
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [3]:
df.shape

(400, 5)

In [4]:
df = df.iloc[:,1:]
df.sample(5)

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
125,Female,39,61000,0
74,Male,32,18000,0
376,Female,46,74000,0
326,Male,41,72000,0
130,Male,31,58000,0


In [5]:
# Encoding gender column

df = pd.get_dummies(df,columns=['Gender'], drop_first=True)
df

Unnamed: 0,Age,EstimatedSalary,Purchased,Gender_Male
0,19,19000,0,1
1,35,20000,0,1
2,26,43000,0,0
3,27,57000,0,0
4,19,76000,0,1
...,...,...,...,...
395,46,41000,1,0
396,51,23000,1,1
397,50,20000,1,0
398,36,33000,0,1


In [6]:
X = df[['Age', 'EstimatedSalary', 'Gender_Male']]
y = df['Purchased']

In [7]:
X

Unnamed: 0,Age,EstimatedSalary,Gender_Male
0,19,19000,1
1,35,20000,1
2,26,43000,0
3,27,57000,0
4,19,76000,1
...,...,...,...
395,46,41000,0
396,51,23000,1
397,50,20000,0
398,36,33000,1


In [8]:
y

0      0
1      0
2      0
3      0
4      0
      ..
395    1
396    1
397    1
398    0
399    1
Name: Purchased, Length: 400, dtype: int64

In [9]:
# Scaling

scaler = StandardScaler()

df = scaler.fit_transform(df)
df

array([[-1.78179743, -1.49004624, -0.74593581,  1.02020406],
       [-0.25358736, -1.46068138, -0.74593581,  1.02020406],
       [-1.11320552, -0.78528968, -0.74593581, -0.98019606],
       ...,
       [ 1.17910958, -1.46068138,  1.34059793, -0.98019606],
       [-0.15807423, -1.07893824, -0.74593581,  1.02020406],
       [ 1.08359645, -0.99084367,  1.34059793, -0.98019606]])

# Modelling

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score

In [11]:
clf1 = LogisticRegression()         # base model 1
clf2 = KNeighborsClassifier()       # base model 2
clf3 = DecisionTreeClassifier()     # base model 3

In [12]:
estimators = [('lr',clf1),('knn',clf2),('dt',clf3)]

In [13]:
for estimator in estimators:
    x = cross_val_score(estimator[1],X,y,cv=10,scoring='accuracy')
    print(estimator[0],np.round(np.mean(x),2))

lr 0.64
knn 0.78
dt 0.84


# Hard Voting

In [14]:
vc = VotingClassifier(estimators=estimators,voting='hard')

x = cross_val_score(vc,X,y,cv=10,scoring='accuracy')
print(np.round(np.mean(x),2))

0.81


# Soft Voting

In [15]:
vc1 = VotingClassifier(estimators=estimators,voting='soft')

x = cross_val_score(vc1,X,y,cv=10,scoring='accuracy')
print(np.round(np.mean(x),2))

0.83


# Weighted Voting

In [16]:
for i in range(1,4):
    for j in range(1,4):
        for k in range(1,4):
            vc = VotingClassifier(estimators=estimators,voting='soft',weights=[i,j,k])
            x = cross_val_score(vc,X,y,cv=10,scoring='accuracy')
            print("for i={},j={},k={}".format(i,j,k),np.round(np.mean(x),2))

for i=1,j=1,k=1 0.84
for i=1,j=1,k=2 0.84
for i=1,j=1,k=3 0.84
for i=1,j=2,k=1 0.86
for i=1,j=2,k=2 0.84
for i=1,j=2,k=3 0.84
for i=1,j=3,k=1 0.86
for i=1,j=3,k=2 0.84
for i=1,j=3,k=3 0.83
for i=2,j=1,k=1 0.84
for i=2,j=1,k=2 0.84
for i=2,j=1,k=3 0.85
for i=2,j=2,k=1 0.85
for i=2,j=2,k=2 0.83
for i=2,j=2,k=3 0.83
for i=2,j=3,k=1 0.85
for i=2,j=3,k=2 0.85
for i=2,j=3,k=3 0.84
for i=3,j=1,k=1 0.84
for i=3,j=1,k=2 0.84
for i=3,j=1,k=3 0.84
for i=3,j=2,k=1 0.85
for i=3,j=2,k=2 0.84
for i=3,j=2,k=3 0.84
for i=3,j=3,k=1 0.86
for i=3,j=3,k=2 0.85
for i=3,j=3,k=3 0.83
