In [1]:
!pip install --upgrade scikit-learn --user



You should consider upgrading via the 'c:\users\user\anaconda3\python.exe -m pip install --upgrade pip' command.


In [2]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import StackingClassifier
from matplotlib import pyplot
from sklearn.datasets import load_iris
from matplotlib.pyplot import figure
# Open matplotlib figure number 2 with a 16x12 inch canvas at 80 dpi,
# white background and black border.
figure(
    num=2,
    figsize=(16, 12),
    dpi=80,
    facecolor='w',
    edgecolor='k',
)

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [17]:
def get_stacking():
  """Build the stacked ensemble used in the model comparison.

  Five default-configured base classifiers (logistic regression, k-nearest
  neighbours, decision tree, SVM, Gaussian naive Bayes) feed a
  default-configured logistic-regression meta-learner.

  Returns:
    An unfitted ``StackingClassifier`` constructed with ``cv=5``.
  """
  # Level-0 learners as (name, estimator) pairs.
  base_learners = [
    ('lr', LogisticRegression()),
    ('knn', KNeighborsClassifier()),
    ('cart', DecisionTreeClassifier()),
    ('svm', SVC()),
    ('bayes', GaussianNB()),
  ]
  # Level-1 learner combines the base learners' predictions.
  return StackingClassifier(
    estimators=base_learners,
    final_estimator=LogisticRegression(),
    cv=5,
  )

In [18]:
def get_models():
    """Return the candidate models to compare, keyed by display name.

    Returns:
        A dict whose insertion order is the five standalone classifiers
        followed by the stacked ensemble from ``get_stacking()``. Every
        estimator is freshly constructed with default settings.
    """
    return {
        'LogisticRegression': LogisticRegression(),
        'KNeighborsClassifier': KNeighborsClassifier(),
        'Decision Tree': DecisionTreeClassifier(),
        'svm': SVC(),
        'GaussianNB': GaussianNB(),
        'stacking': get_stacking(),
    }

In [19]:
def evaluate_model(model):
    """Cross-validate ``model`` for accuracy on both notebook datasets.

    The data is not passed in: the function reads the notebook-level globals
    ``X``/``y`` and ``X1``/``y1``, so those must be defined before it is called.

    Args:
        model: estimator handed to ``cross_val_score``.

    Returns:
        A ``(scores, scores1)`` tuple: the accuracy scores obtained on
        ``X``/``y`` and on ``X1``/``y1`` respectively.
    """
    # 10 stratified folds repeated 3 times, seeded so the splits are repeatable.
    splitter = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    # Identical scoring settings for both runs; error_score='raise' surfaces
    # any fitting failure instead of recording a placeholder score.
    shared_kwargs = dict(scoring='accuracy', cv=splitter, n_jobs=-1, error_score='raise')
    first_scores = cross_val_score(model, X, y, **shared_kwargs)
    second_scores = cross_val_score(model, X1, y1, **shared_kwargs)
    return (first_scores, second_scores)

In [20]:
# Load the training table from train.csv, resolved relative to the current
# working directory.
dataset = pd.read_csv("train.csv")

In [21]:
# Pick seven columns by position. Per the recorded output of the following
# cell these are Pclass, Age, SibSp, Parch, Fare, Embarked and Survived;
# position 1 (Survived) is deliberately listed last.
a = dataset.iloc[:,[2,5,6,7,9,11,1]]

In [22]:
# Show the per-column NaN counts, then mean-impute the first five columns.
print("\nMissing values before imputing ",a.isna().sum())
imp = SimpleImputer(missing_values=np.nan,strategy="mean")
# Switch to the array form of the frame for positional column access.
b = a.values
# Only columns 0-4 are imputed; column 5 (Embarked) and column 6 (Survived)
# are left as they are, which is why column 5 still reports 2 missing values.
b[:,[0,1,2,3,4]] = imp.fit_transform(b[:,[0,1,2,3,4]])
print("\nMissing values after imputing ",pd.DataFrame(b).isna().sum())
# NOTE(review): the imputed values live in `b` only. The recorded output of
# the next cell still shows 177 missing Age values in `a`, and the visible
# cells continue from `a` - confirm whether `b` is meant to be used downstream.


Missing values before imputing  Pclass        0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
Survived      0
dtype: int64

Missing values after imputing  0    0
1    0
2    0
3    0
4    0
5    2
6    0
dtype: int64


In [23]:
# Drop every row that has a missing value and continue with the array form.
# The first report is taken BEFORE dropna, so it is labelled accordingly
# (it previously read "after removal" for both reports).
print("\nMissing values before removal\n",a.isna().sum())
# `a` is already a DataFrame here, so no re-wrapping is needed.
a = a.dropna()
print("\nMissing values after removal\n",a.isna().sum())
# From here on `a` is a NumPy array rather than a DataFrame.
a = a.values


Missing values after removal
 Pclass        0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
Survived      0
dtype: int64

Missing values after removal
 Pclass      0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
Survived    0
dtype: int64


In [24]:
# Label-encode the last column of `a` in place and print the resulting array.
lb = LabelEncoder()
# The last column is the one selected from position 1 (Survived).
# NOTE(review): the recorded output shows this column as 0/1, so the encoding
# looks like a no-op on the values - confirm it is intended for the target
# and not for the string column at index 5.
a[:,-1] = lb.fit_transform(a[:,-1])
print("\nAfter label encoding ",a)


After label encoding  [[3 22.0 1 ... 7.25 'S' 0]
 [1 38.0 1 ... 71.2833 'C' 1]
 [3 26.0 0 ... 7.925 'S' 1]
 ...
 [1 19.0 0 ... 30.0 'S' 1]
 [1 26.0 0 ... 30.0 'C' 1]
 [3 32.0 0 ... 7.75 'Q' 0]]


In [25]:
# One-hot encode column 5 (the string-valued column) and pass every other
# column through unchanged.
transformer = ColumnTransformer(
    transformers = [("OneHot",OneHotEncoder(),[5])],
    remainder = 'passthrough'
)
# The array is handed over as a plain list of rows. In the recorded output the
# one-hot columns come first and the target stays in the last position.
a = transformer.fit_transform(a.tolist())
print("\nAfter one hot encoding ",a)


After one hot encoding  [[0.0 0.0 1.0 ... 0 7.25 0]
 [1.0 0.0 0.0 ... 0 71.2833 1]
 [0.0 0.0 1.0 ... 0 7.925 1]
 ...
 [0.0 0.0 1.0 ... 0 30.0 1]
 [1.0 0.0 0.0 ... 0 30.0 1]
 [0.0 1.0 0.0 ... 0 7.75 0]]


In [26]:
# Split the encoded array: every column but the last is a feature, the last
# column is the target.
X1 = a[:,:-1]
y1 = a[:,-1]

In [27]:
# Standardize every feature column of X1 (this includes the one-hot columns).
scaler = StandardScaler()
X1 = scaler.fit_transform(X1)
# NOTE(review): the scaler is fit on all rows before evaluate_model()
# cross-validates on X1, so held-out folds contribute to the scaling
# statistics. Consider scaling inside a Pipeline per fold.

In [28]:
# Cast the target vector to integers.
# Presumably y1 is object-typed because it was sliced from a mixed-type
# array - TODO confirm.
y1 = y1.astype('int')

In [29]:
# Fetch the iris feature matrix and target vector in one call; the previous
# form loaded the whole dataset twice just to read two attributes.
X, y = load_iris(return_X_y=True)

In [30]:
# Evaluate every candidate on both datasets, collecting the per-fold scores
# and printing mean (std) accuracy as each model finishes.
models = get_models()
print(models.items())
results = []    # scores on X / y, one entry per model
names = []      # model display names, aligned with the score lists
results1 = []   # scores on X1 / y1, one entry per model
for name, model in models.items():
    scores, scores1 = evaluate_model(model)
    names.append(name)
    results.append(scores)
    results1.append(scores1)
    print('->%s -> %.3f (%.3f) --- Iris dataset' % (name, mean(scores), std(scores)))
    print('->%s -> %.3f (%.3f) --- Titanic dataset' % (name, mean(scores1), std(scores1)))

dict_items([('LogisticRegression', LogisticRegression()), ('KNeighborsClassifier', KNeighborsClassifier()), ('Decision Tree', DecisionTreeClassifier()), ('svm', SVC()), ('GaussianNB', GaussianNB()), ('stacking', StackingClassifier(cv=5,
                   estimators=[('lr', LogisticRegression()),
                               ('knn', KNeighborsClassifier()),
                               ('cart', DecisionTreeClassifier()),
                               ('svm', SVC()), ('bayes', GaussianNB())],
                   final_estimator=LogisticRegression()))])
->LogisticRegression -> 0.964 (0.041) --- Iris dataset
->LogisticRegression -> 0.712 (0.041) --- Titanic dataset
->KNeighborsClassifier -> 0.964 (0.037) --- Iris dataset
->KNeighborsClassifier -> 0.701 (0.035) --- Titanic dataset
->Decision Tree -> 0.947 (0.056) --- Iris dataset
->Decision Tree -> 0.662 (0.045) --- Titanic dataset
->svm -> 0.964 (0.045) --- Iris dataset
->svm -> 0.724 (0.044) --- Titanic dataset
->GaussianNB -> 0.956 