In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, filename) for filename in filenames):
        print(path)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
import time

# Functions

In [None]:
def fit_and_test_model(model, train_X, train_y, test_X, test_y):
    """Fit ``model`` on the training data and score it on the test data.

    Only the call to ``model.fit`` is timed; scoring is excluded from the
    measured duration.

    Parameters
    ----------
    model
        Object exposing ``fit(X, y)`` and ``score(X, y)``.
    train_X, train_y
        Passed unchanged to ``model.fit``.
    test_X, test_y
        Passed unchanged to ``model.score``.

    Returns
    -------
    tuple
        ``(duration, accuracy)`` where ``duration`` is the number of seconds
        spent inside ``model.fit`` and ``accuracy`` is the value returned by
        ``model.score``.
    """
    # perf_counter is monotonic and high-resolution, so the measured interval
    # cannot be skewed (or go negative) if the system clock is adjusted.
    start_time = time.perf_counter()
    model.fit(train_X, train_y)
    duration = time.perf_counter() - start_time

    accuracy = model.score(test_X, test_y)
    return duration, accuracy

# Load and explore the data

In [None]:
# Location of the training CSV inside the Kaggle input directory.
TRAIN_CSV_PATH = "/kaggle/input/mnist-in-csv/mnist_train.csv"
train_data = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Report the dimensions and the column index of the loaded training frame.
train_shape = train_data.shape
train_columns = train_data.columns
print(f'The train data shape: {train_shape}')
print(f'The columns: {train_columns}')

In [None]:
# Print pandas' summary of the frame (column dtypes, non-null counts, memory usage).
train_data.info()

In [None]:
# Descriptive statistics for every column of the raw training data.
train_data.describe()

In [None]:
# Preview the first 10 rows of the raw training data.
train_data.head(10)

In [None]:
# Show the distinct values found in the 'label' column.
print('The column label can take the following values: ')
label_values = train_data['label'].unique()
label_values

In [None]:
# Separate the target (first column) from the features (all remaining columns).
y = train_data.iloc[:, 0]
X = train_data.iloc[:, 1:]

In [None]:
# Preview the first 10 target values.
y.head(10)

In [None]:
# Preview the first 10 rows of the feature columns.
X.head(10)

In [None]:
# Scale the data by dividing every feature column by 255.
# The features are re-derived from train_data (same slice as the X defined
# above) instead of from X itself, so re-running this cell does not divide
# already-scaled values a second time.
# NOTE(review): presumably the raw values are 0-255 pixel intensities, which
# would put the scaled features in [0, 1] - confirm with X.describe().
X = train_data.iloc[:, 1:] / 255

In [None]:
# Descriptive statistics of the features after scaling, as a sanity check on the division.
X.describe()

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split stable.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Report how many rows ended up in each partition.
for message, dataset in (
    ('Initial size of the dataset:', X),
    ('The size of the dataset used for training the model: ', train_X),
    ('The size of the dataset used for validating the model: ', test_X),
):
    print(message, len(dataset))

# 1. Max Voting

In [None]:
# Base learners that will vote on each prediction.
model1_1 = LogisticRegression(random_state=1, max_iter=150)
model1_2 = DecisionTreeClassifier(random_state=1)

# Combine the base learners with hard (majority-label) voting.
voting_estimators = [('lr', model1_1), ('dt', model1_2)]
voting_classifier_model = VotingClassifier(estimators=voting_estimators, voting='hard')

# Fit the ensemble and keep both the fit time and the validation score.
voting_result = fit_and_test_model(
    voting_classifier_model, train_X, train_y, test_X, test_y
)
duration_voting_classifier, acc_voting_classifier = voting_result

# Bagging classifier

In [None]:
# Seed the ensemble itself: the bootstrap resampling is driven by
# BaggingClassifier's own random_state, so seeding only the base tree does
# not make the fitted ensemble (or its score) reproducible across runs.
bagging_classifier = BaggingClassifier(DecisionTreeClassifier(random_state=1), random_state=1)
duration_bagging_classifier, acc_bagging_classifier = fit_and_test_model(bagging_classifier, train_X, train_y, test_X, test_y)

# AdaBoost (Adaptive Boosting)

Steps for performing the AdaBoost algorithm:
 - Initially, all observations in the dataset are given equal weights.
 - A model is built on a subset of data.
 - Using this model, predictions are made on the whole dataset.
 - Errors are calculated by comparing the predictions and actual values.
 - While creating the next model, higher weights are given to the data points which were predicted incorrectly.
 - Weights are determined from the error value: the higher the error, the greater the weight assigned to the observation.
 - This process is repeated until the error function does not change, or the maximum limit of the number of estimators is reached.

In [None]:
# AdaBoost with its default settings and a fixed seed.
ada_boost_classifier = AdaBoostClassifier(random_state=1)

# Fit the model and keep both the fit time and the validation score.
ada_boost_result = fit_and_test_model(
    ada_boost_classifier, train_X, train_y, test_X, test_y
)
duration_ada_boost, acc_ada_boost = ada_boost_result

# Compare the results

In [None]:
# Summarise fit time and accuracy for the three ensembles.
# The header row and both value rows are generated from the same list, so
# every number is printed under the model it belongs to. (Previously the
# header listed AdaBoost before Bagging while the values were printed in the
# opposite order.)
results = [
    ('MaxVoting', duration_voting_classifier, acc_voting_classifier),
    ('BaggingClassifier', duration_bagging_classifier, acc_bagging_classifier),
    ('AdaBoostClassifier', duration_ada_boost, acc_ada_boost),
]
# Fixed column widths keep the table aligned regardless of value length.
label_width, column_width = 16, 24

header_row = ''.join(f'{name:<{column_width}}' for name, _, _ in results)
duration_row = ''.join(f'{duration:<{column_width}}' for _, duration, _ in results)
accuracy_row = ''.join(f'{accuracy:<{column_width}}' for _, _, accuracy in results)

print(f"{'':<{label_width}}{header_row}\n")
print(f"{'Duration(sec):':<{label_width}}{duration_row}\n")
print(f"{'Accuracy:':<{label_width}}{accuracy_row}\n")