In [1]:
import pandas as pd #for manipulating the csv data

import numpy as np #for mathematical calculation

# ID3 Implementation

In [2]:
class ID3:
    def __init__(self):
        self.tree={}
        
    def calc_total_entropy(self,train_data, label, class_list):
        total_row = train_data.shape[0]
        total_entr = 0

        for c in class_list:
            total_class_count = train_data[train_data[label] == c].shape[0]
            total_class_entr = - (total_class_count/total_row)*np.log2(total_class_count/total_row) 
            total_entr += total_class_entr

        return total_entr
    
    def calc_entropy(self,feature_value_data, label, class_list):
        class_count = feature_value_data.shape[0]
        entropy = 0

        for c in class_list:
            label_class_count = feature_value_data[feature_value_data[label] == c].shape[0]

            entropy_class = 0
            if label_class_count != 0:
                probability_class = label_class_count/class_count
                entropy_class = - probability_class * np.log2(probability_class) 

            entropy += entropy_class

        return entropy
    
    def calc_info_gain(self,feature_name, train_data, label, class_list):
        feature_value_list = train_data[feature_name].unique()
        total_row = train_data.shape[0]
        feature_info = 0.0

        for feature_value in feature_value_list:
            feature_value_data = train_data[train_data[feature_name] == feature_value]
            feature_value_count = feature_value_data.shape[0]
            feature_value_entropy = self.calc_entropy(feature_value_data, label, class_list)
            feature_value_probability = feature_value_count/total_row
            feature_info += feature_value_probability * feature_value_entropy

        return self.calc_total_entropy(train_data, label, class_list) - feature_info
    
    def find_most_informative_feature(self,train_data, label, class_list):
        feature_list = train_data.columns.drop(label)
        max_info_gain = -1
        max_info_feature = None

        for feature in feature_list:  
            feature_info_gain = self.calc_info_gain(feature, train_data, label, class_list)
            if max_info_gain < feature_info_gain:
                max_info_gain = feature_info_gain
                max_info_feature = feature

        return max_info_feature
    
    def generate_sub_tree(self,feature_name, train_data, label, class_list):
        feature_value_count_dict = train_data[feature_name].value_counts(sort=False)
        tree = {}

        for feature_value, count in feature_value_count_dict.iteritems():
            feature_value_data = train_data[train_data[feature_name] == feature_value]

            assigned_to_node = False
            for c in class_list:
                class_count = feature_value_data[feature_value_data[label] == c].shape[0]

                if class_count == count:
                    tree[feature_value] = c
                    train_data = train_data[train_data[feature_name] != feature_value]
                    assigned_to_node = True
            if not assigned_to_node:
                tree[feature_value] = "?"

        return tree, train_data
    
    def make_tree(self,root, prev_feature_value, train_data, label, class_list):
        if train_data.shape[0] != 0:
            max_info_feature = self.find_most_informative_feature(train_data, label, class_list)
            tree, train_data = self.generate_sub_tree(max_info_feature, train_data, label, class_list)
            next_root = None

            if prev_feature_value != None:
                root[prev_feature_value] = dict()
                root[prev_feature_value][max_info_feature] = tree
                next_root = root[prev_feature_value][max_info_feature]
            else:
                root[max_info_feature] = tree
                next_root = root[max_info_feature]

            for node, branch in list(next_root.items()):
                if branch == "?":
                    feature_value_data = train_data[train_data[max_info_feature] == node]
                    self.make_tree(next_root, node, feature_value_data, label, class_list)
    
    def id3(self,train_data_m, label):
        train_data = train_data_m.copy()
        tree = {}
        class_list = train_data[label].unique()
        self.make_tree(tree, None, train_data_m, label, class_list)

        return tree
    

In [2]:
def predict(tree, instance):
    if not isinstance(tree, dict):
        return tree
    else:
        root_node = next(iter(tree))
        feature_value = instance[root_node]
        if feature_value in tree[root_node]:
            return predict(tree[root_node][feature_value], instance)
        else:
            return None
        
def evaluate(tree, test_data_m, label):
    correct_preditct = 0
    wrong_preditct = 0
    for index, row in test_data_m.iterrows():
        result = predict(tree, test_data_m.loc[index])
        if result == test_data_m[label].loc[index]:
            correct_preditct += 1
        else:
            wrong_preditct += 1
    accuracy = correct_preditct / (correct_preditct + wrong_preditct)
    return accuracy

# Ada Boost Implementation

In [None]:
class DecisionStump:
    def __init__(self):
        self.polarity = 1
        self.threshold = None
        self.feature_idx = None
        self.alpha = None
        
    def predict(self,X):
        n_samples = X.shape[0]
        X_c = X[:,self.feature_idx]
        preds = np.ones(n_samples)
        
        if self.polarity ==1:
            preds[X_c < self.threshold] = -1
        else:
            preds[X_c > self.threshold] = -1
            
        return preds
      
class myAdaBoost:
    def __init__(self,n_clf=5):
        self.n_clf = n_clf
        
    def fit(self,X,y):
        n_samples,n_features = X.shape
        w = np.full(n_samples, (1/n_samples))
        
        self.clfs=[]
        for _ in range(self.n_clf):
            clf = DecisionStump()
            min_error = float('inf')
            for feat in range(n_features):
                X_c = X[:,feat]
                thresholds=np.unique(X_c)
                for threshold in thresholds:
                    p=1
                    preds=np.ones(n_samples)
                    preds[X_c<threshold]=-1
                    
                    misclassified = w[y!=preds]
                    error=sum(misclassified)
                    
                    if error >0.5:
                        p=-1
                        error=1-error
                    
                    if error<min_error:
                        min_error=error
                        clf.threshold=threshold
                        clf.feature_idx=feat
                        clf.polarity=p
            
            EPS=1e-10
            clf.alpha=0.5*np.log((1.0-min_error+EPS)/(min_error+EPS))
            preds = clf.predict(X)
            w *= np.exp(-clf.alpha*y*preds)
            w/=np.sum(w)
            self.clfs.append(clf)
            
    def predict(self,X):
        clf_preds = [clf.alpha*clf.predict(X) for clf in self.clfs]
        y_pred = np.sum(clf_preds,axis=0)
        y_pred = np.sign(y_pred)
        return y_pred

## Method to calculate accuracy
def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

# Random Forest Implementation

In [None]:
from collections import Counter

def entropy(y):
    hist = np.bincount(y)
    ps = hist / len(y)
    return -np.sum([p * np.log2(p) for p in ps if p > 0])

class Node:
    def __init__(
        self, feature=None, threshold=None, left=None, right=None, *, value=None
    ):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

    def is_leaf_node(self):
        return self.value is not None


class DecisionTree:
    def __init__(self, min_samples_split=2, max_depth=100, n_feats=None):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_feats = n_feats
        self.root = None

    def fit(self, X, y):
        self.n_feats = X.shape[1] if not self.n_feats else min(self.n_feats, X.shape[1])
        self.root = self._grow_tree(X, y)

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _grow_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_labels = len(np.unique(y))

        # stopping criteria
        if (
            depth >= self.max_depth
            or n_labels == 1
            or n_samples < self.min_samples_split
        ):
            leaf_value = self._most_common_label(y)
            return Node(value=leaf_value)

        feat_idxs = np.random.choice(n_features, self.n_feats, replace=False)

        # greedily select the best split according to information gain
        best_feat, best_thresh = self._best_criteria(X, y, feat_idxs)

        # grow the children that result from the split
        left_idxs, right_idxs = self._split(X[:, best_feat], best_thresh)
        left = self._grow_tree(X[left_idxs, :], y[left_idxs], depth + 1)
        right = self._grow_tree(X[right_idxs, :], y[right_idxs], depth + 1)
        return Node(best_feat, best_thresh, left, right)

    def _best_criteria(self, X, y, feat_idxs):
        best_gain = -1
        split_idx, split_thresh = None, None
        for feat_idx in feat_idxs:
            X_column = X[:, feat_idx]
            thresholds = np.unique(X_column)
            for threshold in thresholds:
                gain = self._information_gain(y, X_column, threshold)

                if gain > best_gain:
                    best_gain = gain
                    split_idx = feat_idx
                    split_thresh = threshold

        return split_idx, split_thresh

    def _information_gain(self, y, X_column, split_thresh):
        # parent loss
        parent_entropy = entropy(y)

        # generate split
        left_idxs, right_idxs = self._split(X_column, split_thresh)

        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return 0

        # compute the weighted avg. of the loss for the children
        n = len(y)
        n_l, n_r = len(left_idxs), len(right_idxs)
        e_l, e_r = entropy(y[left_idxs]), entropy(y[right_idxs])
        child_entropy = (n_l / n) * e_l + (n_r / n) * e_r

        # information gain is difference in loss before vs. after split
        ig = parent_entropy - child_entropy
        return ig

    def _split(self, X_column, split_thresh):
        left_idxs = np.argwhere(X_column <= split_thresh).flatten()
        right_idxs = np.argwhere(X_column > split_thresh).flatten()
        return left_idxs, right_idxs

    def _traverse_tree(self, x, node):
        if node.is_leaf_node():
            return node.value

        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)

    def _most_common_label(self, y):
        counter = Counter(y)
        most_common = counter.most_common(1)[0][0]
        return most_common


def bootstrap_sample(X, y):
    n_samples = X.shape[0]
    idxs = np.random.choice(n_samples, n_samples, replace=True)
    return X[idxs], y[idxs]


def most_common_label(y):
    counter = Counter(y)
    most_common = counter.most_common(1)[0][0]
    return most_common


class RandomForest:
    def __init__(self, n_trees=10, min_samples_split=2, max_depth=100, n_feats=None):
        self.n_trees = n_trees
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_feats = n_feats
        self.trees = []

    def fit(self, X, y):
        self.trees = []
        for _ in range(self.n_trees):
            tree = DecisionTree(
                min_samples_split=self.min_samples_split,
                max_depth=self.max_depth,
                n_feats=self.n_feats,
            )
            X_samp, y_samp = bootstrap_sample(X, y)
            tree.fit(X_samp, y_samp)
            self.trees.append(tree)

    def predict(self, X):
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        tree_preds = np.swapaxes(tree_preds, 0, 1)
        y_pred = [most_common_label(tree_pred) for tree_pred in tree_preds]
        return np.array(y_pred)


In [10]:
class ID3:
    def __init__(self):
        self.tree={}
        
    def calc_entropy(self,feature_data, label, class_list):
        class_count = feature_data.shape[0]
        entropy = 0

        for c in class_list:
            label_class_count = feature_data[feature_data[label] == c].shape[0]

            entropy_class = 0
            if label_class_count != 0:
                probability_class = label_class_count/class_count
                entropy_class = - probability_class * np.log2(probability_class) 

            entropy += entropy_class

        return entropy
        
    def total_entropy(self,train_data, label, class_list):
        total_row = train_data.shape[0]
        total_entr = 0

        for c in class_list:
            total_class_count = train_data[train_data[label] == c].shape[0]
            total_class_entr = - (total_class_count/total_row)*np.log2(total_class_count/total_row) 
            total_entr += total_class_entr

        return total_entr
    
    
    def info_gain(self,feature_name, train_data, label, class_list):
        flist = train_data[feature_name].unique()
        total_row = train_data.shape[0]
        feature_info = 0.0

        for feature_value in flist:
            fdata = train_data[train_data[feature_name] == feature_value]
            feature_count = fdata.shape[0]
            feature_value_entropy = self.calc_entropy(fdata, label, class_list)
            feature_value_probability = feature_count/total_row
            feature_info += feature_value_probability * feature_value_entropy

        return self.total_entropy(train_data, label, class_list) - feature_info
    
    def most_informative_feature(self,train_data, label, class_list):
        feature_list = train_data.columns.drop(label)
        max_info = -1
        maxfeature = None

        for feature in feature_list:  
            feature_info_gain = self.info_gain(feature, train_data, label, class_list)
            if max_info < feature_info_gain:
                max_info = feature_info_gain
                maxfeature = feature

        return maxfeature
    
    def sub_tree(self,fname, train_data, label, class_list):
        feature_value_count_dict = train_data[fname].value_counts(sort=False)
        tree = {}

        for fvalue, count in feature_value_count_dict.iteritems():
            feature_value_data = train_data[train_data[fname] == fvalue]

            assigned = False
            for c in class_list:
                ccount = feature_value_data[feature_value_data[label] == c].shape[0]
                if ccount == count:
                    tree[fvalue] = c
                    train_data = train_data[train_data[fname] != fvalue]
                    assigned = True
            if not assigned:
                tree[fvalue] = "?"

        return tree, train_data
    
    def buildtree(self,root, prev_feature, train_data, label, cls):
        if train_data.shape[0] != 0:
            max_info_f = self.most_informative_feature(train_data, label, cls)
            tree, train_data = self.sub_tree(max_info_f, train_data, label, cls)
            next_root = None

            if prev_feature != None:
                root[prev_feature] = dict()
                root[prev_feature][max_info_f] = tree
                next_root = root[prev_feature][max_info_f]
            else:
                root[max_info_f] = tree
                next_root = root[max_info_f]

            for node, branch in list(next_root.items()):
                if branch == "?":
                    feature_value = train_data[train_data[max_info_f] == node]
                    self.buildtree(next_root, node, feature_value, label, cls)
    
    def id3(self,train_data_m, label):
        train_data = train_data_m.copy()
        tree = {}
        class_list = train_data[label].unique()
        self.buildtree(tree, None, train_data_m, label, class_list)

        return tree

# Naive Bayes Implementation

In [4]:
class NaiveBayes:
    def fit(self, X, y):
        n_samples, n_features = X.shape
        self._classes = np.unique(y)
        n_classes = len(self._classes)

        # calculate mean, var, and prior for each class
        self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
        self._var = np.zeros((n_classes, n_features), dtype=np.float64)
        self._priors = np.zeros(n_classes, dtype=np.float64)

        for idx, c in enumerate(self._classes):
            X_c = X[y == c]
            self._mean[idx, :] = X_c.mean(axis=0)
            self._var[idx, :] = X_c.var(axis=0)
            self._priors[idx] = X_c.shape[0] / float(n_samples)

    def predict(self, X):
        y_pred = [self._predict(x) for x in X]
        return np.array(y_pred)

    def _predict(self, x):
        posteriors = []

        # calculate posterior probability for each class
        for idx, c in enumerate(self._classes):
            prior = np.log(self._priors[idx])
            posterior = np.sum(np.log(self._pdf(idx, x)))
            posterior = prior + posterior
            posteriors.append(posterior)

        # return class with highest posterior probability
        return self._classes[np.argmax(posteriors)]

    def _pdf(self, class_idx, x):
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        numerator = np.exp(-((x - mean) ** 2) / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        return numerator / denominator

# KNN Distance 3 Implementation

In [None]:
from collections import Counter

import numpy as np


def euclidean_distance(x1, x2):
    return np.sqrt(np.sum(x1 -x2) ** 2)


class KNN:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        y_pred = [self._predict(x) for x in X]
        return np.array(y_pred)

    def _predict(self, x):
        # Compute distances between x and all examples in the training set
        distances = [euclidean_distance(x, x_train) for x_train in self.X_train]
        # Sort by distance and return indices of the first k neighbors
        k_idx = np.argsort(distances)[: self.k]
        # Extract the labels of the k nearest neighbor training samples
        k_neighbor_labels = [self.y_train[i] for i in k_idx]
        # return the most common class label
        most_common = Counter(k_neighbor_labels).most_common(1)
        return most_common[0][0]

In [12]:
class NaiveBayes:
    def fit(self, X, y):
        n_samples, n_features = X.shape
        self._classes = np.unique(y)
        n_classes = len(self._classes)

        self.mean_val = np.zeros((n_classes, n_features), dtype=np.float64)
        self.var_val = np.zeros((n_classes, n_features), dtype=np.float64)
        self.priors = np.zeros(n_classes, dtype=np.float64)

        for idx, c in enumerate(self._classes):
            X_change = X[y == c]
            self.mean_val[idx, :] = X_change.mean(axis=0)
            self.var_val[idx, :] = X_change.var(axis=0)
            self.priors[idx] = X_change.shape[0] / float(n_samples)

    def predict(self, X):
        y_pred = [self._predict(x) for x in X]
        return y_pred

    def _predict(self, x):
        posteriors = []

        for idx, c in enumerate(self._classes):
            prior = np.log(self.priors[idx])
            posterior = np.sum(np.log(self.pdf(idx, x)))
            posterior = prior + posterior
            posteriors.append(posterior)

        return self._classes[np.argmax(posteriors)]

    def pdf(self, class_idx, x):
        mean = self.mean_val[class_idx]
        var = self.var_val[class_idx]
        numerator = np.exp(-((x - mean) ** 2) / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        return numerator / denominator

# Breast Cancer Dataset

In [4]:
data = pd.read_csv('breast-cancer-wisconsin.data', header=None)
data.columns = ['Sample code', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',
                'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
                'Normal Nucleoli', 'Mitoses','Class']

data = data.drop(['Sample code'],axis=1)
print('Number of instances = %d' % (data.shape[0]))
print('Number of attributes = %d' % (data.shape[1]))
data.head()

Number of instances = 699
Number of attributes = 10


Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


In [5]:
import numpy as np

data = data.replace('?',np.NaN)

print('Number of instances = %d' % (data.shape[0]))
print('Number of attributes = %d' % (data.shape[1]))

print('Number of missing values:')
for col in data.columns:
    print('\t%s: %d' % (col,data[col].isna().sum()))

Number of instances = 699
Number of attributes = 10
Number of missing values:
	Clump Thickness: 0
	Uniformity of Cell Size: 0
	Uniformity of Cell Shape: 0
	Marginal Adhesion: 0
	Single Epithelial Cell Size: 0
	Bare Nuclei: 16
	Bland Chromatin: 0
	Normal Nucleoli: 0
	Mitoses: 0
	Class: 0


In [6]:
data2 = data['Bare Nuclei']

print('Before replacing missing values:')
print(data2[20:25])
data2 = data2.fillna(data2.median())

print('\nAfter replacing missing values:')
print(data2[20:25])

Before replacing missing values:
20     10
21      7
22      1
23    NaN
24      1
Name: Bare Nuclei, dtype: object

After replacing missing values:
20    10
21     7
22     1
23     1
24     1
Name: Bare Nuclei, dtype: object


In [7]:
print('Number of rows in original data = %d' % (data.shape[0]))

data2 = data.dropna()
print('Number of rows after discarding missing values = %d' % (data2.shape[0]))

Number of rows in original data = 699
Number of rows after discarding missing values = 683


In [8]:
x = data2.iloc[:,:-1].values.astype(int)

y = data2.iloc[:,-1].values.astype(int)
x.shape,y.shape

((683, 9), (683,))

### Random Forest

In [None]:
from sklearn.model_selection import RepeatedKFold
cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=None)
cnt=0
r=0
# X is the feature set and y is the target
for train_index, test_index in cv.split(x,y):
       
    #print("Train:", train_index, "Validation:", test_index)
    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf = RandomForest(n_trees=3, max_depth=10)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = accuracy(y_test, y_pred)

    print("Accuracy:", acc)

### ID3

In [11]:
from sklearn.model_selection import RepeatedKFold
cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=None)
cnt=0
r=0
# X is the feature set and y is the target
for train_index, test_index in cv.split(x,y):
    scores=[]
     
    #print("Train:", train_index, "Validation:", test_index)
    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    X_train = pd.DataFrame(X_train)
    y_train = pd.DataFrame(y_train)
    y_test = pd.DataFrame(y_test)
    x_test = pd.DataFrame(X_test)
    X_train['Class']=y_train
    train=X_train
    x_test['Class']=y_test
    test=x_test
    t=ID3()
    tree=t.id3(train,'Class')
    accuracy = evaluate(tree, test, 'Class')
    scores.append(accuracy)
print('Accuracy: %.2f%%' % (sum(scores)/float(len(scores))))

    

Accuracy: 0.95%


### Naive Bayes

In [13]:
def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy


from sklearn.model_selection import RepeatedKFold
cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=None)
cnt=0
r=0
# X is the feature set and y is the target
for train_index, test_index in cv.split(x,y):
    scores=[]   
    #print("Train:", train_index, "Validation:", test_index)
    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]

    nb = NaiveBayes()
    nb.fit(X_train, y_train)
    predictions = nb.predict(X_test)
    score=accuracy(y_test, predictions)
    scores.append(score)
print('Accuracy: %.2f%%' % (sum(scores)/float(len(scores))))



Accuracy: 0.98%


### KNN DISTANCE 3

In [None]:
def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy


from sklearn.model_selection import RepeatedKFold
cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=None)
cnt=0
r=0
# X is the feature set and y is the target
for train_index, test_index in cv.split(x,y):
    scores=[]
    #print("Train:", train_index, "Validation:", test_index)
    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    k = 3
    clf = KNN(k=k)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    score=accuracy(y_test, predictions)
    scores.append(score)
print('Accuracy: %.2f%%' % (sum(scores)/float(len(scores))))

### ADA BOOST

In [None]:
def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

from sklearn.model_selection import RepeatedKFold
cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=None)
cnt=0
r=0
y[y==2]=1
y[y==4]=-1
# X is the feature set and y is the target
for train_index, test_index in cv.split(x,y):
    scores=[]  
    #print("Train:", train_index, "Validation:", test_index)
    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    clf = myAdaBoost()
    ## Fitting the model on train data
    clf.fit(X_train, y_train)

    ## Predict the target's for 20% test data
    y_pred = clf.predict(X_test)
    acc = accuracy(y_test, y_pred)
    scores.append(acc)
print('Accuracy: %.2f%%' % (sum(scores)/float(len(scores))))


# Car Datasetr

In [None]:
df = pd.read_csv('car.data', header=0)

In [None]:
df.describe()

In [None]:
col_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']


df.columns = col_names



In [None]:
X = df.drop(['class'], axis=1)

y = df['class']

In [None]:
df.shape

In [None]:
df=df.dropna()

In [None]:
! pip install --upgrade category_encoders

In [None]:
# importing necessary package for encoding our categorial features
import category_encoders as ce

encoder_X = ce.OrdinalEncoder(cols=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'])
x= encoder_X.fit_transform(X)

encoder_Y = ce.OrdinalEncoder()
y=np.ravel(encoder_Y.fit_transform(y))

In [None]:
x=pd.DataFrame(x)

In [None]:
x

In [None]:
y=pd.DataFrame(y)
y.columns=['class']
y

In [None]:
x = x.values.astype(int)

y = y.values.astype(int)

In [None]:
from sklearn.model_selection import RepeatedKFold
cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=None)
cnt=0
r=0

y[y==1]=2
y[y==2]=4
y[y==3]=6
y[y==4]=8
# X is the feature set and y is the target
for train_index, test_index in cv.split(x,y):
    
    scores=[]
    #print("Train:", train_index, "Validation:", test_index)
    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    X_train = pd.DataFrame(X_train)
    y_train = pd.DataFrame(y_train)
    y_test = pd.DataFrame(y_test)
    x_test = pd.DataFrame(X_test)
    X_train['class']=y_train
    train=X_train
    x_test['class']=y_test
    test=x_test
    t=ID3()
    tree=t.id3(train,'class')
    score = evaluate(tree, test, 'class')
#     score=accuracy(y_test, test_predictions)
    scores.append(score)
print('Accuracy: %.2f%%' % (sum(scores)/float(len(scores))))


In [None]:
x

In [None]:
y

In [None]:
from sklearn.model_selection import RepeatedKFold
cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=None)
cnt=0
r=0
# X is the feature set and y is the target
for train_index, test_index in cv.split(x,y):
       
    #print("Train:", train_index, "Validation:", test_index)
    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    clf = myAdaBoost()
    ## Fitting the model on train data
    clf.fit(X_train, y_train)

#     ## Predict the target's for 20% test data
#     y_pred = clf.predict(X_test)
#     acc = accuracy(y_test, y_pred)
#     print('accuracy: ', acc)

In [None]:
def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy


from sklearn.model_selection import RepeatedKFold
cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=None)
cnt=0
r=0
# X is the feature set and y is the target
for train_index, test_index in cv.split(x,y):
       
    #print("Train:", train_index, "Validation:", test_index)
    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    k = 3
    clf = KNN(k=k)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    print("KNN classification accuracy", accuracy(y_test, predictions))

# Mushroom

In [None]:
import pandas as pd
df =  pd.read_csv("mushroom.data",header=None)

In [None]:
df.columns = ['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat']
col=['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat']

In [None]:
df

In [None]:
from sklearn.preprocessing import LabelEncoder
labelencoder=LabelEncoder()
for column in df.columns:
    df[column] = labelencoder.fit_transform(df[column])

In [None]:
for column in col:
    print(len(df[column].unique())-1)

In [None]:
for column in col:

    fi=(len(df[column].unique())-1)
    value=0
    step = 1/(fi+1)
    for i in df[column].unique():
        df[column] = [value if letter == i else letter for letter in df[column]]
        value += step

In [None]:
df.describe()

In [None]:
df

In [None]:
x = df.iloc[:,1:].values.astype(int)
y= df.iloc[:,:1].values.astype(int)


In [None]:
x

In [None]:
y

In [None]:
from sklearn.model_selection import RepeatedKFold
cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=None)
cnt=0
r=0
# X is the feature set and y is the target
for train_index, test_index in cv.split(x,y):
       
    #print("Train:", train_index, "Validation:", test_index)
    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf = RandomForest(n_trees=5, max_depth=10)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = accuracy(y_test, y_pred)

    print("Accuracy:", acc)

In [None]:
from sklearn.model_selection import RepeatedKFold
cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=None)
cnt=0
r=0

y[y==1]=2
y[y==2]=4
# X is the feature set and y is the target
for train_index, test_index in cv.split(x,y):
    
     
    #print("Train:", train_index, "Validation:", test_index)
    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    X_train = pd.DataFrame(X_train)
    y_train = pd.DataFrame(y_train)
    y_test = pd.DataFrame(y_test)
    x_test = pd.DataFrame(X_test)
    X_train['Class']=y_train
    train=X_train
    x_test['Class']=y_test
    test=x_test
    t=ID3()
    tree=t.id3(train,'Class')
    accuracy = evaluate(tree, test, 'Class')
    print('accuracy: '+ str(accuracy))

In [None]:
def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy


from sklearn.model_selection import RepeatedKFold
cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=None)
cnt=0
r=0
# X is the feature set and y is the target
for train_index, test_index in cv.split(x,y):
       
    #print("Train:", train_index, "Validation:", test_index)
    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]

    nb = NaiveBayes()
    nb.fit(X_train, y_train)
    predictions = nb.predict(X_test)

    print("Naive Bayes classification accuracy", accuracy(y_test, predictions))

In [None]:
def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy


from sklearn.model_selection import RepeatedKFold
cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=None)
cnt=0
r=0
# X is the feature set and y is the target
for train_index, test_index in cv.split(x,y):
       
    #print("Train:", train_index, "Validation:", test_index)
    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    k = 5
    clf = KNN(k=k)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    print("KNN classification accuracy", accuracy(y_test, predictions))

In [None]:
from sklearn.model_selection import RepeatedKFold
cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=None)
cnt=0
r=0

y[y==1]=1
y[y==2]=-1
# X is the feature set and y is the target
for train_index, test_index in cv.split(x,y):
       
    #print("Train:", train_index, "Validation:", test_index)
    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    clf = myAdaBoost()
    ## Fitting the model on train data
    clf.fit(X_train, y_train)

    ## Predict the target's for 20% test data
    y_pred = clf.predict(X_test)
    acc = accuracy(y_test, y_pred)
    print('accuracy: ', acc)

In [None]:
import numpy as np
from numpy.core.umath_tests import inner1d
from copy import deepcopy


class AdaBoostClassifier(object):
    '''

    '''

    def __init__(self, *args, **kwargs):
        if kwargs and args:
            raise ValueError(
                '''AdaBoostClassifier can only be called with keyword
                   arguments for the following keywords: base_estimator ,n_estimators,
                    learning_rate,algorithm,random_state''')
        allowed_keys = ['base_estimator', 'n_estimators', 'learning_rate', 'algorithm', 'random_state']
        keywords_used = kwargs.keys()
        for keyword in keywords_used:
            if keyword not in allowed_keys:
                raise ValueError(keyword + ":  Wrong keyword used --- check spelling")

        n_estimators = 50
        learning_rate = 1
        algorithm = 'SAMME.R'
        random_state = None

        if kwargs and not args:
            if 'base_estimator' in kwargs:
                base_estimator = kwargs.pop('base_estimator')
            else:
                raise ValueError('''base_estimator can not be None''')
            if 'n_estimators' in kwargs: n_estimators = kwargs.pop('n_estimators')
            if 'learning_rate' in kwargs: learning_rate = kwargs.pop('learning_rate')
            if 'algorithm' in kwargs: algorithm = kwargs.pop('algorithm')
            if 'random_state' in kwargs: random_state = kwargs.pop('random_state')

        self.base_estimator_ = base_estimator
        self.n_estimators_ = n_estimators
        self.learning_rate_ = learning_rate
        self.algorithm_ = algorithm
        self.random_state_ = random_state
        self.estimators_ = list()
        self.estimator_weights_ = np.zeros(self.n_estimators_)
        self.estimator_errors_ = np.ones(self.n_estimators_)


    def _samme_proba(self, estimator, n_classes, X):
        """Calculate algorithm 4, step 2, equation c) of Zhu et al [1].
        References
        ----------
        .. [1] J. Zhu, H. Zou, S. Rosset, T. Hastie, "Multi-class AdaBoost", 2009.
        """
        proba = estimator.predict_proba(X)

        # Displace zero probabilities so the log is defined.
        # Also fix negative elements which may occur with
        # negative sample weights.
        proba[proba < np.finfo(proba.dtype).eps] = np.finfo(proba.dtype).eps
        log_proba = np.log(proba)

        return (n_classes - 1) * (log_proba - (1. / n_classes)
                                  * log_proba.sum(axis=1)[:, np.newaxis])


    def fit(self, X, y):
        self.n_samples = X.shape[0]
        # There is hidden trouble for classes, here the classes will be sorted.
        # So in boost we have to ensure that the predict results have the same classes sort
        self.classes_ = np.array(sorted(list(set(y))))
        self.n_classes_ = len(self.classes_)
        for iboost in range(self.n_estimators_):
            if iboost == 0:
                sample_weight = np.ones(self.n_samples) / self.n_samples

            sample_weight, estimator_weight, estimator_error = self.boost(X, y, sample_weight)

            # early stop
            if estimator_error == None:
                break

            # append error and weight
            self.estimator_errors_[iboost] = estimator_error
            self.estimator_weights_[iboost] = estimator_weight

            if estimator_error <= 0:
                break

        return self


    def boost(self, X, y, sample_weight):
        if self.algorithm_ == 'SAMME':
            return self.discrete_boost(X, y, sample_weight)
        elif self.algorithm_ == 'SAMME.R':
            return self.real_boost(X, y, sample_weight)

    def real_boost(self, X, y, sample_weight):
        estimator = deepcopy(self.base_estimator_)
        if self.random_state_:
            estimator.set_params(random_state=1)

        estimator.fit(X, y, sample_weight=sample_weight)

        y_pred = estimator.predict(X)
        incorrect = y_pred != y
        estimator_error = np.dot(incorrect, sample_weight) / np.sum(sample_weight, axis=0)

        # if worse than random guess, stop boosting
        if estimator_error >= 1.0 - 1 / self.n_classes_:
            return None, None, None

        y_predict_proba = estimator.predict_proba(X)
        # repalce zero
        y_predict_proba[y_predict_proba < np.finfo(y_predict_proba.dtype).eps] = np.finfo(y_predict_proba.dtype).eps

        y_codes = np.array([-1. / (self.n_classes_ - 1), 1.])
        y_coding = y_codes.take(self.classes_ == y[:, np.newaxis])

        # for sample weight update
        intermediate_variable = (-1. * self.learning_rate_ * (((self.n_classes_ - 1) / self.n_classes_) *
                                                              inner1d(y_coding, np.log(
                                                                  y_predict_proba))))  #dot iterate for each row

        # update sample weight
        sample_weight *= np.exp(intermediate_variable)

        sample_weight_sum = np.sum(sample_weight, axis=0)
        if sample_weight_sum <= 0:
            return None, None, None

        # normalize sample weight
        sample_weight /= sample_weight_sum

        # append the estimator
        self.estimators_.append(estimator)

        return sample_weight, 1, estimator_error


    def discrete_boost(self, X, y, sample_weight):
        estimator = deepcopy(self.base_estimator_)
        if self.random_state_:
            estimator.set_params(random_state=1)

        estimator.fit(X, y, sample_weight=sample_weight)

        y_pred = estimator.predict(X)
        incorrect = y_pred != y
        estimator_error = np.dot(incorrect, sample_weight) / np.sum(sample_weight, axis=0)

        # if worse than random guess, stop boosting
        if estimator_error >= 1 - 1 / self.n_classes_:
            return None, None, None

        # update estimator_weight
        estimator_weight = self.learning_rate_ * np.log((1 - estimator_error) / estimator_error) + np.log(
            self.n_classes_ - 1)

        if estimator_weight <= 0:
            return None, None, None

        # update sample weight
        sample_weight *= np.exp(estimator_weight * incorrect)

        sample_weight_sum = np.sum(sample_weight, axis=0)
        if sample_weight_sum <= 0:
            return None, None, None

        # normalize sample weight
        sample_weight /= sample_weight_sum

        # append the estimator
        self.estimators_.append(estimator)

        return sample_weight, estimator_weight, estimator_error

    def predict(self, X):
        n_classes = self.n_classes_
        classes = self.classes_[:, np.newaxis]
        pred = None

        if self.algorithm_ == 'SAMME.R':
            # The weights are all 1. for SAMME.R
            pred = sum(self._samme_proba(estimator, n_classes, X) for estimator in self.estimators_)
        else:  # self.algorithm == "SAMME"
            pred = sum((estimator.predict(X) == classes).T * w
                       for estimator, w in zip(self.estimators_,
                                               self.estimator_weights_))

        pred /= self.estimator_weights_.sum()
        if n_classes == 2:
            pred[:, 0] *= -1
            pred = pred.sum(axis=1)
            return self.classes_.take(pred > 0, axis=0)

        return self.classes_.take(np.argmax(pred, axis=1), axis=0)


    def predict_proba(self, X):
        if self.algorithm_ == 'SAMME.R':
            # The weights are all 1. for SAMME.R
            proba = sum(self._samme_proba(estimator, self.n_classes_, X)
                        for estimator in self.estimators_)
        else:  # self.algorithm == "SAMME"
            proba = sum(estimator.predict_proba(X) * w
                        for estimator, w in zip(self.estimators_,
                                                self.estimator_weights_))

        proba /= self.estimator_weights_.sum()
        proba = np.exp((1. / (n_classes - 1)) * proba)
        normalizer = proba.sum(axis=1)[:, np.newaxis]
        normalizer[normalizer == 0.0] = 1.0
        proba /= normalizer

        return proba