# Cheat sheet

#### Models
1. Decision Tree
2. Random Forest
3. AdaBoost
4. Gradient Boosting
5. K-Means
6. Gaussian Mixture Model
7. Neural Network

#### Preprocessing
1. PCA

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 1. Decision Tree

In [2]:
class DecisionTree:
    """Classification tree grown greedily by minimising weighted Gini impurity.

    Class labels must be non-negative integers: they are used directly as
    indices into per-class count lists. Labels do not have to be contiguous
    (e.g. ``{0, 2}`` is fine); the number of count slots is ``max(y) + 1``.

    Parameters
    ----------
    max_depth : int
        Maximum depth of the tree; a node at this depth is never split.
    """

    def __init__(self,max_depth):
        self.root = None  # populated by fit()
        self.max_depth = max_depth

    class Node:
        """A single tree node; it is a leaf for as long as ``left`` is None."""

        def __init__(self, gini, num_samples, num_samples_per_class, predicted_class):
            self.gini = gini
            self.num_samples = num_samples
            self.num_samples_per_class = num_samples_per_class
            self.predicted_class = predicted_class
            # Split description, overwritten by _fit when the node is split.
            self.feature_index = 0
            self.threshold = 0
            self.left = None
            self.right = None

    def find_split(self, X, y, n_classes):
        """Search every feature for the threshold with the lowest weighted Gini.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
        y : integer ndarray of shape (n_samples,), values in ``range(n_classes)``
        n_classes : int
            Number of per-class count slots.

        Returns
        -------
        (feature_ix, threshold)
            Index of the best feature and the midpoint threshold, or
            ``(None, None)`` when no split improves on the parent impurity.
        """
        n_samples, n_features = X.shape
        if n_samples <= 1:
            return None, None

        # Defaults returned when no candidate beats the parent impurity.
        feature_ix, threshold = None, None

        # Samples per class in the parent node.
        sample_per_class_parent = [np.sum(y == c) for c in range(n_classes)]

        # A split is only accepted if it is strictly better than the parent.
        best_gini = 1.0 - sum((n / n_samples) ** 2 for n in sample_per_class_parent)

        for feature in range(n_features):
            # One argsort orders both the feature values and their labels.
            sort_idx = np.argsort(X[:, feature])
            sample_sorted = X[sort_idx, feature]
            y_sorted = y[sort_idx]

            # Start with everything on the right; move samples left one by one.
            sample_per_class_left = [0] * n_classes
            sample_per_class_right = sample_per_class_parent.copy()

            for i in range(1, n_samples):
                # Sample i-1 crosses from the right side to the left side.
                c = y_sorted[i - 1]
                sample_per_class_left[c] += 1
                sample_per_class_right[c] -= 1

                # Equal neighbouring values cannot be separated by a threshold.
                if sample_sorted[i] == sample_sorted[i - 1]:
                    continue

                # The left side now holds i samples, the right side the rest.
                gini_left = 1.0 - sum(
                    (sample_per_class_left[x] / i) ** 2 for x in range(n_classes)
                )
                gini_right = 1.0 - sum(
                    (sample_per_class_right[x] / (n_samples - i)) ** 2 for x in range(n_classes)
                )
                weighted_gini = (i / n_samples) * gini_left + ((n_samples - i) / n_samples) * gini_right

                if weighted_gini < best_gini:
                    best_gini = weighted_gini
                    feature_ix = feature
                    threshold = (sample_sorted[i] + sample_sorted[i - 1]) / 2

        return feature_ix, threshold

    def fit(self,X,y):
        """Grow the tree on ``X`` (n_samples, n_features) and integer labels ``y``."""
        # Labels index the count lists directly, so the lists need max(y) + 1
        # slots. len(set(y)) is too small whenever a label value is missing,
        # e.g. in a bootstrap sample that happens to skip one class.
        n_classes = int(np.max(y)) + 1
        self.root = self._fit(X,y,n_classes)

    def _fit(self,X,y,n_classes,depth = 0):
        """Recursively build and return the subtree for the samples ``X``, ``y``."""
        # Accept any integer dtype (int32, uint8, ...), not only the platform default.
        assert np.issubdtype(y.dtype, np.integer)

        n_samples,n_features = X.shape
        num_samples_per_class = [np.sum(y == i) for i in range(n_classes)]
        predicted_class = np.argmax(num_samples_per_class)

        node = self.Node(
            gini = 1-sum((count/n_samples)**2 for count in num_samples_per_class),
            predicted_class = predicted_class,
            num_samples = y.size,
            num_samples_per_class = num_samples_per_class
        )

        if (depth >= self.max_depth):
            return node

        feature, threshold = self.find_split(X,y,n_classes)
        if feature is not None:
            indices_left = X[:,feature] < threshold
            # Floating-point rounding of the midpoint can, in rare cases, put
            # every sample on one side; keep the node as a leaf instead of
            # recursing into an empty child.
            if indices_left.all() or not indices_left.any():
                return node
            X_left, y_left = X[indices_left], y[indices_left]
            # ~ negates the boolean mask
            X_right, y_right = X[~indices_left], y[~indices_left]
            # Remember the decision rule for predict().
            node.feature_index = feature
            node.threshold = threshold
            node.left = self._fit(X_left, y_left, n_classes, depth + 1)
            node.right = self._fit(X_right, y_right, n_classes, depth + 1)
        return node

    def predict(self,sample):
        """Return the predicted class for one sample (a 1-D feature vector)."""
        tree = self.root
        # Internal nodes always have both children, so checking left suffices.
        while tree.left:
            if sample[tree.feature_index] < tree.threshold:
                tree = tree.left
            else:
                tree = tree.right
        return tree.predicted_class

In [3]:
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split

# Synthetic four-centre blob dataset; fixed seeds keep data and split reproducible.
X, y = make_blobs(
    n_samples=1000,
    centers=4,
    cluster_std=1.0,
    random_state=0,
)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Train the hand-written tree on the training split.
tree = DecisionTree(max_depth=7)
tree.fit(X_train, y_train)

In [5]:
# Predict one sample at a time, then report accuracy as a percentage.
pred = list(map(tree.predict, X_test))
print("Prediction:\t", np.asarray(pred))
print("Y:\t\t", y_test)
print("Accuracy", 100 * (y_test == np.asarray(pred)).sum() / y_test.size)

Prediction:	 [3 0 0 3 0 1 1 3 1 2 0 0 0 1 2 0 1 3 1 3 2 2 1 1 0 1 2 3 3 3 1 3 0 0 3 2 1
 1 2 3 3 2 3 2 3 3 0 3 3 2 2 0 3 1 1 1 1 3 0 3 3 3 0 0 1 2 3 2 3 1 3 2 1 0
 0 3 2 2 0 0 3 1 3 2 0 2 3 0 1 1 2 3 0 1 3 2 1 2 2 2 1 1 2 2 2 1 3 0 0 0 0
 2 3 0 2 2 0 2 2 3 0 0 3 1 1 2 0 1 0 3 2 2 2 3 1 1 1 0 2 3 3 3 1 3 3 1 2 3
 2 3 0 1 2 0 1 2 0 1 1 1 0 2 0 3 1 2 0 2 2 0 0 0 2 0 2 3 2 3 3 0 2 2 0 0 1
 0 3 2 0 2 2 2 3 2 3 2 3 3 0 2]
Y:		 [3 0 0 3 0 1 1 3 1 2 0 0 3 1 2 0 1 0 1 3 2 2 1 1 3 1 2 3 3 3 1 3 0 0 3 2 1
 1 2 3 3 2 3 2 3 3 0 3 3 2 2 2 3 1 1 1 1 3 0 3 3 3 1 0 1 2 3 2 3 1 3 2 0 0
 0 3 2 2 0 0 3 0 3 2 0 2 3 0 1 1 2 3 0 1 3 2 1 2 2 0 1 1 2 2 2 1 3 0 0 0 0
 2 3 0 0 2 0 1 2 3 0 0 3 1 1 2 0 0 0 3 2 2 2 3 1 1 1 0 2 3 3 3 1 3 3 1 2 3
 2 3 2 1 2 0 1 0 0 1 1 1 0 2 0 3 1 2 0 2 2 0 0 0 0 1 2 3 2 0 3 0 2 2 0 2 1
 0 3 2 2 2 2 2 3 2 3 0 3 3 0 2]
Accuracy 90.5


In [6]:
from sklearn.tree import DecisionTreeClassifier

# scikit-learn's decision tree with the same depth limit, for comparison.
model = DecisionTreeClassifier(max_depth=7)
model.fit(X_train, y_train)

pred = model.predict(X_test)
print("Prediction:\t", np.asarray(pred))
print("Y:\t\t", y_test)
print("Accuracy", 100 * (y_test == np.asarray(pred)).sum() / y_test.size)

Prediction:	 [0 0 0 3 0 1 1 3 1 2 0 0 0 1 2 0 0 3 1 3 2 2 1 1 0 1 2 3 3 3 1 3 0 0 3 2 1
 1 2 3 3 2 3 2 3 3 0 3 3 2 2 0 3 1 1 1 1 3 0 3 3 3 0 0 1 2 3 2 3 0 3 2 0 0
 0 3 2 2 0 0 3 0 3 2 0 2 3 0 1 1 2 3 0 1 3 2 1 2 2 2 1 1 2 2 2 1 3 0 0 0 0
 2 3 0 2 2 0 2 2 3 0 0 3 1 1 2 0 1 0 3 2 2 2 3 1 1 1 0 2 3 3 3 1 3 3 1 2 3
 2 3 0 1 2 0 1 2 0 1 1 1 0 2 0 3 1 2 0 2 2 0 0 0 2 0 2 3 2 3 3 0 2 2 0 0 1
 0 3 2 0 2 2 2 3 2 3 2 3 3 0 2]
Y:		 [3 0 0 3 0 1 1 3 1 2 0 0 3 1 2 0 1 0 1 3 2 2 1 1 3 1 2 3 3 3 1 3 0 0 3 2 1
 1 2 3 3 2 3 2 3 3 0 3 3 2 2 2 3 1 1 1 1 3 0 3 3 3 1 0 1 2 3 2 3 1 3 2 0 0
 0 3 2 2 0 0 3 0 3 2 0 2 3 0 1 1 2 3 0 1 3 2 1 2 2 0 1 1 2 2 2 1 3 0 0 0 0
 2 3 0 0 2 0 1 2 3 0 0 3 1 1 2 0 0 0 3 2 2 2 3 1 1 1 0 2 3 3 3 1 3 3 1 2 3
 2 3 2 1 2 0 1 0 0 1 1 1 0 2 0 3 1 2 0 2 2 0 0 0 0 1 2 3 2 0 3 0 2 2 0 2 1
 0 3 2 2 2 2 2 3 2 3 0 3 3 0 2]
Accuracy 90.0


## 2. Random Forest

In [7]:
class RandomForest:
    """Bagged ensemble of ``DecisionTree`` classifiers combined by majority vote.

    Parameters
    ----------
    no_of_trees : int, default 1
        Number of trees in the ensemble.
    max_depth : int, default 3
        Depth limit passed to every tree.
    """

    def __init__(self, no_of_trees = 1, max_depth = 3):
        self.no_of_trees = no_of_trees
        self.trees = [DecisionTree(max_depth = max_depth) for _ in range(no_of_trees)]

    def bagging(self,X,y):
        """Draw one bootstrap sample (with replacement) of ``X``/``y`` per tree.

        Returns
        -------
        x_samples : float ndarray of shape (no_of_trees, n_samples, n_features)
        y_samples : ndarray of shape (no_of_trees, n_samples), dtype of ``y``
        """
        m = X.shape[0]

        # Sample row indices for all trees at once. The rows come from the X
        # and y passed in, not from notebook-level globals.
        idx = np.random.randint(m, size=(self.no_of_trees, m))

        x_samples = X[idx].astype(float)
        y_samples = y[idx]

        return x_samples, y_samples

    def fit(self,X,y):
        """Fit every tree on its own bootstrap sample of ``X``, ``y``."""
        x_samples, y_samples = self.bagging(X,y)

        for i, tree in enumerate(self.trees):
            tree.fit(x_samples[i], y_samples[i])

    def predict(self,X):
        """Return the majority-vote class for each row of ``X``.

        Ties go to the smallest label. Tree predictions must be non-negative
        integers because the vote is counted with ``np.bincount``.
        """
        # votes[t, s] is the class tree t predicts for sample s.
        votes = np.array(
            [[tree.predict(x) for x in X] for tree in self.trees],
            dtype=int,
        )
        # Count votes per sample (column); argmax picks the smallest label on ties.
        return np.array(
            [np.bincount(sample_votes).argmax() for sample_votes in votes.T],
            dtype=int,
        )

In [12]:
# Train the hand-written forest and score it on the held-out split.
forest = RandomForest(no_of_trees=20, max_depth=5)
forest.fit(X_train, y_train)

pred = forest.predict(X_test)

print("Prediction:\t", np.asarray(pred))
print("Y:\t\t", y_test)
print("Accuracy", 100 * (y_test == np.asarray(pred)).sum() / y_test.size)

Prediction:	 [0 0 0 3 0 1 1 3 1 2 0 0 0 1 2 0 0 3 1 3 2 2 1 1 0 1 2 3 3 3 1 3 0 0 3 2 1
 1 2 3 3 2 3 2 3 3 0 3 3 2 2 0 3 1 1 1 1 3 0 3 3 3 1 0 1 2 3 2 3 1 3 2 0 0
 0 3 2 2 0 0 3 0 3 2 0 2 3 0 1 1 2 3 0 1 3 2 1 2 2 2 1 1 2 2 2 1 3 0 0 0 0
 2 3 0 2 2 0 2 2 3 0 0 3 1 1 2 0 1 0 3 2 2 2 3 1 1 1 0 2 3 3 3 1 3 3 1 2 3
 2 3 2 1 2 0 1 2 0 1 1 1 0 2 0 3 1 2 0 2 2 0 0 0 0 0 2 3 2 3 3 0 2 2 0 2 1
 0 3 2 0 2 2 2 3 2 3 2 3 3 0 2]
Y:		 [3 0 0 3 0 1 1 3 1 2 0 0 3 1 2 0 1 0 1 3 2 2 1 1 3 1 2 3 3 3 1 3 0 0 3 2 1
 1 2 3 3 2 3 2 3 3 0 3 3 2 2 2 3 1 1 1 1 3 0 3 3 3 1 0 1 2 3 2 3 1 3 2 0 0
 0 3 2 2 0 0 3 0 3 2 0 2 3 0 1 1 2 3 0 1 3 2 1 2 2 0 1 1 2 2 2 1 3 0 0 0 0
 2 3 0 0 2 0 1 2 3 0 0 3 1 1 2 0 0 0 3 2 2 2 3 1 1 1 0 2 3 3 3 1 3 3 1 2 3
 2 3 2 1 2 0 1 0 0 1 1 1 0 2 0 3 1 2 0 2 2 0 0 0 0 1 2 3 2 0 3 0 2 2 0 2 1
 0 3 2 2 2 2 2 3 2 3 0 3 3 0 2]
Accuracy 92.5


In [13]:
from sklearn.ensemble import RandomForestClassifier

# scikit-learn's random forest with default settings, for comparison.
model = RandomForestClassifier()
model.fit(X_train, y_train)

pred = model.predict(X_test)

print("Prediction:\t", np.asarray(pred))
print("Y:\t\t", y_test)
print("Accuracy", 100 * (y_test == np.asarray(pred)).sum() / y_test.size)

Prediction:	 [0 0 0 3 0 1 1 3 1 2 0 0 0 1 2 0 1 3 1 3 2 2 1 1 0 1 2 3 3 3 1 3 0 0 3 2 1
 1 2 3 3 2 3 2 3 3 0 3 3 2 2 0 3 1 1 1 1 3 0 3 3 3 1 0 1 2 3 2 3 1 3 2 1 0
 0 3 2 2 0 0 3 0 3 2 0 2 3 0 1 1 2 3 0 1 3 2 1 2 2 2 1 1 2 2 2 1 3 0 0 0 0
 2 3 0 2 2 0 2 2 3 0 0 3 1 1 2 0 1 0 3 2 2 2 3 1 1 1 0 2 3 3 3 1 3 3 1 2 3
 2 3 2 1 2 0 1 2 0 1 1 1 0 2 0 3 1 2 0 2 2 0 1 0 0 0 2 3 2 3 3 0 2 2 0 2 1
 0 3 2 0 2 2 2 3 2 3 0 3 3 0 2]
Y:		 [3 0 0 3 0 1 1 3 1 2 0 0 3 1 2 0 1 0 1 3 2 2 1 1 3 1 2 3 3 3 1 3 0 0 3 2 1
 1 2 3 3 2 3 2 3 3 0 3 3 2 2 2 3 1 1 1 1 3 0 3 3 3 1 0 1 2 3 2 3 1 3 2 0 0
 0 3 2 2 0 0 3 0 3 2 0 2 3 0 1 1 2 3 0 1 3 2 1 2 2 0 1 1 2 2 2 1 3 0 0 0 0
 2 3 0 0 2 0 1 2 3 0 0 3 1 1 2 0 0 0 3 2 2 2 3 1 1 1 0 2 3 3 3 1 3 3 1 2 3
 2 3 2 1 2 0 1 0 0 1 1 1 0 2 0 3 1 2 0 2 2 0 0 0 0 1 2 3 2 0 3 0 2 2 0 2 1
 0 3 2 2 2 2 2 3 2 3 0 3 3 0 2]
Accuracy 92.5
