In [1]:
import numpy as np
import pandas as pd
from scipy.linalg import svd
from pprint import pprint
from copy import deepcopy
from pdb import set_trace
from collections import Counter
from sklearn.tree import DecisionTreeClassifier

In [2]:
def pca(X, n_components = 5):
    """Use svd to get the top principal directions of X.

    X is decomposed as given; no mean-centering is applied inside this
    function.

    Inputs:
        X: input matrix (2-d, rows = samples, columns = features)
        n_components: number of principal components to keep
    Output:
        principal_components: matrix whose columns are the leading right
            singular vectors of X, ordered by decreasing singular value.
            At most X.shape[1] columns exist, so a larger n_components
            simply returns all of them.
            X_pca = X.dot(principal_components)
    """
    X_arr = np.asarray(X)

    # Only Vh is needed.  When rows >= columns the economy SVD has the same
    # n x n Vh as the full one but skips building the m x m matrix U.  For
    # wide matrices keep the full SVD so that all n right singular vectors
    # remain available, exactly as before.
    need_full = X_arr.ndim == 2 and X_arr.shape[0] < X_arr.shape[1]
    _, _, Vh = svd(X_arr, full_matrices=need_full)

    # Rows of Vh are the right singular vectors; transpose to get columns.
    return Vh.T[:, :n_components]

In [3]:
def vector_norm(x, norm="Min-Max"):
    """Calculate the normalized vector.

    Inputs:
        x: 1-d np.array
        norm: normalization scheme, one of
            "Min-Max":        (x - min(x)) / (max(x) - min(x))
            "L1":             |x| / sum(|x|)
            "L2":             |x| / sqrt(sum(x^2))
            "Standard_Score": (x - mean(x)) / std(x), population std (ddof=0)
    Output:
        x_norm: normalized vector with the same shape as x.  When the
            denominator is zero (constant vector, or all-zero vector for
            L1/L2) a zero vector is returned instead of NaN/inf.
    Raises:
        Exception: if norm is not one of the names listed above.
    """
    if norm == "Min-Max":
        lowest = np.min(x)
        numerator = x - lowest
        denominator = np.max(x) - lowest
    elif norm == "L1":
        # NOTE(review): the numerator is |x|, so signs are discarded --
        # confirm this is intended rather than x / sum(|x|).
        numerator = np.abs(x)
        denominator = np.sum(numerator)
    elif norm == "L2":
        # NOTE(review): same sign-discarding numerator as L1 -- confirm.
        numerator = np.abs(x)
        denominator = np.sqrt(np.sum(np.square(x)))
    elif norm == "Standard_Score":
        # Parenthesised so the mean is subtracted before dividing;
        # numpy's standard deviation function is np.std.
        numerator = x - np.mean(x)
        denominator = np.std(x)
    else:
        raise Exception("Unknown normlization.")

    # A zero denominator implies the numerator is already all zeros in every
    # scheme above, so dividing by 1 yields a clean zero vector.
    if denominator == 0:
        denominator = 1
    return numerator / denominator


In [4]:
def normalize(X, norm="Min-Max", axis = 1):
    """Normalize every row or every column of a matrix with vector_norm.

    Inputs:
        X: input matrix (2-d array-like, e.g. np.array or DataFrame);
           it is copied, never modified
        norm = {"L1", "L2", "Min-Max", "Standard_Score"}
        axis = 0: normalize rows
        axis = 1: normalize columns
    Output:
        X_norm: normalized matrix (numpy.array)
    Raises:
        Exception: if axis is neither 0 nor 1.
        ValueError: if X is not 2-d (from unpacking its shape).
    """
    # np.array always copies, so the caller's data stays untouched.
    X_norm = np.array(X)
    # Integer/bool input must be promoted first: the column branch writes
    # results back into X_norm, and an integer array would silently truncate
    # the normalized values.
    if not np.issubdtype(X_norm.dtype, np.inexact):
        X_norm = X_norm.astype(float)
    m, n = X_norm.shape

    if axis == 1:
        for col in range(n):
            X_norm[:, col] = vector_norm(X_norm[:, col], norm=norm)
    elif axis == 0:
        X_norm = np.array([vector_norm(row, norm=norm) for row in X_norm])
    else:
        raise Exception("Unknown axis.")
    return X_norm

In [5]:
def stratified_sampling(y, ratio, replace = True):
    """Draw a stratified random sample of positions from the labels y.

    Inputs:
        y: class labels (any 1-d array-like: list, np.array, Series)
        0 < ratio < 1: number of samples = len(y) * ratio
        replace = True: sample with replacement
        replace = False: sample without replacement
    Output:
        sample: integer np.array of positional indices into y
                (ratio is the same across each class,
                samples for each class = int(np.ceil(ratio * # data in each class)) ).
                Indices are grouped by class, classes in sorted order.
                An empty y gives an empty array.
    Raises:
        Exception: if ratio is not strictly between 0 and 1.

    Randomness comes from the global numpy RNG (np.random.choice), so call
    np.random.seed beforehand for reproducible samples.
    """
    if ratio<=0 or ratio>=1:
        raise Exception("ratio must be 0 < ratio < 1.")
    y_array = np.asarray(y)

    per_class_samples = []
    # np.unique returns the labels sorted, which keeps the output order
    # deterministic (iteration order of a set of strings is not).
    for label in np.unique(y_array):
        # Compare against the ndarray, not the raw input, so that plain
        # Python lists are matched element-wise as well.
        class_positions = np.flatnonzero(y_array == label)
        # ratio scales the class size; it must be applied outside len().
        n_samples = int(np.ceil(len(class_positions) * ratio))
        per_class_samples.append(
            np.random.choice(class_positions, n_samples, replace=replace)
        )

    if not per_class_samples:
        # np.concatenate rejects an empty list; no labels means no sample.
        return np.array([], dtype=int)
    return np.concatenate(per_class_samples).astype(int)

In [6]:
# Fix the random state so the stratified sample and the fitted tree are
# reproducible under Restart Kernel -> Run All
SEED = 42
np.random.seed(SEED)
# Load training data
data_train = pd.read_csv("../data/Iris_train.csv")
# Separate independent variables and dependent variables
independent = ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"]
X = data_train[independent]
y = data_train["Species"]
# Preprocess (train): min-max scale each column, then project onto the
# principal directions of the scaled training data
X_norm = normalize(X)
# Only 4 feature columns exist, so n_components=5 keeps all 4 directions
principal_components = pca(X_norm, n_components=5)
X_pca = X_norm.dot(principal_components)
sample = stratified_sampling(y, ratio=0.5, replace=False)

X_sample = X_pca[sample]
# sample holds positional indices, so select from the Series by position
# rather than by index label
y_sample = y.iloc[sample].to_numpy()
print(X_pca)
print(Counter(y_sample))
print(Counter(y))
# Fit model
clf = DecisionTreeClassifier(random_state=SEED)
clf.fit(X_sample, y_sample)
# Load testing data
data_test = pd.read_csv("../data/Iris_test.csv")
X_test = data_test[independent]
# Preprocess (test)
# NOTE(review): the test set is scaled with its own column min/max, not the
# training ranges -- confirm this is intended. The projection itself reuses
# the training principal_components.
X_test_norm = normalize(X_test)
X_test_pca = X_test_norm.dot(principal_components)
# Predict
predictions = clf.predict(X_test_pca)
# Output predictions on test data
print(predictions)

[[-4.02412365e-01 -5.53625050e-01  1.96288772e-02  3.28499258e-03]
 [-2.80684541e-01 -3.28001783e-01  1.90197040e-02 -3.15031555e-03]
 [-2.81780612e-01 -4.24248126e-01 -4.54209789e-02 -1.47872560e-02]
 [-2.68275973e-01 -3.69891801e-01 -5.96011710e-02 -4.83634150e-02]
 [-4.07711338e-01 -5.99373471e-01 -1.26360440e-02 -9.13693203e-03]
 [-5.94194937e-01 -6.92860481e-01  5.75859378e-03  2.60314733e-02]
 [-3.38909190e-01 -4.97354462e-01 -1.10145034e-01 -2.12908937e-02]
 [-3.79405850e-01 -5.04018225e-01  5.53784477e-03 -1.70842577e-02]
 [-1.93757318e-01 -2.84926652e-01 -8.75157572e-02 -4.94811891e-02]
 [-2.85666050e-01 -3.81750613e-01  3.30374698e-02 -4.07420032e-02]
 [-4.90535728e-01 -6.38144650e-01  7.06768605e-02  1.41948997e-02]
 [-3.61698309e-01 -5.00159822e-01 -4.08181090e-02 -4.98754325e-02]
 [-2.43655785e-01 -3.41642788e-01  1.91247565e-02 -3.46974358e-02]
 [-5.92681332e-01 -6.59416420e-01  1.07528759e-01  3.62826768e-02]
 [-4.92049332e-01 -6.71588711e-01 -3.10933045e-02  3.94369624e