In [5]:
import numpy as np
import pandas as pd
from scipy.linalg import svd
from copy import deepcopy
from pdb import set_trace
from collections import Counter
from sklearn.tree import DecisionTreeClassifier

In [6]:
def pca(X, n_components = 5):
    """Return the leading principal directions of X obtained through an SVD.

    Inputs:
        X: input matrix (samples in rows, features in columns).
        n_components: number of principal components to keep.
    Output:
        A matrix whose columns are the first `n_components` right singular
        vectors of X, so that the projected data is
        X_pca = X.dot(principal_components).

    Notes:
        X is decomposed exactly as given; no mean-centering happens here.
        svd returns (U, s, Vh): `s` holds the singular values in descending
        order and the rows of `Vh` are the right singular vectors. `U` (the
        left singular vectors) is not needed for the projection and is
        discarded.
    """
    _, _, right_vectors_h = svd(X)

    # The first rows of Vh are the strongest directions; transposing turns
    # them into columns, ready to be used as a projection matrix.
    return right_vectors_h[:n_components].T

In [7]:
def vector_norm(x, norm="Min-Max"):
    """Normalize a 1-d vector.

    Inputs:
        x: 1-d np.array
        norm: one of
            "Min-Max":        (x - min(x)) / (max(x) - min(x))
            "L1":             x / sum(|x|)
            "L2":             x / sqrt(sum(x^2))
            "Standard_Score": (x - mean(x)) / std(x)
    Output:
        The normalized vector. When the denominator of the chosen scheme is
        zero (constant vector, or all-zero vector for L1/L2) a float vector
        of zeros is returned instead of NaN/inf.
    Raises:
        ValueError: if `norm` is not one of the names above.
    """
    if norm == "Min-Max":
        lowest = np.min(x)
        numerator = x - lowest
        denominator = np.max(x) - lowest
    elif norm == "L1":
        # Keep the sign of x: only the scale comes from the absolute values.
        numerator = x
        denominator = np.sum(np.abs(x))
    elif norm == "L2":
        numerator = x
        denominator = np.sqrt(np.sum(np.square(x)))
    elif norm == "Standard_Score":
        # Center first, then scale; the parentheses matter here.
        numerator = x - np.mean(x)
        denominator = np.std(x)
    else:
        raise ValueError("Unknown normlization.")

    if denominator == 0:
        # Nothing to scale by: every entry is equivalent, so map them to 0.
        return np.zeros_like(numerator, dtype=float)
    return numerator / denominator

In [8]:
def normalize(X, norm="Min-Max", axis = 1):
    """Normalize every column (or every row) of a matrix with vector_norm.

    Inputs:
        X: input matrix (anything np.array can turn into a 2-d numeric array)
        norm = {"L1", "L2", "Min-Max", "Standard_Score"}
        axis = 0: normalize rows
        axis = 1: normalize columns
    Output:
        X_norm: normalized matrix (numpy.array of floats); X itself is left
        untouched.
    Raises:
        ValueError: if `axis` is neither 0 nor 1.
    """
    # Work on a float copy. Keeping an integer dtype would silently truncate
    # the normalized values when they are written back into the array.
    X_norm = np.array(X, dtype=float)
    m, n = X_norm.shape
    if axis == 1:
        for col in range(n):
            X_norm[:, col] = vector_norm(X_norm[:, col], norm=norm)
    elif axis == 0:
        for row in range(m):
            X_norm[row] = vector_norm(X_norm[row], norm=norm)
    else:
        raise ValueError("Unknown axis.")
    return X_norm


In [11]:
def stratified_sampling(y, ratio, replace = True):
    """Draw a stratified random sample of positions from a label vector.

    Inputs:
        y: class labels (list, np.array, pandas Series, ...)
        0 < ratio < 1: fraction of each class to sample
        replace = True: sample with replacement
        replace = False: sample without replacement
    Output:
        sample: integer np.array with the positions of the sampled points.
                Every class contributes int(np.ceil(ratio * # data in class))
                positions; classes appear in the sorted order returned by
                np.unique.
    Raises:
        ValueError: if ratio is not strictly between 0 and 1.

    Randomness comes from np.random.choice, so np.random.seed controls it.
    """
    if ratio<=0 or ratio>=1:
        raise ValueError("ratio must be 0 < ratio < 1.")
    y_array = np.asarray(y)

    sample = []
    # np.unique yields each class once, in sorted order, which keeps the
    # layout of the result independent of set/hash ordering.
    for y_class in np.unique(y_array):
        class_positions = np.where(y_array == y_class)[0]
        # Scale the class size by ratio first, then round up.
        n = int(np.ceil(len(class_positions) * ratio))
        sample.append(np.random.choice(class_positions, n, replace=replace))

    if not sample:
        # No labels at all: np.concatenate cannot handle an empty list.
        return np.array([], dtype=int)
    return np.concatenate(sample).astype(int)

In [12]:
# End-to-end pipeline: normalize -> PCA -> stratified sample -> decision tree,
# then apply the same projection to the test set and print the predictions.

# Load training data
data_train = pd.read_csv("../data/Iris_train.csv")
# Separate independent variables and dependent variables
independent = ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"]
X = data_train[independent]
y = data_train["Species"]
# Preprocess (train): normalize() defaults to column-wise Min-Max scaling,
# then the data is projected onto its first two principal components.
X_norm = normalize(X)
principal_components = pca(X_norm, n_components=2)
X_pca = X_norm.dot(principal_components)
# Draw a class-balanced subset of training rows, without replacement
sample = stratified_sampling(y, ratio=0.5, replace=False)

X_sample = X_pca[sample]
# `sample` holds row positions, so index the Series positionally (iloc)
# instead of by label.
y_sample = y.iloc[sample].to_numpy()
print(X_pca)
print(Counter(y_sample))
print(Counter(y))
# Fit model
clf = DecisionTreeClassifier()
clf.fit(X_sample, y_sample)
# Load testing data
data_test = pd.read_csv("../data/Iris_test.csv")
X_test = data_test[independent]
# Preprocess (test): reuse the principal components learned on the training set.
# NOTE(review): normalize(X_test) rescales with the test set's own column
# statistics rather than the training set's -- confirm this is intended.
X_test_norm = normalize(X_test)
X_test_pca = X_test_norm.dot(principal_components)
# Predict
predictions = clf.predict(X_test_pca)
# Output predictions on test data
print(predictions)


[[-4.02412365e-01 -5.53625050e-01]
 [-2.80684541e-01 -3.28001783e-01]
 [-2.81780612e-01 -4.24248126e-01]
 [-2.68275973e-01 -3.69891801e-01]
 [-4.07711338e-01 -5.99373471e-01]
 [-5.94194937e-01 -6.92860481e-01]
 [-3.38909190e-01 -4.97354462e-01]
 [-3.79405850e-01 -5.04018225e-01]
 [-1.93757318e-01 -2.84926652e-01]
 [-2.85666050e-01 -3.81750613e-01]
 [-4.90535728e-01 -6.38144650e-01]
 [-3.61698309e-01 -5.00159822e-01]
 [-2.43655785e-01 -3.41642788e-01]
 [-5.92681332e-01 -6.59416420e-01]
 [-4.92049332e-01 -6.71588711e-01]
 [-4.52828434e-01 -4.92737031e-01]
 [-4.96569699e-01 -6.13090381e-01]
 [-3.15285005e-01 -6.20153666e-01]
 [-4.63382771e-01 -4.09184432e-01]
 [-3.90203934e-01 -4.85911322e-01]
 [-3.13293000e-01 -3.18057234e-01]
 [-4.35755821e-01 -4.72877811e-01]
 [-4.25518949e-01 -5.48430001e-01]
 [-3.97113392e-01 -5.07876628e-01]
 [-3.10286238e-01 -4.09999625e-01]
 [-3.04987265e-01 -3.64251204e-01]
 [-4.80672779e-01 -4.75845117e-01]
 [-5.15516989e-01 -8.33442696e-01]
 [-5.89156968e-01 -8