In [None]:
# -*- coding: utf-8 -*-
"""
Predicitve_Analytics.py
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy import stats
import time

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

In [None]:
# Load the dataset with pandas directly: the notebook's import_data helper is
# defined in a later cell, so calling it here raises NameError on a fresh
# kernel (Restart & Run All).
df = pd.read_csv('data.csv')

input_data = df.values

# Every column except the last is a feature; the last column is the label.
X = input_data[:, :-1]
y = input_data[:, -1]

# A fixed seed keeps the split, and every metric computed from it, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scalar = MinMaxScaler()

# transform() returns a new array rather than scaling in place, so the result
# has to be assigned back. The scaler is fitted on the training split only and
# then reused for the test split, so both share one feature range and no
# test-set statistics leak into preprocessing.
X_train = scalar.fit_transform(X_train)
X_test = scalar.transform(X_test)


In [None]:
# Cluster count handed to Kmeans(X_train, k) in the K-means timing cell.
k = 10

In [None]:
def find_center(cluster_centers, row, k):
    """
    Return the index of the cluster center closest to ``row``.

    Closeness is the Manhattan (L1) distance: the sum of the absolute
    per-feature differences between ``row`` and each center.

    :type cluster_centers: numpy.ndarray (one center per row)
    :type row: numpy.ndarray (a single sample)
    :type k: int
    :param k: number of centers. Kept so existing callers keep working; the
        count is taken from ``cluster_centers`` itself.
    :rtype: numpy integer index into ``cluster_centers``; ties resolve to the
        lowest index.
    """
    # Broadcasting subtracts `row` from every center without first building
    # k stacked copies of it with np.tile.
    distances = np.sum(np.absolute(cluster_centers - row), axis=1)

    return np.argmin(distances)

In [None]:
def Accuracy(y_true,y_pred):
    """
    Fraction of predictions that exactly match the true labels.

    Both inputs are flattened first, so a column vector of shape (n, 1) and a
    flat vector of shape (n,) are compared element by element instead of
    being broadcast against each other. Labels are compared with ``==``, so
    non-numeric labels work as well.

    :type y_true: numpy.ndarray
    :type y_pred: numpy.ndarray
    :rtype: float
    :raises ValueError: if the inputs have different lengths or are empty.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must have the same number of elements, got %d and %d"
            % (y_true.shape[0], y_pred.shape[0])
        )
    if y_true.size == 0:
        raise ValueError("Accuracy is undefined for empty inputs")

    return float(np.mean(y_true == y_pred))

In [None]:
def Recall(y_true,y_pred):
    """
    Macro-averaged recall.

    For every label that occurs in ``y_true`` or ``y_pred`` the per-class
    recall is ``true positives / samples whose true label is that class``;
    the function returns the unweighted mean of those per-class values.
    A label with no true samples contributes a recall of 0.0 instead of
    dividing by zero.

    :type y_true: numpy.ndarray
    :type y_pred: numpy.ndarray
    :rtype: float
    :raises ValueError: if the inputs have different lengths or are empty.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    if y_true.shape != y_pred.shape:
        raise ValueError("y_true and y_pred must have the same number of elements")
    if y_true.size == 0:
        raise ValueError("Recall is undefined for empty inputs")

    # Derive the label set from the data rather than assuming classes 1..11.
    labels = np.unique(np.concatenate((y_true, y_pred)))

    macro = []
    for label in labels:
        is_label = y_true == label
        support = np.count_nonzero(is_label)
        # A true positive needs BOTH the true and the predicted label to be
        # `label`; merely agreeing with each other is not enough.
        true_positives = np.count_nonzero(is_label & (y_pred == label))
        macro.append(true_positives / support if support else 0.0)

    return float(np.mean(macro))
        

In [None]:
def Precision(y_true,y_pred):
    """
    Macro-averaged precision.

    For every label that occurs in ``y_true`` or ``y_pred`` the per-class
    precision is ``true positives / samples predicted as that class``; the
    function returns the unweighted mean of those per-class values.
    A label that is never predicted contributes a precision of 0.0 instead
    of dividing by zero.

    :type y_true: numpy.ndarray
    :type y_pred: numpy.ndarray
    :rtype: float
    :raises ValueError: if the inputs have different lengths or are empty.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    if y_true.shape != y_pred.shape:
        raise ValueError("y_true and y_pred must have the same number of elements")
    if y_true.size == 0:
        raise ValueError("Precision is undefined for empty inputs")

    # Derive the label set from the data rather than assuming classes 1..11.
    labels = np.unique(np.concatenate((y_true, y_pred)))

    macro = []
    for label in labels:
        predicted_as_label = y_pred == label
        predicted_count = np.count_nonzero(predicted_as_label)
        # A true positive needs BOTH the true and the predicted label to be
        # `label`; merely agreeing with each other is not enough.
        true_positives = np.count_nonzero(predicted_as_label & (y_true == label))
        macro.append(true_positives / predicted_count if predicted_count else 0.0)

    return float(np.mean(macro))
        
    

In [None]:
def WCSS(Clusters):
    """
    Within-cluster sum of squares.

    For each cluster the centroid is the per-feature mean of its points; the
    result is the sum, over all clusters and points, of the squared Euclidean
    distance between a point and its cluster's centroid. Empty clusters
    contribute nothing (their mean would otherwise be NaN).

    :Clusters List[numpy.ndarray]
    :rtype: float
    """
    wcss = 0.0
    for cluster in Clusters:
        cluster = np.asarray(cluster, dtype=float)
        if cluster.size == 0:
            continue

        cluster_center = np.mean(cluster, axis=0)

        # Broadcasting subtracts the centroid from every row; squaring (not
        # taking the absolute value) is what makes this a sum of *squares*.
        wcss += float(np.sum((cluster - cluster_center) ** 2))

    return wcss

    

In [None]:
def ConfusionMatrix(y_true,y_pred):
    """
    Count how often each true label was predicted as each label.

    Rows correspond to true labels and columns to predicted labels, both in
    the sorted order of the distinct labels found in ``y_true`` and
    ``y_pred``. Entry ``[i, j]`` is the number of samples whose true label
    is the i-th label and whose predicted label is the j-th label.

    :type y_true: numpy.ndarray
    :type y_pred: numpy.ndarray
    :rtype: numpy.ndarray (square, integer counts)
    :raises ValueError: if the inputs have different lengths.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    if y_true.shape != y_pred.shape:
        raise ValueError("y_true and y_pred must have the same number of elements")

    # Derive the label set from the data rather than assuming 11 classes.
    labels = np.unique(np.concatenate((y_true, y_pred)))

    # `labels` is sorted and contains every value, so searchsorted maps each
    # label to its row/column position.
    true_index = np.searchsorted(labels, y_true)
    pred_index = np.searchsorted(labels, y_pred)

    cf_matrix = np.zeros((labels.size, labels.size), dtype=int)
    # np.add.at accumulates repeated (row, col) pairs; plain fancy-index
    # assignment would count each distinct pair only once.
    np.add.at(cf_matrix, (true_index, pred_index), 1)

    return cf_matrix

In [None]:
def KNN(X_train,X_test,Y_train,k=10):
    """
    Classify each row of ``X_test`` by majority vote among its ``k`` nearest
    training rows.

    Distance is the Manhattan (L1) distance, i.e. the sum of absolute
    per-feature differences. When several labels share the highest vote
    count, the smallest label wins. If ``k`` exceeds the number of training
    rows, every training row votes.

    :type X_train: numpy.ndarray
    :type X_test: numpy.ndarray
    :type Y_train: numpy.ndarray
    :type k: int
    :param k: number of neighbours that vote (defaults to 10, the value that
        used to be hard-coded).

    :rtype: numpy.ndarray
    :raises ValueError: if ``k`` is not positive, the training set is empty,
        or ``X_train`` and ``Y_train`` disagree on the number of samples.
    """
    X_train = np.asarray(X_train)
    X_test = np.asarray(X_test)
    Y_train = np.asarray(Y_train).ravel()

    if k < 1:
        raise ValueError("k must be a positive integer, got %r" % (k,))
    if X_train.shape[0] == 0:
        raise ValueError("KNN needs at least one training sample")
    if X_train.shape[0] != Y_train.shape[0]:
        raise ValueError("X_train and Y_train must contain the same number of samples")

    labels = np.zeros(X_test.shape[0])
    for x in range(X_test.shape[0]):
        # Broadcasting compares the query row with every training row at once.
        distances = np.sum(np.absolute(X_train - X_test[x]), axis=1)

        # A stable sort makes the choice among equidistant neighbours
        # deterministic (earlier training rows win).
        nearest = np.argsort(distances, kind="stable")[:k]

        # np.unique returns sorted values, so argmax picks the smallest label
        # among equally frequent ones.
        values, counts = np.unique(Y_train[nearest], return_counts=True)
        labels[x] = values[np.argmax(counts)]
    return labels

In [None]:
def RandomForest(X_train,Y_train,X_test):
    """
    Unimplemented stub; by its name and signature it is presumably meant to
    hold a random-forest classifier (TODO confirm and implement).

    The body currently consists of this docstring only, so calling the
    function does no work and returns None rather than the array documented
    below.

    :type X_train: numpy.ndarray
    :type X_test: numpy.ndarray
    :type Y_train: numpy.ndarray
    
    :rtype: numpy.ndarray
    """

In [None]:
def PCA(X_train,N):
    """
    Project ``X_train`` onto its ``N`` leading principal components.

    The data is centred per feature, the covariance matrix of the centred
    data is eigen-decomposed, and the centred data is projected onto the
    ``N`` eigenvectors with the largest eigenvalues (largest first). The sign
    of each component is arbitrary, as with any eigenvector.

    :type X_train: numpy.ndarray (samples in rows, features in columns)
    :type N: int
    :param N: number of principal components to keep.
    :rtype: numpy.ndarray of shape (n_samples, N)
    :raises TypeError: if ``N`` is not an integer.
    :raises ValueError: if ``X_train`` is not 2-D or ``N`` is outside
        ``1..n_features``.
    """
    X_train = np.asarray(X_train, dtype=float)
    if X_train.ndim != 2:
        raise ValueError("X_train must be a 2-D array of shape (n_samples, n_features)")

    n_samples, n_features = X_train.shape

    if isinstance(N, bool) or not isinstance(N, (int, np.integer)):
        raise TypeError("N must be an integer number of components, got %r" % (N,))
    if not 1 <= N <= n_features:
        raise ValueError("N must be between 1 and %d, got %d" % (n_features, N))

    # Covariance is defined on mean-centred data and normalised by the sample
    # count (not by the number of requested components).
    centered = X_train - X_train.mean(axis=0)
    Covariance = np.dot(centered.T, centered) / max(n_samples - 1, 1)

    # eigh is the solver for symmetric matrices: it returns real eigenpairs,
    # with eigenvalues in ascending order.
    eigenvalues, vector = np.linalg.eigh(Covariance)
    leading = np.argsort(eigenvalues)[::-1][:N]

    return np.dot(centered, vector[:, leading])

In [None]:
def Kmeans(X_train,N,max_iter=1000):
    """
    Partition the rows of ``X_train`` into ``N`` clusters.

    ``N`` distinct rows are drawn at random (via ``np.random``) as initial
    centers. Each iteration assigns every row to its nearest center using
    the Manhattan (L1) distance, the same metric as ``find_center``, and
    then moves every center to the mean of the rows assigned to it. The loop
    stops as soon as the assignment no longer changes, or after ``max_iter``
    iterations.

    :type X_train: numpy.ndarray (samples in rows, features in columns)
    :type N: int
    :type max_iter: int
    :param max_iter: upper bound on update iterations (defaults to 1000, the
        epoch count that used to be hard-coded).
    :rtype: List[numpy.ndarray]
    :returns: one array per cluster holding exactly the rows assigned to it;
        a cluster that ends up with no rows is an empty array.
    :raises ValueError: if ``N`` is not between 1 and the number of rows.
    """
    X_train = np.asarray(X_train, dtype=float)
    n_samples = X_train.shape[0]

    if not 1 <= N <= n_samples:
        raise ValueError(
            "N must be between 1 and the number of samples (%d), got %r" % (n_samples, N)
        )

    # Sampling without replacement prevents two centers from starting on the
    # same row, which would leave one of the clusters empty.
    seeds = np.random.choice(n_samples, size=N, replace=False)
    cluster_centers = X_train[seeds].copy()

    def assign(centers):
        # One distance column per center keeps the temporary arrays at
        # (n_samples, n_features) instead of (n_samples, N, n_features).
        distances = np.stack(
            [np.sum(np.absolute(X_train - center), axis=1) for center in centers],
            axis=1,
        )
        return np.argmin(distances, axis=1)

    assignments = assign(cluster_centers)
    for _ in range(max_iter):
        # Recompute EVERY center from the rows currently assigned to it.
        for index in range(N):
            members = X_train[assignments == index]
            if members.shape[0]:
                cluster_centers[index] = members.mean(axis=0)
            # An empty cluster keeps its previous center.

        updated = assign(cluster_centers)
        if np.array_equal(updated, assignments):
            break
        assignments = updated

    # Build the clusters once from the final assignment, so every row appears
    # exactly once and no placeholder rows are included.
    return [X_train[assignments == index] for index in range(N)]

In [None]:
def SklearnSupervisedLearning(X_train,Y_train,X_test):
    """
    Unimplemented stub; by its name it is presumably meant to run
    scikit-learn supervised models (TODO confirm and implement).

    The body currently consists of this docstring only, so calling the
    function does no work and returns None rather than the list documented
    below.

    :type X_train: numpy.ndarray
    :type X_test: numpy.ndarray
    :type Y_train: numpy.ndarray
    
    :rtype: List[numpy.ndarray] 
    """

In [None]:
def SklearnVotingClassifier(X_train,Y_train,X_test):
    
    """
    Unimplemented stub; by its name it is presumably meant to wrap a
    scikit-learn voting ensemble (TODO confirm and implement).

    The body currently consists of this docstring only, so calling the
    function does no work and returns None rather than the list documented
    below.

    :type X_train: numpy.ndarray
    :type X_test: numpy.ndarray
    :type Y_train: numpy.ndarray
    
    :rtype: List[numpy.ndarray] 
    """

In [None]:
"""
Create your own custom functions for Matplotlib visualization of hyperparameter search. 
Make sure that plots are labeled and proper legends are used
"""

def import_data(filename):
    """
    Read the CSV source ``filename`` with pandas and return the resulting
    DataFrame.
    """
    return pd.read_csv(filename)

    

In [None]:
# Time the K-means run and report the clustering quality. The WCSS value has
# to be stored and printed: as a bare expression in the middle of a cell it
# would be computed and then thrown away without being shown.
start_time = time.time()
z = Kmeans(X_train, k)
wcss = WCSS(z)

print("--- %s seconds ---" % (time.time() - start_time))
print("WCSS: %s" % wcss)

In [None]:
# Time one KNN pass over the test split. `z` is rebound to the value KNN
# returns, which the following cells score and display.
start_time = time.time()
z= KNN(X_train, X_test, y_train)
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Accuracy is declared as Accuracy(y_true, y_pred): ground truth goes first,
# the KNN predictions held in `z` second.
Accuracy(y_test, z)

In [None]:
# Display `z`, which the KNN timing cell above set to KNN's return value.
z


In [None]:
# PCA's N is documented as an int; passing the X_train.shape tuple raises a
# TypeError inside PCA, so an explicit integer is used instead.
start_time = time.time()
print(PCA(X_train, 2))
print("--- %s seconds ---" % (time.time() - start_time))