In [1]:
# -*- coding: utf-8 -*-
"""
Predicitve_Analytics.py
"""
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
# import seaborn as sn


from sklearn.model_selection import train_test_split
from scipy import stats

import time


from sklearn.preprocessing import MinMaxScaler

In [6]:
def zscorenormalisation(X_train):
    """
    Standardise every column of a 2-D array to zero mean and unit variance.

    The column means and (population) standard deviations are computed from
    ``X_train`` itself. A new float array is returned; the argument is left
    untouched, so re-running a cell that calls this function does not
    normalise already-normalised data, and integer inputs are not truncated.

    :type X_train: numpy.ndarray
    :param X_train: array of shape (n_samples, n_features)
    :rtype: numpy.ndarray
    :return: float array of the same shape holding the z-scores
    """
    data = np.asarray(X_train, dtype=float)

    mean_vec = np.mean(data, axis = 0)
    std_vec = np.std(data, axis = 0)

    # A constant column has zero spread; dividing by 1 instead maps it to
    # all zeros rather than producing NaN/inf.
    safe_std = np.where(std_vec == 0, 1.0, std_vec)

    return (data - mean_vec) / safe_std

In [7]:
def import_data(filename):
    """
    Load a CSV file into a pandas DataFrame.

    :type filename: str
    :param filename: path handed straight to ``pandas.read_csv``
    :rtype: pandas.DataFrame
    """
    return pd.read_csv(filename)

df = import_data('data.csv')

input_data = df.values

# The last column is the target; every other column is a feature.
X = input_data[:, :-1]
y = input_data[:, -1]

# A fixed seed keeps the split identical across "Restart Kernel -> Run All".
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# The test split must be scaled with the TRAINING statistics: standardising
# it with its own mean/std puts the two splits on different scales and leaks
# test-set information into the evaluation.
# The statistics are captured before X_train is normalised because
# zscorenormalisation may overwrite the array it is given.
train_mean = np.mean(X_train, axis = 0)
train_std = np.std(X_train, axis = 0)
train_std = np.where(train_std == 0, 1, train_std)  # constant columns: avoid division by zero

X_test = (X_test - train_mean) / train_std
X_train = zscorenormalisation(X_train)


In [9]:
# Number of clusters passed to Kmeans in the timing cell further down.
k = 10

In [10]:
def find_center(cluster_centers, row, k):
    """
    Return the index of the cluster centre nearest to ``row``.

    ``row`` is repeated ``k`` times so it lines up with the ``k`` rows of
    ``cluster_centers``; the Euclidean distance to every centre is then
    computed and the position of the smallest one is returned.

    :type cluster_centers: numpy.ndarray
    :type row: numpy.ndarray
    :type k: int
    :rtype: int
    """
    repeated_point = np.tile(row, (k, 1))
    offsets = cluster_centers - repeated_point
    distances = np.sqrt((offsets ** 2).sum(axis = 1))
    return distances.argmin()

In [11]:
def Accuracy(y_true,y_pred):
    """
    Fraction of positions at which the prediction equals the true label.

    Agreement is detected as a zero difference ``y_true - y_pred``, so the
    labels have to be numeric.

    :type y_true: numpy.ndarray
    :type y_pred: numpy.ndarray
    :rtype: float
    """
    n_matches = np.count_nonzero((y_true - y_pred) == 0)
    return n_matches / y_true.shape[0]


In [12]:
def Recall(y_true,y_pred,labels=None):
    """
    Pooled (micro-averaged) recall over a set of class labels.

    Every sample whose true label is one of ``labels`` counts as a true
    positive when the prediction matches it and as a false negative
    otherwise; the result is ``TP / (TP + FN)`` summed over all labels.
    Samples whose true label is not in ``labels`` are ignored.

    :type y_true: numpy.ndarray
    :type y_pred: numpy.ndarray
    :type labels: iterable or None
    :param labels: class labels to score; defaults to 1..11
    :rtype: float
    :return: recall in [0, 1]; 0.0 when no sample carries one of the labels
    """
    if labels is None:
        labels = range(1, 12)

    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Mark the samples that belong to one of the scored classes.
    scored = np.zeros(y_true.shape, dtype=bool)
    for label in labels:
        scored |= (y_true == label)

    n_scored = np.count_nonzero(scored)      # TP + FN
    if n_scored == 0:
        return 0.0

    n_hits = np.count_nonzero(scored & (y_pred == y_true))   # TP
    return n_hits / n_scored
        

In [13]:
def Precision(y_true,y_pred,labels=None):
    """
    Pooled (micro-averaged) precision over a set of class labels.

    Every sample whose predicted label is one of ``labels`` counts as a true
    positive when it matches the true label and as a false positive
    otherwise; the result is ``TP / (TP + FP)`` summed over all labels.
    Samples whose prediction is not in ``labels`` are ignored.

    :type y_true: numpy.ndarray
    :type y_pred: numpy.ndarray
    :type labels: iterable or None
    :param labels: class labels to score; defaults to 1..11
    :rtype: float
    :return: precision in [0, 1]; 0.0 when no prediction uses one of the labels
    """
    if labels is None:
        labels = range(1, 12)

    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Mark the samples that were predicted as one of the scored classes.
    scored = np.zeros(y_pred.shape, dtype=bool)
    for label in labels:
        scored |= (y_pred == label)

    n_scored = np.count_nonzero(scored)      # TP + FP
    if n_scored == 0:
        return 0.0

    n_hits = np.count_nonzero(scored & (y_pred == y_true))   # TP
    return n_hits / n_scored
        
    

In [14]:
def WCSS(Clusters):
    """
    Within-cluster sum of squares.

    For every cluster the centroid is the mean of its rows; the score is the
    sum, over all clusters and all of their points, of the SQUARED Euclidean
    distance between the point and its centroid. Clusters without any rows
    contribute nothing.

    :type Clusters: List[numpy.ndarray]
    :param Clusters: one (n_points, n_features) array per cluster
    :rtype: float
    """
    wcss = 0.0
    for cluster in Clusters:
        points = np.asarray(cluster, dtype=float)

        # The mean of an empty cluster is undefined (NaN); skip it.
        if points.shape[0] == 0:
            continue

        cluster_center = np.mean(points, axis = 0)

        # Sum of squares: no square root here, otherwise the result would be
        # a sum of plain distances rather than WCSS.
        wcss += np.sum(np.square(points - cluster_center))

    return float(wcss)

    

In [24]:
def ConfusionMatrix(y_true,y_pred,num_classes=11):
    """
    Build a confusion matrix for labels numbered 1..num_classes.

    Entry ``[i, j]`` counts the samples whose true label is ``i + 1`` and
    whose predicted label is ``j + 1``. Labels are cast to ``int`` first.

    :type y_true: numpy.ndarray
    :type y_pred: numpy.ndarray
    :type num_classes: int
    :param num_classes: number of classes; defaults to 11
    :rtype: numpy.ndarray
    :return: float array of shape (num_classes, num_classes)
    :raises IndexError: if a label lies outside 1..num_classes. Label 0 or a
        negative label would otherwise wrap around through negative
        indexing and be counted silently in the wrong cell.
    """
    cf_matrix = np.zeros((num_classes, num_classes))

    # Shift the 1-based labels to 0-based row/column indices.
    true_idx = np.asarray(y_true).astype(int) - 1
    pred_idx = np.asarray(y_pred).astype(int) - 1

    if true_idx.size == 0:
        return cf_matrix

    for name, idx in (("y_true", true_idx), ("y_pred", pred_idx)):
        if idx.min() < 0 or idx.max() >= num_classes:
            raise IndexError(
                "%s contains labels outside 1..%d" % (name, num_classes)
            )

    # np.add.at accumulates correctly when the same (row, col) pair repeats,
    # unlike a plain fancy-indexed "+=".
    np.add.at(cf_matrix, (true_idx, pred_idx), 1)

    return cf_matrix

In [16]:
def KNN(X_train,X_test,Y_train,k=10):
    """
    k-nearest-neighbour classification with Manhattan (L1) distance.

    For every row of ``X_test`` the ``k`` training rows with the smallest
    L1 distance are selected and the most frequent label among them is
    predicted. When several labels are equally frequent the smallest label
    wins.

    :type X_train: numpy.ndarray
    :type X_test: numpy.ndarray
    :type Y_train: numpy.ndarray
    :type k: int
    :param k: number of neighbours that vote; defaults to 10
    :rtype: numpy.ndarray
    :return: float array with one predicted label per row of ``X_test``
    :raises ValueError: if ``k`` is smaller than 1
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    X_train = np.asarray(X_train)
    X_test = np.asarray(X_test)
    Y_train = np.asarray(Y_train)

    labels = np.zeros(X_test.shape[0])
    for x in range(X_test.shape[0]):
        # Broadcasting subtracts the query row from every training row.
        distances = np.sum(np.absolute(X_train - X_test[x]), axis=1)

        # Indices of the k closest training rows (fewer if the training set
        # is smaller than k).
        nearest = np.argsort(distances)[:k]

        # Majority vote. np.unique returns the labels sorted, and argmax
        # picks the first maximum, so ties resolve to the smallest label.
        candidates, votes = np.unique(Y_train[nearest], return_counts=True)
        labels[x] = candidates[np.argmax(votes)]
    return labels

In [17]:
def RandomForest(X_train,Y_train,X_test):
    """
    Stub: not implemented yet.

    The body consists of this docstring only, so a call currently returns
    None rather than the array announced below. Judging by the name and the
    signature this is presumably meant to fit a random forest on
    (X_train, Y_train) and predict labels for X_test -- TODO confirm and
    implement.

    Note the argument order (X_train, Y_train, X_test) differs from
    KNN(X_train, X_test, Y_train).

    :type X_train: numpy.ndarray
    :type X_test: numpy.ndarray
    :type Y_train: numpy.ndarray
    
    :rtype: numpy.ndarray
    """

In [18]:
def PCA(X_train,N):
    """
    Project the data onto its first ``N`` principal components.

    The columns are mean-centred, the sample covariance matrix is
    decomposed, and the eigenvectors are ordered by decreasing eigenvalue
    (explained variance). The centred data are then projected onto the
    leading ``N`` eigenvectors. The sign of each component is arbitrary.

    :type X_train: numpy.ndarray
    :param X_train: array of shape (n_samples, n_features)
    :type N: int
    :param N: number of components to keep; values above n_features are
        clamped to n_features
    :rtype: numpy.ndarray
    :return: real array of shape (n_samples, min(N, n_features))
    """
    data = np.asarray(X_train, dtype=float)
    centered = data - np.mean(data, axis = 0)
    n_samples, n_features = centered.shape

    # Sample covariance; the max() guards the single-sample case.
    covariance = np.dot(centered.T, centered) / max(n_samples - 1, 1)

    # The covariance matrix is symmetric, so eigh applies: it returns real
    # eigenvalues in ascending order. Reverse for "most variance first".
    eigenvalues, eigenvectors = np.linalg.eigh(covariance)
    order = np.argsort(eigenvalues)[::-1]

    n_components = max(0, min(int(N), n_features))
    components = eigenvectors[:, order[:n_components]]

    return np.dot(centered, components)

In [19]:
def Kmeans(X_train,N):
    """
    Cluster the rows of ``X_train`` into ``N`` groups with Lloyd's algorithm.

    ``N`` distinct rows are drawn at random as the initial centres (with
    replacement only when there are fewer rows than clusters). Ten rounds of
    "assign every point to its nearest centre, then move every centre to the
    mean of its points" follow. A centre that attracts no points keeps its
    previous position.

    The returned clusters contain only rows of ``X_train`` (as floats); the
    input array itself is not modified. Initialisation uses the global
    ``np.random`` state, so seed it for reproducible results.

    :type X_train: numpy.ndarray
    :param X_train: array of shape (n_samples, n_features)
    :type N: int
    :param N: number of clusters
    :rtype: List[numpy.ndarray]
    :return: ``N`` arrays of shape (n_points_i, n_features); an array may
        have zero rows if its cluster ended up empty
    :raises ValueError: if ``N`` is smaller than 1
    """
    if N < 1:
        raise ValueError("N must be at least 1")

    data = np.asarray(X_train, dtype=float)
    n_samples = data.shape[0]

    # Distinct starting rows avoid two clusters beginning at the same spot.
    start_rows = np.random.choice(n_samples, size = N, replace = n_samples < N)
    # Fancy indexing copies, so updating the centres never touches `data`.
    cluster_centers = data[start_rows]

    assignments = np.zeros(n_samples, dtype = int)
    epochs = 10

    for epoch in range(epochs):
        # Squared distance of every point to every centre, one centre at a
        # time to keep the temporary arrays at (n_samples, n_features).
        # The square root is omitted: it does not change which is nearest.
        sq_dist = np.empty((n_samples, N))
        for idx in range(N):
            sq_dist[:, idx] = np.sum(np.square(data - cluster_centers[idx]), axis = 1)

        assignments = np.argmin(sq_dist, axis = 1)

        # Recompute each centre from its members only.
        for idx in range(N):
            members = data[assignments == idx]
            if members.shape[0] > 0:
                cluster_centers[idx] = np.mean(members, axis = 0)

    return [data[assignments == idx] for idx in range(N)]

In [20]:
def SklearnSupervisedLearning(X_train,Y_train,X_test):
    """
    Stub: not implemented yet.

    The body consists of this docstring only, so a call currently returns
    None rather than the list announced below. Presumably intended to train
    several scikit-learn classifiers on (X_train, Y_train) and return one
    prediction array for X_test per model -- TODO confirm and implement.

    :type X_train: numpy.ndarray
    :type X_test: numpy.ndarray
    :type Y_train: numpy.ndarray
    
    :rtype: List[numpy.ndarray] 
    """
    

In [21]:
def SklearnVotingClassifier(X_train,Y_train,X_test):
    
    """
    Stub: not implemented yet.

    The body consists of this docstring only, so a call currently returns
    None rather than the list announced below. Presumably intended to
    combine scikit-learn classifiers in a voting ensemble trained on
    (X_train, Y_train) and predict X_test -- TODO confirm and implement.

    :type X_train: numpy.ndarray
    :type X_test: numpy.ndarray
    :type Y_train: numpy.ndarray
    
    :rtype: List[numpy.ndarray] 
    """
    

In [22]:
"""
Create your own custom functions for Matplotlib visualization of hyperparameter search. 
Make sure that plots are labeled and proper legends are used
""" 

'\nCreate your own custom functions for Matplotlib visualization of hyperparameter search. \nMake sure that plots are labeled and proper legends are used\n'

In [None]:
# Time the clustering of the training features into k groups.
start_time = time.time()
K_clusters = Kmeans(X_train, k)
clustering_elapsed = time.time() - start_time
print("--- %s seconds ---" % clustering_elapsed)

# Score the clustering.
clustering_wcss = WCSS(K_clusters)
print(clustering_wcss)

# Cumulative time since start_time, i.e. clustering plus scoring.
total_elapsed = time.time() - start_time
print("--- %s seconds ---" % total_elapsed)

[[ 3.76618601e+00  1.41063062e-01 -7.05023645e-02 ... -8.23321600e-06
   1.33054387e-05 -5.49644651e-06]
 [-1.80230248e+00 -5.14345531e+00  1.04877802e+00 ...  2.51316160e-05
   1.39384120e-05  3.35489134e-06]
 [-4.62899403e+00  5.70147241e-01 -2.06546064e-01 ... -6.34725045e-08
  -1.09501304e-05 -7.36333563e-06]
 ...
 [-7.30578815e-01  1.60339097e+00 -3.50987211e+00 ... -3.02131617e-06
   1.80793214e-07  1.06166782e-06]
 [-5.33428142e+00  9.49478082e-01  9.23443769e-01 ...  4.87389830e-06
  -1.93034038e-05  4.81275218e-06]
 [ 2.30791616e+00  3.36582379e+00  2.47445477e+00 ... -1.90478036e-05
   1.22871639e-05  2.64681671e-07]]
--- 0.02955794334411621 seconds ---


array([[691.,   0.,   2.,   0.,   1.,  20.,   0.,   1.,   1.,   0.,   0.],
       [  0., 739.,   0.,   0.,   0.,   2.,   0.,   1.,   0.,  29.,   0.],
       [  0.,   1., 787.,   1.,   1.,   0.,   0.,   0.,   0.,   1.,   0.],
       [  0.,   0.,   1., 694.,   5.,   0.,   0.,   1.,   0.,   0.,   0.],
       [  3.,   0.,   9.,   9., 741.,   0.,   0.,  11.,   0.,   0.,   0.],
       [ 29.,   1.,   2.,   0.,   6., 686.,   0.,   0.,   4.,   1.,   0.],
       [  0.,   0.,   0.,   0.,   0.,   0., 734.,   0.,   0.,   0.,   0.],
       [  0.,   0.,   1.,  11.,  18.,   2.,   0., 720.,   2.,   0.,   0.],
       [  2.,   1.,   0.,   1.,   4.,  13.,   0.,   5., 743.,   0.,   0.],
       [  0.,  45.,   0.,   0.,   0.,   2.,   0.,   0.,   0., 670.,   1.],
       [  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0., 736.]])