<a href="https://colab.research.google.com/github/pskaranth/thelearningcurve/blob/master/Classification/generative/Seeds_generative.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Seed Classification using Generative Model
This notebook uses the seeds data set, available at https://archive.ics.uci.edu/ml/machine-learning-databases/00236/seeds_dataset.txt



In [11]:
!git clone -l -s git://github.com/pskaranth/thelearningcurve.git cloned-repo
%cd cloned-repo
!ls

Cloning into 'cloned-repo'...
remote: Enumerating objects: 153, done.[K
remote: Counting objects: 100% (153/153), done.[K
remote: Compressing objects: 100% (136/136), done.[K
remote: Total 153 (delta 45), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (153/153), 2.06 MiB | 2.52 MiB/s, done.
Resolving deltas: 100% (45/45), done.
/content/cloned-repo/cloned-repo
Classification	README.md


In [7]:
# List the current working directory to confirm the cloned repository contents are visible.
!ls

Classification	README.md


In [8]:
import numpy as np
from scipy.stats import norm, multivariate_normal
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display
from ipywidgets import interact, interactive, fixed, interact_manual, IntSlider

In [12]:
# Display names for feature columns 0-6; show_densities uses them as the x-axis label.
features = [
    'area',
    'perimeter',
    'compactness',
    'lengthk',
    'widthk',
    'asymmeterycoeff',
    'lengthkg',
]

# The path is relative to the repository root, so the notebook must already
# have changed into the cloned repository before this cell runs.
# Columns 0-6 are used as features and column 7 as the class label further down.
data = np.loadtxt('Classification/generative/seeds_dataset.txt')


In [13]:
# Quick sanity check of what was loaded: (rows, columns) and the container type.
print("Dataset dimensions: ", data.shape)
type(data)

Dataset dimensions:  (210, 8)


numpy.ndarray

In [14]:
# Shuffle the 210 row indices with a fixed seed so the split is reproducible.
np.random.seed(0)
perm = np.random.permutation(210)

# First 190 shuffled rows form the training set, the remaining 20 the test set.
# Columns 0-6 are the features; column 7 is the class label.
train_x, train_y = data[perm[:190], :7], data[perm[:190], 7]
test_x, test_y = data[perm[190:], :7], data[perm[190:], 7]


In [15]:
# Shapes of the training features and labels, followed by one example row and its label.
print("Training dataset dimensions: ", train_x.shape)
print("Training label dimensions: ", train_y.shape)
print(train_x[2], train_y[2], sep="\n")

Training dataset dimensions:  (190, 7)
Training label dimensions:  (190,)
[13.94   14.17    0.8728  5.585   3.15    2.124   5.012 ]
1.0


In [16]:
def fit_univariate_model(x,y,feature):
    """Fit one 1-D Gaussian per class to a single feature column.

    Parameters
    ----------
    x : array-like, shape (n_samples, n_features)
        Feature matrix. Nested lists are accepted and converted to an ndarray.
    y : array-like, shape (n_samples,)
        Class labels. Only the labels 1, 2 and 3 are fitted.
    feature : int
        Index of the column of ``x`` to model.

    Returns
    -------
    mu, var, p : tuple of ndarray, each of length 4
        Per-class mean, variance (``np.var``, i.e. divided by N) and class
        frequency. The arrays are indexed directly by label, so index 0 is
        unused padding and stays 0.
    """
    k = 3 # number of classes (labels run from 1 to k)
    # Coerce up front so that `y == label` is always an element-wise mask,
    # even when the caller passes plain Python lists.
    x = np.asarray(x)
    y = np.asarray(y)

    # One extra slot so the arrays can be indexed by the 1-based label.
    mu = np.zeros(k + 1) # list of means
    var = np.zeros(k + 1) # list of variances
    p = np.zeros(k + 1) # list of class probabilities

    for label in range(1, k + 1):
        indices = (y == label)
        values = x[indices, feature]
        mu[label] = np.mean(values)
        var[label] = np.var(values)
        # Fraction of samples carrying this label.
        p[label] = float(np.count_nonzero(indices)) / float(len(y))
    return mu, var, p

In [17]:
# consider just one feature - column index 4, which is 'widthk' in the features list
feature = 4
mu, var, p = fit_univariate_model(train_x, train_y, feature)
# Index 0 of the returned arrays is unused padding, so skip it when showing the class probabilities.
print(p[1:])
# Printed in full: the leading 0. is the padding slot, followed by the means for classes 1-3.
print(mu)

[0.33157895 0.34210526 0.32631579]
[0.         3.23388889 3.67167692 2.84474194]


# Univariate
Predicting the output based on just one feature.

In [19]:
@interact_manual( feature=IntSlider(0,0,6) )
def show_densities(feature):
    """Plot the fitted per-class Gaussian density of one feature.

    The decorator renders a slider for ``feature`` (0 to 6) together with a
    button; the plot is drawn when the button is pressed. Each of the three
    classes gets one curve spanning three standard deviations either side of
    its mean.
    """
    mu, var, _ = fit_univariate_model(train_x, train_y, feature)
    # Labels are 1-based; pair each with its line colour.
    for label, colour in zip(range(1,4), ['r', 'b', 'g']):
        centre = mu[label]
        spread = np.sqrt(var[label])
        grid = np.linspace(centre - 3*spread, centre+3*spread, 1000)
        density = norm.pdf(grid,centre,spread)
        plt.plot(grid, density, colour, label="class " + str(label))
    plt.xlabel(features[feature], fontsize=14, color='red')
    plt.ylabel('Density', fontsize=14, color='red')
    plt.legend()
    plt.show()

interactive(children=(IntSlider(value=0, description='feature', max=6), Button(description='Run Interact', sty…

In [20]:
def pred_model(feature):
    """Classify the test set with a one-feature Gaussian generative model.

    Fits per-class Gaussians on the module-level ``train_x``/``train_y`` for
    the given column, scores every row of ``test_x`` as
    log(class prior) + log(class-conditional density), and predicts the class
    with the highest score. Prints the number of test rows, the score matrix
    for classes 1-3, the predictions, and how many disagree with ``test_y``.
    Returns nothing.
    """
    mu, var, p = fit_univariate_model(train_x, train_y, feature)
    n = len(test_y)
    print('num of tests : ',n)

    # Column 0 is padding so that column j holds the score of class j.
    score = np.zeros((n,4))
    observed = test_x[:,feature]
    for j in range(1,4):
        # Log joint: log prior of class j plus the log density of each test value under class j.
        score[:,j] = np.log(p[j]) + norm.logpdf(observed, mu[j], np.sqrt(var[j]))
    print(score[:,1:4])

    # argmax runs over the three real columns, so shift back to 1-based labels.
    prediction = np.argmax(score[:,1:4], axis=1) + 1
    print('prediction : ',prediction)
    print('errors : ',np.sum(prediction != test_y))


In [21]:
pred_model(6) # feature index 6 is 'lengthkg' in the features list

num of tests :  20
[[ -0.68787935  -7.23295544  -0.22500219]
 [ -0.78334591  -5.93861696  -0.28670531]
 [ -1.09678825 -11.56370078  -1.8596426 ]
 [ -0.71693415  -8.60391833  -0.48870042]
 [ -0.68724477  -7.26202154  -0.22743169]
 [ -0.8561807   -5.46769178  -0.40546697]
 [ -1.46100027  -3.55473297  -1.66887636]
 [-10.51958294  -1.01920397 -24.21550473]
 [ -7.83951772  -0.63345286 -17.38724288]
 [-15.55492207  -2.55159548 -37.15660456]
 [ -2.21533603  -2.41532447  -3.41758416]
 [ -0.84953322  -5.50506369  -0.39384954]
 [ -2.25564697  -2.37011831  -3.51320967]
 [-10.4857785   -1.01200413 -24.12905396]
 [ -0.85703419 -10.05165161  -1.043755  ]
 [ -0.71371242  -6.65112155  -0.20956069]
 [ -0.71371242  -6.65112155  -0.20956069]
 [ -9.75680369  -0.86970458 -22.26658747]
 [ -0.76707966  -6.06963197  -0.26376847]
 [ -0.71791893  -8.61987498  -0.49340411]]
prediction :  [3 3 1 3 3 3 1 2 2 2 1 3 1 2 1 3 3 2 3 3]
errors :  8


# Bivariate
Predict the output based on two features. 

In [22]:
def fit_bivariate_model(x,y,twofeatures):
    """Fit one 2-D Gaussian per class to a pair of feature columns.

    Parameters
    ----------
    x : array-like, shape (n_samples, n_features)
        Feature matrix. Nested lists are accepted and converted to an ndarray.
    y : array-like, shape (n_samples,)
        Class labels. Only the labels 1, 2 and 3 are fitted.
    twofeatures : sequence of two int
        Indices of the two columns of ``x`` to model.

    Returns
    -------
    mu : ndarray, shape (4, 2)
        Per-class mean vector.
    var : ndarray, shape (4, 2, 2)
        Per-class covariance matrix (``np.cov`` with ``bias=1``, i.e. divided by N).
    p : ndarray, shape (4,)
        Per-class frequency.

    All three are indexed directly by label, so index 0 is unused padding and stays 0.
    """
    k = 3 # number of classes (labels run from 1 to k)
    # Coerce up front so that `y == label` is always an element-wise mask,
    # even when the caller passes plain Python lists.
    x = np.asarray(x)
    y = np.asarray(y)
    cols = list(twofeatures)

    # One extra slot so the arrays can be indexed by the 1-based label.
    mu = np.zeros((k + 1, 2)) # list of means
    var = np.zeros((k + 1, 2, 2)) # list of covariance matrices
    p = np.zeros(k + 1) # list of class probabilities

    for label in range(1, k + 1):
        indices = (y == label)
        # Rows of this class, restricted to the two selected columns.
        subset = x[indices][:, cols]
        mu[label, :] = np.mean(subset, axis=0)
        var[label] = np.cov(subset, rowvar=0, bias=1)
        # Fraction of samples carrying this label.
        p[label] = float(np.count_nonzero(indices)) / float(len(y))
    return mu, var, p

In [23]:
# Column indices 4 and 6 are 'widthk' and 'lengthkg' in the features list.
twofeatures=[4,6]
# As the cell's last expression this displays the fitted (mu, var, p) tuple;
# entry 0 of each array is the unused padding slot.
fit_bivariate_model(train_x, train_y, twofeatures)

(array([[0.        , 0.        ],
        [3.23388889, 5.06819048],
        [3.67167692, 6.00367692],
        [2.84474194, 5.12116129]]), array([[[ 0.        ,  0.        ],
         [ 0.        ,  0.        ]],
 
        [[ 0.03157819,  0.01909507],
         [ 0.01909507,  0.06873711]],
 
        [[ 0.03406428,  0.01860505],
         [ 0.01860505,  0.0628689 ]],
 
        [[ 0.0181179 , -0.00183522],
         [-0.00183522,  0.02565249]]]), array([0.        , 0.33157895, 0.34210526, 0.32631579]))

In [24]:
def pred_model_2(twofeatures):
    """Classify the test set with a two-feature Gaussian generative model.

    Fits per-class bivariate Gaussians on the module-level ``train_x``/``train_y``
    for the given pair of columns, scores every row of ``test_x`` as
    log(class prior) + log(class-conditional density), and predicts the class
    with the highest score. Prints the number of test rows, the predictions,
    and how many disagree with ``test_y``. Returns nothing.
    """
    mu, covar, p = fit_bivariate_model(train_x, train_y, twofeatures)
    n = len(test_y)
    print('num of tests : ',n)

    # Column 0 is padding so that column j holds the score of class j.
    score = np.zeros((n,4))
    for j in range(1,4):
        log_prior = np.log(p[j])
        for i in range(n):
            # Log joint: log prior of class j plus the log density of test row i under class j.
            log_likelihood = multivariate_normal.logpdf(test_x[i,twofeatures], mu[j,:], covar[j,:])
            score[i,j] = log_prior + log_likelihood

    # argmax runs over the three real columns, so shift back to 1-based labels.
    prediction = np.argmax(score[:,1:4], axis=1) + 1
    print('prediction : ',prediction)
    print('errors : ',np.sum(prediction != test_y))

In [25]:
# Predict from column indices 4 and 6 ('widthk' and 'lengthkg' in the features list).
twofeatures=[4,6]
pred_model_2(twofeatures)

num of tests :  20
prediction :  [1 3 1 1 3 1 3 2 2 2 1 1 2 2 1 1 3 2 1 3]
errors :  4


In [26]:
# Scratch example of the indexing used by the fit functions:
# pick rows with a boolean mask, then pick columns with a list of indices.
test = np.array([
    [2, 4, 5],
    [2, 6, 8],
    [3, 4, 5],
    [2, 7, 9],
])
print(type(test), ': type ::', test.shape)

# Second column and its mean.
print(test[:, 1])
test_2mn = test[:, 1].mean(axis=0)
print(test_2mn)
print('---------------')

# Mean of columns 1 and 2 over only those rows whose first entry is 2.
mf = [1, 2]
indices = test[:, 0] == 2  # True where the first column equals 2
print(indices, 'indices')
print(test[indices])
print('---------------')
print(test[indices][:, mf])
mean_mf = test[indices][:, mf].mean(axis=0)
print(mean_mf)

<class 'numpy.ndarray'> : type :: (4, 3)
[4 6 4 7]
5.25
---------------
[ True  True False  True] indices
[[2 4 5]
 [2 6 8]
 [2 7 9]]
---------------
[[4 5]
 [6 8]
 [7 9]]
[5.66666667 7.33333333]


# Multivariate
Prediction based on more than two features

In [29]:
def fit_multivariate_model(x,y,multifeatures):
    """Fit one multivariate Gaussian per class to a chosen set of feature columns.

    Parameters
    ----------
    x : array-like, shape (n_samples, n_features)
        Feature matrix. Nested lists are accepted and converted to an ndarray.
    y : array-like, shape (n_samples,)
        Class labels. Only the labels 1, 2 and 3 are fitted.
    multifeatures : iterable of int
        Indices of the columns of ``x`` to model (a list, tuple, ``range``, ...).

    Returns
    -------
    mu : ndarray, shape (4, d)
        Per-class mean vector, where d is the number of selected columns.
    var : ndarray, shape (4, d, d)
        Per-class covariance matrix (``np.cov`` with ``bias=1``, i.e. divided by N).
    p : ndarray, shape (4,)
        Per-class frequency.

    All three are indexed directly by label, so index 0 is unused padding and stays 0.
    """
    k = 3 # number of classes (labels run from 1 to k)
    # Coerce up front so that `y == label` is always an element-wise mask,
    # even when the caller passes plain Python lists.
    x = np.asarray(x)
    y = np.asarray(y)
    # Materialize the selector once so any iterable works for sizing and indexing.
    cols = list(multifeatures)
    d = len(cols)

    # One extra slot so the arrays can be indexed by the 1-based label.
    mu = np.zeros((k + 1, d)) # list of means
    var = np.zeros((k + 1, d, d)) # list of covariance matrices
    p = np.zeros(k + 1) # list of class probabilities

    for label in range(1, k + 1):
        indices = (y == label)
        # Rows of this class, restricted to the selected columns.
        subset = x[indices][:, cols]
        mu[label, :] = np.mean(subset, axis=0)
        # For a single column np.cov returns a scalar, which broadcasts into the (1, 1) slot.
        var[label] = np.cov(subset, rowvar=0, bias=1)
        # Fraction of samples carrying this label.
        p[label] = float(np.count_nonzero(indices)) / float(len(y))
    return mu, var, p

In [None]:
# Select every feature column (indices 0 through 6).
multifeatures = range(0,7)
# As the cell's last expression this displays the fitted (mu, var, p) tuple.
fit_multivariate_model(train_x,train_y,multifeatures)

In [31]:
def pred_model_multi(multifeatures):
    """Classify the test set with a multi-feature Gaussian generative model.

    Fits per-class multivariate Gaussians on the module-level
    ``train_x``/``train_y`` for the given columns, scores every row of
    ``test_x`` as log(class prior) + log(class-conditional density), and
    predicts the class with the highest score. Prints the number of test rows,
    the predictions, and how many disagree with ``test_y``. Returns nothing.
    """
    mu, covar, p = fit_multivariate_model(train_x, train_y, multifeatures)
    n = len(test_y)
    print('num of tests : ',n)

    # Column 0 is padding so that column j holds the score of class j.
    score = np.zeros((n,4))
    class_labels = (1, 2, 3)
    for i in range(n):
        sample = test_x[i,multifeatures]
        for j in class_labels:
            # Log joint: log prior of class j plus the log density of this sample under class j.
            score[i,j] = np.log(p[j]) + multivariate_normal.logpdf(sample, mu[j,:], covar[j,:])

    # argmax runs over the three real columns, so shift back to 1-based labels.
    prediction = np.argmax(score[:,1:4], axis=1) + 1
    print('prediction : ',prediction)
    print('errors : ',np.sum(prediction != test_y))

In [32]:
# Predict using every feature column (indices 0 through 6).
multifeatures = range(0,7)
pred_model_multi(multifeatures)

num of tests :  20
prediction :  [1 3 1 1 3 1 3 2 2 2 1 1 1 2 3 1 3 2 1 3]
errors :  2


The number of test errors decreased as more features were added to the model: 8 errors with one feature, 4 with two features, and 2 with all seven.