In [31]:
import pathlib
import collections
import itertools

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import discriminant_analysis, metrics

In [11]:
data_folder = pathlib.Path("./../data/vowel")

train_filepath = data_folder / "vowel.train"
test_filepath = data_folder / "vowel.test"
info_filepath = data_folder / "vowel.info.txt"

In [12]:
print(info_filepath.read_text())

This info is the original source information for these data.


NAME: Vowel Recognition (Deterding data)

SUMMARY: Speaker independent recognition of the eleven steady state vowels
of British English using a specified training set of lpc derived log area
ratios.

SOURCE: David Deterding  (data and non-connectionist analysis)
        Mahesan Niranjan (first connectionist analysis)
        Tony Robinson    (description, program, data, and results)

To contact Tony Robinson by electronic mail, use address 
"ajr@dsl.eng.cam.ac.uk"

MAINTAINER: neural-bench@cs.cmu.edu

PROBLEM DESCRIPTION:

The problem is specified by the accompanying data file, "vowel.data".  This
file is in the standard CMU Neural Network Benchmark format.

For a more detailed explanation of the problem, see the excerpt from Tony
Robinson's Ph.D. thesis in the COMMENTS section.  In Robinson's opinion,
connectionist problems fall into two classes, the possible and the
impossible.  He is interested in the latter, by which he

# Evaluation metric
#### Number of correctly classified vowels

## Read Data

In [72]:
train_df = pd.read_csv(train_filepath)
test_df = pd.read_csv(test_filepath)

In [73]:
train_df.dtypes

row.names      int64
y              int64
x.1          float64
x.2          float64
x.3          float64
x.4          float64
x.5          float64
x.6          float64
x.7          float64
x.8          float64
x.9          float64
x.10         float64
dtype: object

In [74]:
train_df.describe()

Unnamed: 0,row.names,y,x.1,x.2,x.3,x.4,x.5,x.6,x.7,x.8,x.9,x.10
count,528.0,528.0,528.0,528.0,528.0,528.0,528.0,528.0,528.0,528.0,528.0,528.0
mean,264.5,6.0,-3.166695,1.735343,-0.448002,0.524983,-0.38928,0.58496,0.017477,0.417394,-0.268112,-0.084568
std,152.56474,3.165277,0.957965,1.16097,0.741363,0.769361,0.722011,0.648547,0.479254,0.59558,0.619584,0.560317
min,1.0,1.0,-5.211,-1.274,-2.487,-1.409,-2.127,-0.836,-1.537,-1.293,-1.613,-1.68
25%,132.75,3.0,-3.923,0.91675,-0.9455,-0.0835,-0.93075,0.1085,-0.297,-0.01825,-0.67375,-0.507
50%,264.5,6.0,-3.097,1.733,-0.5025,0.4565,-0.417,0.5275,0.04,0.477,-0.255,-0.0825
75%,396.25,9.0,-2.51175,2.40375,0.04925,1.164,0.1155,1.00975,0.348,0.86125,0.1375,0.301
max,528.0,11.0,-0.941,5.074,1.413,2.191,1.831,2.327,1.403,1.673,1.309,1.396


In [75]:
test_df.describe()

Unnamed: 0,row.names,y,x.1,x.2,x.3,x.4,x.5,x.6,x.7,x.8,x.9,x.10
count,462.0,462.0,462.0,462.0,462.0,462.0,462.0,462.0,462.0,462.0,462.0,462.0
mean,231.5,6.0,-3.246078,2.049102,-0.576076,0.504626,-0.210089,0.681998,-0.029327,0.244162,-0.34282,-0.056221
std,133.512172,3.165706,0.753377,1.170402,0.671069,0.748236,0.578353,0.544476,0.440483,0.532523,0.505557,0.650602
min,1.0,1.0,-4.982,-1.074,-2.091,-1.044,-1.733,-0.405,-1.282,-0.949,-1.409,-1.241
25%,116.25,3.0,-3.85575,1.194,-1.037,-0.04925,-0.612,0.27825,-0.31025,-0.16775,-0.72175,-0.5645
50%,231.5,6.0,-3.22,2.1015,-0.621,0.4185,-0.1815,0.593,0.0055,0.245,-0.358,-0.2575
75%,346.75,9.0,-2.7065,2.985,-0.181,0.96075,0.199,1.0385,0.24575,0.6515,0.0195,0.59475
max,462.0,11.0,-1.093,4.314,1.431,2.377,1.114,2.108,1.209,2.039,0.757,1.294


## Fit the discriminant

#### Preprocess the data calculate statistics

In [112]:
class_priors = train_df.drop("row.names", axis=1).value_counts("y")
class_priors = (class_priors/class_priors.sum()).values

X_train, y_train = train_df.drop(["row.names", "y"], axis=1).values, train_df.loc[:, "y"].values
X_test, y_test = test_df.drop(["row.names", "y"], axis=1).values, test_df.loc[:, "y"].values

In [114]:
print( f"Train sample size: {X_train.shape[0]}, Train predictors: {X_train.shape[1]}")
print( f"Test sample size: {X_test.shape[0]}, Test predictors: {X_test.shape[1]}")

Train sample size: 528, Train predictors: 10
Test sample size: 462, Test predictors: 10


In [115]:
qda = discriminant_analysis.QuadraticDiscriminantAnalysis(priors = class_priors)
qda.fit(X_train, y_train)

QuadraticDiscriminantAnalysis(priors=array([0.09090909, 0.09090909, 0.09090909, 0.09090909, 0.09090909,
       0.09090909, 0.09090909, 0.09090909, 0.09090909, 0.09090909,
       0.09090909]))

**For train data**

In [130]:
y_pred = qda.predict(X_train)
train_acc = (y_pred == y_train).sum()/y_pred.shape[0]
print(f"Training accuracy for QDA is: {train_acc}")

Training accuracy for QDA is: 0.9886363636363636


**For test data**

In [126]:
y_pred = qda.predict(X_test)
test_acc = (y_pred == y_test).sum()/y_pred.shape(0)

(528,)