In [1]:
from pandas import read_csv
from pandas.tools.plotting import scatter_matrix
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [2]:
# Column labels applied to the CSV on load: four measurements followed by the class label.
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']

# NOTE(review): absolute, machine-specific path -- this cell only runs where the file exists.
filename = '/home/siddhartha/Desktop/iris/iris.csv'

# Passing `names` makes read_csv use these labels as the column index.
dataset = read_csv(filename, names=names)

In [5]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
print((len(dataset.index), len(dataset.columns)))

(299, 5)


In [6]:
# Peek at the first 20 rows of the frame.
print(dataset.iloc[:20])

    sepal-length  sepal-width  petal-length  petal-width   class
0            5.1          3.5           1.4          0.2  setosa
1            NaN          NaN           NaN          NaN     NaN
2            4.9          3.0           1.4          0.2  setosa
3            NaN          NaN           NaN          NaN     NaN
4            4.7          3.2           1.3          0.2  setosa
5            NaN          NaN           NaN          NaN     NaN
6            4.6          3.1           1.5          0.2  setosa
7            NaN          NaN           NaN          NaN     NaN
8            5.0          3.6           1.4          0.2  setosa
9            NaN          NaN           NaN          NaN     NaN
10           5.4          3.9           1.7          0.4  setosa
11           NaN          NaN           NaN          NaN     NaN
12           4.6          3.4           1.4          0.3  setosa
13           NaN          NaN           NaN          NaN     NaN
14           5.0         

In [28]:
# Discard every row that contains at least one missing value, then show the
# first 20 surviving rows. The original row labels are kept (no index reset).
dataset = dataset.dropna(axis=0, how='any')
print(dataset.iloc[:20])

    sepal-length  sepal-width  petal-length  petal-width   class
0            5.1          3.5           1.4          0.2  setosa
2            4.9          3.0           1.4          0.2  setosa
4            4.7          3.2           1.3          0.2  setosa
6            4.6          3.1           1.5          0.2  setosa
8            5.0          3.6           1.4          0.2  setosa
10           5.4          3.9           1.7          0.4  setosa
12           4.6          3.4           1.4          0.3  setosa
14           5.0          3.4           1.5          0.2  setosa
16           4.4          2.9           1.4          0.2  setosa
18           4.9          3.1           1.5          0.1  setosa
20           5.4          3.7           1.5          0.2  setosa
22           4.8          3.4           1.6          0.2  setosa
24           4.8          3.0           1.4          0.1  setosa
26           4.3          3.0           1.1          0.1  setosa
28           5.8         

In [30]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
print(dataset.describe(percentiles=[0.25, 0.5, 0.75]))

       sepal-length  sepal-width  petal-length  petal-width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.054000      3.758667     1.198667
std        0.828066     0.433594      1.764420     0.763161
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000


In [32]:
# Number of rows per class label.
print(dataset.groupby(by='class').size())

class
setosa        50
versicolor    50
virginica     50
dtype: int64


In [33]:
# Split the frame into a feature matrix X and a label vector Y.
#
# `dataset.values` on a frame that mixes float and string columns produces a
# single object-dtype array, so a plain slice of it would hold Python objects
# rather than floats. The feature slice is therefore cast to float explicitly;
# the label column is left as-is.
array = dataset.values
X = array[:, 0:4].astype(float)  # columns 0-3: the four measurements
Y = array[:, 4]                  # column 4: the class label

In [36]:
# Hold back a fifth of the rows for validation; the fixed seed is passed as
# random_state so the split is repeatable.
seed = 7
validation_size = 0.20
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X,
    Y,
    test_size=validation_size,
    random_state=seed,
)

In [37]:
# Candidate classifiers as (label, estimator) pairs, all constructed with
# their default arguments. The labels deliberately keep their padding spaces.
models = [
    (' LR ', LogisticRegression()),
    (' LDA ', LinearDiscriminantAnalysis()),
    (' KNN ', KNeighborsClassifier()),
    (' CART ', DecisionTreeClassifier()),
    (' NB ', GaussianNB()),
    (' SVM ', SVC()),
]

In [40]:
# Score every candidate model with 10-fold cross-validation on the training
# split and print "<label>: <mean accuracy> (<std>)" for each.
#
# The splitter does not depend on the model, so it is built once outside the
# loop. `random_state` is intentionally not passed: without `shuffle=True` the
# folds are consecutive slices and the seed has no effect, and current
# scikit-learn releases raise a ValueError for that combination.
kfold = KFold(n_splits=10)

results = []  # one array of per-fold accuracies per model
names = []    # model labels, in the same order as `results` (rebinds `names`)
for name, model in models:
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

 LR : 0.966667 (0.040825)
 LDA : 0.975000 (0.038188)
 KNN : 0.983333 (0.033333)
 CART : 0.975000 (0.038188)
 NB : 0.975000 (0.053359)
 SVM : 0.991667 (0.025000)


In [41]:
# Fit a default k-nearest-neighbours classifier on the training split and
# predict the held-out validation rows (fit() returns the estimator itself).
knn = KNeighborsClassifier()
predictions = knn.fit(X_train, Y_train).predict(X_validation)

# Report accuracy, the confusion matrix and the per-class report, in that order.
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(Y_validation, predictions))

0.9
[[ 7  0  0]
 [ 0 11  1]
 [ 0  2  9]]
             precision    recall  f1-score   support

     setosa       1.00      1.00      1.00         7
 versicolor       0.85      0.92      0.88        12
  virginica       0.90      0.82      0.86        11

avg / total       0.90      0.90      0.90        30

