In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, confusion_matrix, classification_report
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20;
# sklearn.model_selection provides KFold and cross_val_score.
from sklearn import cross_validation

In [3]:
# Load the dataset from "Iris.csv" (path is relative to the working directory).
df = pd.read_csv("Iris.csv")

In [10]:
# Preview the first rows to check the column names and value formats.
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
# Features are the four measurement columns and the target is the species label.
# Columns are selected by name so the split does not depend on column order
# (a positional slice would silently pick up `Id` if the CSV layout shifted).
feature_columns = ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"]
X = df[feature_columns]
Y = df["Species"]

# Hold out 25% of the rows for final validation; the fixed seed keeps the
# split reproducible across runs.
validation_size = 0.25
seed = 7
X_train, X_validation, y_train, y_validation = train_test_split(
    X, Y, test_size=validation_size, random_state=seed
)

# Cross-validation settings shared by the model-comparison cell (k = 10).
num_folds = 10
num_instances = len(X_train)
scoring = 'accuracy'

In [6]:
# Evaluate each candidate classifier with k-fold cross-validation on the
# training split to determine the better algorithm.
models = [
    ('LR', LogisticRegressionCV()),
    ('KNN', KNeighborsClassifier()),
    ('DT', DecisionTreeClassifier()),
    ('RF', RandomForestClassifier()),
    ('SVC', SVC()),
    ('NB', MultinomialNB()),
]

# sklearn.cross_validation was removed in scikit-learn 0.20. The
# model_selection KFold takes `n_splits` and infers the sample count from the
# data it is asked to split. The folds stay unshuffled (contiguous slices),
# which matches the previous call: its random_state had no effect without
# shuffle=True. The same splitter is reused so every model sees identical folds.
kfold = KFold(n_splits=num_folds)

results = []
names = []
for name, model in models:
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # Report mean accuracy across folds with its standard deviation.
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LR: 0.928030 (0.054340)
KNN: 0.981818 (0.036364)
DT: 0.972727 (0.041660)
RF: 0.972727 (0.041660)
SVC: 0.990909 (0.027273)
NB: 0.883333 (0.099135)


In [7]:
# Train a default-parameter SVC on the training split, then score it on the
# held-out validation split.
svc = SVC().fit(X_train, y_train)
y_pred = svc.predict(X_validation)

# Overall accuracy, expressed as a percentage.
accuracy = float(accuracy_score(y_validation, y_pred)) * 100
print("The accuracy is: ", accuracy, "%")

# Per-class view: confusion matrix first, then the precision/recall/F1 report.
cm = confusion_matrix(y_validation, y_pred)
print(cm)
per_class_report = classification_report(y_validation, y_pred)
print(per_class_report)

The accuracy is:  94.73684210526315 %
[[11  0  0]
 [ 0 12  2]
 [ 0  0 13]]
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        11
Iris-versicolor       1.00      0.86      0.92        14
 Iris-virginica       0.87      1.00      0.93        13

    avg / total       0.95      0.95      0.95        38



In [8]:
import pickle

In [9]:
# Persist the fitted SVC to disk.
pkl_filename = "model.pkl"
# Open with 'wb' (not 'ab'): appending would stack a new pickle after the old
# one on every re-run, and pickle.load() only reads the first object in the
# file, so a stale model would be loaded. The context manager closes the
# handle even if dump() raises.
with open(pkl_filename, 'wb') as pfm:
    pickle.dump(svc, pfm)