In [1]:
import random

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# sklearn.cross_validation was removed in scikit-learn 0.20; cross_val_score
# now lives in sklearn.model_selection. The old path is kept only as a fallback.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score



In [2]:
# Seed every RNG in play so the notebook is reproducible: scikit-learn
# estimators without an explicit random_state draw from NumPy's global
# generator, which random.seed() alone does not touch.
random.seed(123)
np.random.seed(123)

# Load the Iris data from the CSV file in the working directory.
df = pd.read_csv("Iris_Data.csv")

In [3]:
# Features are the first four columns; the target is the 'Labels' column.
x = df.iloc[:, :4]
y = df['Labels']

# a) Split your data into train and test data.
# Hold out 20% of the rows for testing; random_state pins the shuffle.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [4]:
# b) Fit a decision tree classifier. How does it perform?
# random_state is fixed because the tree's split search is randomized;
# without it the report can change from one run to the next.
tree_clf = DecisionTreeClassifier(random_state=123)
ytree_pred = tree_clf.fit(xtrain, ytrain).predict(xtest)

# Per-class precision / recall / F1 on the held-out test set.
print(classification_report(ytest, ytree_pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        11
          1       1.00      0.92      0.96        13
          2       0.86      1.00      0.92         6

avg / total       0.97      0.97      0.97        30



It performs well on this dataset, with an average F1-score of 0.97. (I prefer the F1-score for comparing classifiers because it is the harmonic mean of precision and recall.) Normally I would use cross-validation or bootstrapping to reduce the risk of overfitting to a single train/test split.

In [5]:
# c) What about a K-Nearest Neighbors model? Please try with 1, 10, 20, 50 and 80 neighbors.
# Why do you think 50 and 80 neighbors works less well (answer in one or two sentences)?

# Fit one KNN model per neighbourhood size and print its test-set report.
ks = [1, 10, 20, 50, 80]
for k in ks:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(xtrain, ytrain)
    yknn_pred = knn.predict(xtest)
    print("k=", k)
    print(classification_report(ytest, yknn_pred))

k= 1
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        11
          1       1.00      1.00      1.00        13
          2       1.00      1.00      1.00         6

avg / total       1.00      1.00      1.00        30

k= 10
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        11
          1       1.00      0.92      0.96        13
          2       0.86      1.00      0.92         6

avg / total       0.97      0.97      0.97        30

k= 20
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        11
          1       1.00      0.92      0.96        13
          2       0.86      1.00      0.92         6

avg / total       0.97      0.97      0.97        30

k= 50
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        11
          1       1.00      0.92      0.96        13
          2  

The models with k = 50 and k = 80 perform less well because the larger k is, the less flexible a KNN classifier becomes: the decision boundary flattens towards a linear one, which reduces classification accuracy.

In [6]:
# Fit a random forest on the training split and evaluate it on the test split.
# random_state is fixed because the forest is built from random bootstrap
# samples and random feature subsets; without it every run gives a different model.
forest_clf = RandomForestClassifier(random_state=123)
yrf_pred = forest_clf.fit(xtrain, ytrain).predict(xtest)

# Per-class precision / recall / F1 on the held-out test set.
print(classification_report(ytest, yrf_pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        11
          1       1.00      0.92      0.96        13
          2       0.86      1.00      0.92         6

avg / total       0.97      0.97      0.97        30



In this case, the decision tree and the random forest perform equally well, with the same F1-score. However, this result comes from a single train/test split. Next, I will use cross-validation to reduce the bias of relying on one split.

In [7]:
# Decision Tree
# 20-fold cross-validation over the full dataset; the cell displays the mean score.
# random_state pins the tree so repeated runs of this cell give the same number.
# NOTE(review): neg_mean_squared_error treats the class labels as numeric values;
# a classification metric such as 'accuracy' or 'f1_macro' is the usual choice - confirm intent.
cross_val_score(
    DecisionTreeClassifier(random_state=123),
    x,
    y,
    scoring='neg_mean_squared_error',
    cv=20,
).mean()

-0.044444444444444439

In [8]:
# Random Forest
# 20-fold cross-validation over the full dataset; the cell displays the mean score.
# random_state pins the forest so repeated runs of this cell give the same number.
# NOTE(review): neg_mean_squared_error treats the class labels as numeric values;
# a classification metric such as 'accuracy' or 'f1_macro' is the usual choice - confirm intent.
cross_val_score(
    RandomForestClassifier(random_state=123),
    x,
    y,
    scoring='neg_mean_squared_error',
    cv=20,
).mean()

-0.036111111111111108

The random forest classifier does slightly better, since the absolute value of its negative mean squared error is smaller. However, when the cross-validation is repeated, the results of the two models are not much different.