# Importing the libraries

In [None]:
from sklearn import datasets
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline


# Loading the data into a pandas dataframe

We will be using machine learning to try to determine the species that different flowers belong to based on their measurements.

The data set consists of 50 samples from each of three species of Iris ("Iris setosa", "Iris virginica", and "Iris versicolor"). 

Four features were measured from each sample: the length and the width of the sepals and petals, in centimetres. Based on the combination of these four features, we will train a classifier to distinguish the species from each other.

In [None]:
# Load the Iris data set from the sklearn datasets library
iris = datasets.load_iris()

# Save the predictor data to a pandas DataFrame, one column per measured feature
irisdf = pd.DataFrame(iris.data, columns=iris.feature_names)

# Save the class labels (i.e., the species code of every row) as a column called "species"
irisdf['species'] = iris.target

# Let's see the first 10 rows of the data.
# print(...) with a single argument produces the same output on Python 2 and
# Python 3, whereas the bare print statement is a SyntaxError on Python 3.
print(irisdf.head(10))

# Initial Visualization

Plot each data point in terms of its "Petal Length" and "Petal Width"

We know what the actual species is for each data point, so we'll color each point according to its species in the plot

In [None]:
# Map each numeric species code (looked up by its string form) to a matplotlib colour
cmap = {'0': 'r', '1': 'g', '2': 'b' }

# Store the colour of every row in a helper column so it can be passed to the plot
irisdf['ctarget'] = irisdf.species.apply(lambda x: cmap[str(x)])

# Draw the scatter plot once. The cell previously issued the same plot call a
# second time inside a print statement, which produced a duplicate figure and
# only printed the repr of the returned Axes object.
irisdf.plot('petal length (cm)', 'petal width (cm)', kind='scatter', c=irisdf.ctarget)


# K-Nearest Neighbor (KNN) Classifier

Let's build a K-Nearest neighbor classifier to classify the species based on their measurements.

In [None]:
from sklearn import neighbors

# The four flower measurements are the predictors.
# NOTE: do not derive this list with irisdf.columns[:-1]. Once the 'ctarget'
# colour column has been appended to irisdf, that slice drops 'ctarget' but
# keeps 'species', which would hand the class label to the classifier as a
# feature (target leakage) and make every score look artificially good.
predictors = ["sepal length (cm)", "sepal width (cm)", "petal length (cm)",  "petal width (cm)"]

# Feature matrix: one row per flower, one column per measurement
X = irisdf[predictors]

# Target vector: the numeric species code of every row
labels=irisdf['species']

# n_neighbors is our option in KNN. We'll tune this value to attempt to improve our prediction.
knn = neighbors.KNeighborsClassifier(n_neighbors=5, weights='uniform')
knn.fit(X, labels)





# Classification metrics

Print out the performance of the classifier for different classification metrics

Confusion matrix

Accuracy = (TP + TN) / (TP + TN + FP + FN)

Precision = TP / (TP + FP)

Recall = TP / (TP + FN)

F1 Score = 2 * (Precision * Recall) / (Precision + Recall)

In [None]:
from sklearn import metrics

# NOTE: these predictions are made on the same rows the classifier was fitted
# on, so the numbers below describe training fit, not out-of-sample performance.
predicted = knn.predict(X)

# Rows of the confusion matrix are the true labels, columns the predicted ones.
# print(...) with a single argument behaves identically on Python 2 and 3;
# the bare print statement used before is a SyntaxError on Python 3.
print(metrics.confusion_matrix(labels,predicted))

print("Accuracy: %f" % metrics.accuracy_score(labels,predicted))

# average='weighted' combines the per-class scores, weighting each class by its support
print("Precision: %f" % metrics.precision_score(labels,predicted,average='weighted'))

print("Recall: %f " % metrics.recall_score(labels,predicted,average='weighted'))

print("F1 Score: %f" % metrics.f1_score(labels,predicted,average='weighted'))




# Finding the Best value for "K"

Sklearn has a function to try out all of the different parameter values for a range that you provide. It then can go through trying each value and seeing what the average cross-validated score for that parameter was.

All of the cross-validated scores will be saved and you can print them

In [None]:
from sklearn import grid_search, cross_validation

# Set the range of possible K-values: all integers from 2 to 99
# (range excludes its upper bound, so 100 itself is not tried)
k = range(2, 100)
# Make a dictionary that will hold the values we will pass on to the KNeighborsClassifier() function
params = {'n_neighbors': k }

# Make a cross-validation iterator with 5 folds.
# The iris rows are ordered by species, so without shuffling each fold would be
# a contiguous block dominated by a single species and the scores would be
# biased. shuffle=True mixes the rows first; the fixed random_state keeps the
# folds (and therefore the scores) identical from run to run.
kf = cross_validation.KFold(len(irisdf), n_folds = 5, shuffle=True, random_state=0)

# Set up a grid search to try every value of K.
# We'll use accuracy as our scoring criterion for the search,
# but you can set it to the name of another scoring metric
gs = grid_search.GridSearchCV(
    estimator=neighbors.KNeighborsClassifier(),
    param_grid=params,
    cv=kf,
    scoring = 'accuracy'
)
gs.fit(X, labels)

# Show the cross-validated scores recorded for every K
gs.grid_scores_

# Try out the best fitting estimator

The best fitting KNN classifier from the grid search will be saved and you can use it to predict new values

In [None]:
# Predict with the fitted grid search object.
# NOTE: X and labels are the same rows gs was fitted on, so this is a
# training-set accuracy rather than an out-of-sample estimate.
predicted = gs.predict(X)

# print(...) with a single argument works on both Python 2 and Python 3
print(metrics.accuracy_score(labels,predicted))

# Plot the cross validated accuracy of different values for K

You can visualize the accuracy for each K-value by plotting the K-value on the X-axis and the Accuracy of that K-value on the Y-axis

In [None]:
# Collect the score stored at index 1 of every grid_scores_ entry (one entry per K),
# then draw it against the K values that were searched
mean_accuracy = [entry[1] for entry in gs.grid_scores_]
plt.plot(k, mean_accuracy)

Zoom into the area where you think the accuracy is at its highest

In [None]:
# Fill in the lower and upper bound for where you want to zoom into.
# For example, if you think that accuracy is highest when K = 20,
# then choose 10 as your lower bound, and 30 as your upper bound so you can see the K=20 accuracy
#
# NOTE: the bounds are slice positions into k and the score list, not K values.
# k starts at 2, so position i corresponds to K = i + 2.
# The two assignments below are left blank as an exercise; the cell will not
# run until both are given integer values.

lowerbound = 
upperbound = 
plt.plot(k[lowerbound:upperbound],[s[1] for s in  gs.grid_scores_][lowerbound:upperbound],)

# Regression

KNN can be applied to a regression context

Let's compare the held-out test performance of the KNN regressor to that of a linear regression on the bike share data

In [None]:
bikedf = pd.read_csv("bikeshare.csv")

# One-hot encode the weather column into indicator columns named weather_<value>
weather = pd.get_dummies(bikedf.weather, prefix='weather')
predictors = ["holiday","temp","humidity","windspeed", "workingday"]

# Make a data frame of just the predictors; only the weather_1, weather_2 and
# weather_3 indicator columns are joined in
X = bikedf[predictors].join(weather[['weather_1', 'weather_2', 'weather_3']])
y = bikedf['count']

from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler

# Normalizing the predictor variables will improve the performance of the KNN
# and will not hurt the performance of the Linear Regression
# NOTE(review): the scaler is fitted on all rows before the split, so test-set
# statistics influence the scaling; consider fitting it on X_train only.
X= StandardScaler().fit_transform(X)

# Let's split our data into a training and a test set (33% held out).
# A fixed random_state makes the split, and therefore the R2 values printed
# below, reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

lr = LinearRegression()
# Note: this rebinds knn, which previously referred to the iris classifier
knn = neighbors.KNeighborsRegressor(n_neighbors=30, weights='uniform')

# Check the R2 fit on the held-out test set.
# print(...) with a single argument works on both Python 2 and Python 3.
print("R2 of Linear Regression: %f" % lr.fit(X_train,y_train).score(X_test,y_test))

print("R2 of KNN Regressor: %f" % knn.fit(X_train,y_train).score(X_test,y_test))