# Supervised Learning

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import pylab as pl
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.colors import ListedColormap, BoundaryNorm
import matplotlib.patches as mpatches
import matplotlib.patches as mpatches
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

## Multi-class classification

In [None]:
# Load the tab-separated fruits dataset into a DataFrame.
data_file = '../data/fruits_multilabel_classification.tsv'
fruits_df = pd.read_csv(data_file, sep='\t')

# Quick sanity check: table dimensions and the distinct fruit names present.
print('Shape: ', fruits_df.shape)
print('Unique Fruits: ', fruits_df['fruit_name'].unique())

# Last expression of the cell, so the notebook renders the first five rows.
fruits_df.head()

### Exploratory Data Analysis (EDA)

In [None]:
# Check whether the data is balanced: number of rows per fruit name.
rows_per_fruit = fruits_df.groupby('fruit_name').size()
print(rows_per_fruit)

# Summary statistics; left as the final expression so the notebook
# displays the resulting table.
fruits_df.describe()

In [None]:
# Bar chart of the number of samples per fruit name.
# The column is passed via the `x=` keyword together with `data=`: since
# seaborn 0.12 the only valid positional argument is `data`, so the original
# positional Series was no longer interpreted as the x variable (and the
# earlier deprecation warning was hidden by the global warnings filter).
# The former `label="Count"` argument is dropped: no legend is drawn in this
# cell, so it had no visible effect on the figure.
sns.countplot(x='fruit_name', data=fruits_df)
plt.show()

In [None]:
# Plot one histogram per remaining column that DataFrame.hist can draw,
# after removing the class-label column.
without_label = fruits_df.drop(columns='fruit_label')
without_label.hist(bins=30, figsize=(9, 9))

# Title the whole figure, save it to disk, then render it inline.
pl.suptitle("Histogram for each numeric input variable")
plt.savefig('fruits_hist')
plt.show()

### Split Train and Test

In [None]:
# Feature columns used as model inputs, and the label column as the target.
feature_names = ['mass', 'width', 'height', 'color_score']
X, y = fruits_df[feature_names], fruits_df['fruit_label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Report the sizes of the resulting partitions.
print('\nX_train shape: ', X_train.shape)
print('y_train shape: ', y_train.shape)

print('\nX_test shape: ', X_test.shape)
print('y_test shape: ', y_test.shape)

### Pre-process
-- `We can see that the numerical features are not on the same scale, so we rescale them. The scaler is fitted on the training set only, and the same fitted scaling is then applied to the test set.`

__MinMaxScaler:__ `For each value in a feature, MinMaxScaler subtracts the minimum value in the feature and then divides by the range. The range is the difference between the original maximum and original minimum.`

In [None]:
# Show a few raw rows so the effect of scaling is visible afterwards.
print('Before scaling.. ')
print('Train:\n', X_train.head(3))
print('Test:\n', X_test.head(3))

# Learn the per-feature min/max from the training data only, then apply that
# same transformation to both partitions.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# The transformed data are arrays, so plain slicing replaces .head().
print('\nAfter scaling.. ')
print('Train:\n', X_train[:3])
print('Test:\n', X_test[:3])

### Build Models - Logistic regression

In [None]:
# Fit a logistic regression classifier with default settings on the
# scaled training data.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Mean accuracy on the data the model was fitted on, then on held-out data.
logreg_train_acc = logreg.score(X_train, y_train)
print('Accuracy on training set: {:.2f}'.format(logreg_train_acc))
logreg_test_acc = logreg.score(X_test, y_test)
print('Accuracy on test set: {:.2f}'.format(logreg_test_acc))

### Build Models - Decision Trees

In [None]:
# Fit a decision tree on the scaled training data.
# random_state is fixed (matching the seed used for the train/test split)
# because the tree permutes features randomly when choosing splits; without
# a seed the reported accuracies can differ from one run to the next.
clf = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

# Mean accuracy on the training data and on the held-out test data.
print('Accuracy on training set: {:.2f}'.format(clf.score(X_train, y_train)))
print('Accuracy on test set: {:.2f}'.format(clf.score(X_test, y_test)))

### Build Models - KNN

In [None]:
# Fit a k-nearest-neighbours classifier with default settings on the
# scaled training data.
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

# Mean accuracy on the training data, then on the held-out test data.
knn_train_acc = knn.score(X_train, y_train)
print('Accuracy on training set: {:.2f}'.format(knn_train_acc))
knn_test_acc = knn.score(X_test, y_test)
print('Accuracy on test set: {:.2f}'.format(knn_test_acc))

### Confusion Matrix

In [None]:
# Evaluate the logistic regression model: predict labels for the test set
# and print scikit-learn's text classification report for those predictions.
pred = logreg.predict(X_test)
print(classification_report(y_test, pred))