# 6. Machine Learning with SKlearn 2

In [None]:
from sklearn.datasets import load_wine
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
%matplotlib inline

# 6.1 K-Nearest Neighbors - solution

## 6.1.1 Example 1: Classification

In [None]:
# Load the wine dataset that ships with scikit-learn (3 target classes).
wine = load_wine()

In [None]:
# Print the description text stored on the dataset object.
print(wine.DESCR)

In [None]:
X = wine.data  # feature matrix (the independent variables)
y = wine.target  # class label per sample (the dependent variable)

In [None]:
# Put every feature column and the class label side by side in one
# DataFrame so seaborn can colour the points by class.
wine_df = pd.DataFrame(
    data=np.column_stack((wine.data, wine.target)),
    columns=wine.feature_names + ['target'],
)

# Features shown in the pair plot (this list is reused by later cells).
vars = ['proline', 'flavanoids']
# A wider alternative selection:
#vars = ['proline', 'flavanoids', 'color_intensity', 'od280/od315_of_diluted_wines']
sns.pairplot(data=wine_df, vars=vars, hue='target')

In [None]:
# Peek at three randomly drawn rows.
wine_df.sample(n=3)

In [None]:
# Keep just the two selected features so the decision regions can be
# drawn in a 2-D plot later on.
X_subset = wine_df[vars].to_numpy(copy=True)

### Preprocess data for KNN: scaling features (0-1)

In [None]:
# Min-max scaling: map each selected feature onto the interval [0, 1].
# The statistics are taken over every row of X_subset.
min_on_training = np.min(X_subset, axis=0)  # per-feature minimum
range_on_training = np.max(X_subset, axis=0) - min_on_training  # max - min

# Shift by the minimum, then divide by the range.
X_subset = (X_subset - min_on_training) / range_on_training

We will fit the classifier with both `weights` settings ('uniform' and 'distance') in a single loop. Most of the code is only there to produce a nice plot.

In [None]:
from matplotlib.colors import ListedColormap

# Fit a KNN classifier on the two scaled features once per weighting
# scheme and draw its decision regions together with the training points.

h = .02  # step size in the mesh
n_neighbors = 5  # see what happens if you change this value

# Colour maps: light shades for the decision regions, bold for the points.
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

# The mesh [x_min, x_max] x [y_min, y_max] depends only on the data, not on
# the weighting scheme, so it is built once instead of on every iteration.
x_min, x_max = X_subset[:, 0].min() - 1, X_subset[:, 0].max() + 1
y_min, y_max = X_subset[:, 1].min() - 1, X_subset[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
mesh_points = np.c_[xx.ravel(), yy.ravel()]  # one (x, y) row per mesh node

# Regarding the `weights` parameter value:
#   'uniform'  : uniform weights. All points in each neighborhood are
#                weighted equally.
#   'distance' : weight points by the inverse of their distance. Closer
#                neighbors of a query point then have a greater influence
#                than neighbors which are further away.
for weights in ['uniform', 'distance']:
    # Create a neighbours classifier for this weighting and fit the data.
    clf = KNeighborsClassifier(n_neighbors, weights=weights)
    clf.fit(X_subset, y)

    # Predict a class for every mesh node and restore the grid shape so the
    # predictions can be drawn as coloured regions.
    Z = clf.predict(mesh_points).reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

    # Overlay the training points, coloured by their true class.
    plt.scatter(X_subset[:, 0], X_subset[:, 1], c=y, cmap=cmap_bold,
                edgecolor='k', s=20)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("3-Class classification (k = %i, weights = '%s')"
              % (n_neighbors, weights))

plt.show()

Now let's use all features

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the samples for testing. The split is seeded for
# reproducibility and stratified on the class labels.
(X_train, X_test, y_train, y_test) = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1234,
    stratify=y,
)

In [None]:
from sklearn.pipeline import make_pipeline

weights = 'uniform'
n_neighbors = 10

# KNN compares samples by distance, so features measured on large scales
# would dominate the neighbour search if left untouched. Chaining the scaler
# and the classifier in a pipeline means the scaling statistics are learned
# from the training data only and re-applied automatically whenever the
# pipeline predicts or scores.
clf = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors, weights=weights),
)
clf.fit(X_train, y_train)

In [None]:
# Score the fitted classifier on the held-out test split.
clf.score(X_test, y_test)

## 6.1.2 Example 2: Regression

In [None]:
from sklearn.neighbors import KNeighborsRegressor

try:
    from sklearn.datasets import load_boston
except ImportError:
    # scikit-learn 1.2 removed load_boston, so the import above fails on
    # current releases. Define a drop-in replacement so the rest of the
    # notebook keeps working unchanged.
    from sklearn.utils import Bunch

    def load_boston():
        """Return the Boston house-price data as a Bunch.

        The arrays are rebuilt from the original StatLib file using the
        recipe published in scikit-learn's removal notice, so this needs
        network access.

        Returns:
            Bunch with ``data`` (feature matrix) and ``target`` (prices).
        """
        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
        # Each record is spread over two consecutive rows of the raw file:
        # the even rows plus the first two values of the odd rows hold the
        # features; the third value of the odd rows is the target.
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]
        return Bunch(data=data, target=target)

In [None]:
# Boston house prices: features go into X, the value to predict into y.
boston = load_boston()
X, y = boston.data, boston.target

In [None]:
# Min-max scaling: map every feature onto the interval [0, 1].
# NOTE: despite the "_on_training" names, both statistics are computed over
# all rows of X, i.e. before any train/test split is made.
min_on_training = np.min(X, axis=0)  # per-feature minimum
range_on_training = np.max(X, axis=0) - min_on_training  # max - min

# Shift by the minimum, then divide by the range.
X = (X - min_on_training) / range_on_training

In [None]:
# Hold out a quarter of the samples for testing (seeded for reproducibility).
(X_train, X_test, y_train, y_test) = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1234,
)

In [None]:
# Distance-weighted 10-nearest-neighbour regressor, fitted on the training split.
n_neighbors = 10
weights = 'distance'
clf = KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights)
clf.fit(X_train, y_train)

In [None]:
# Score the fitted regressor on the held-out test split.
clf.score(X_test, y_test)

## 6.1.3 Task 1: Salary revisited 

You might remember the exercise from the previous class about clustering people based on salary and working hours. That exercise was an unsupervised example. Now load 'salary_hours_C6.csv'. This is a supervised example, as we now have a categorical outcome variable indicating the categories (as opposed to constructing them ourselves when clustering).

- Look at the data (as always)
- Transform your data if needed (you can try without and come back if needed)
- Construct a train and test set
- Construct a 1 nearest neighbor model
- Evaluate your model 
- Construct a 10 nearest neighbor model
- Evaluate your model
- Compare both k's, choose the best one
- Guess the category of somebody with Salary = 2000, hours = 9. Now predict it.
- Guess the category of somebody with Salary = 1700, hours = 8.5. Now predict it.

## 6.1.4 Solution

### 6.1.4.1 Import and look at the data

In [None]:
# Read the salary/hours data; the first CSV column becomes the row index.
salary_df = pd.read_csv('data/salary_hours_C6.csv', index_col=0)

In [None]:
# Show the first three rows.
salary_df.head(n=3)

In [None]:
# Let's look at the different categories: scatter hours against salary,
# coloured by category, with the regression fit switched off.
# x and y are passed by keyword because seaborn >= 0.12 no longer accepts
# them as positional arguments; keywords work on older releases as well.
sns.lmplot(x="hours", y="salarys", data=salary_df, hue='cat', fit_reg=False)

In [None]:
# Turn the string category labels into integer codes and store them in a
# new 'cat_enc' column.
enc = pd.factorize(salary_df['cat'])
cat_codes, cat_labels = enc  # integer code per row, distinct labels
salary_df['cat_enc'] = cat_codes

### 6.1.4.2 Construct a training and testing set

In [None]:
# Convert the whole data frame into a plain NumPy matrix.
salary_mx = salary_df.to_numpy()

In [None]:
# Features: the first two columns of the matrix
# (presumably salary and hours -- confirm against the CSV column order).
X_salary = salary_mx[:, 0:2]
X_salary.shape

In [None]:
# Labels: the column at index 3, cast to 64-bit integers
# (presumably the 'cat_enc' codes -- confirm against the column order).
Y_salary = salary_mx[:, 3].astype('int64')
Y_salary.shape

In [None]:
# Hold out a quarter of the samples for testing (seeded for reproducibility).
(X_train, X_test, Y_train, Y_test) = train_test_split(
    X_salary,
    Y_salary,
    test_size=0.25,
    random_state=42,
)

### 6.1.4.3 Make the models

In [None]:
# 1-nearest-neighbour classifier, fitted on the training split.
n_neighbors = 1
clf = KNeighborsClassifier(n_neighbors=n_neighbors)
clf.fit(X_train, Y_train)

In [None]:
# Score the current classifier on the held-out test split.
clf.score(X_test, Y_test)

In [None]:
# 10-nearest-neighbour classifier, fitted on the same training split.
n_neighbors = 10
clf = KNeighborsClassifier(n_neighbors=n_neighbors)
clf.fit(X_train, Y_train)

In [None]:
# Score the current classifier on the held-out test split.
clf.score(X_test, Y_test)

In [None]:
# The two people from the task, one per row: (2000, 9) and (1700, 8.5).
# Assumes the model's two features are ordered salary, hours -- confirm.
test = np.array([
    [2000, 9],
    [1700, 8.5],
])
clf.predict(test)