## Setup stuff (don't edit)

### Imports

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_blobs, make_classification, make_circles, make_moons, make_regression

### Datasets

In [None]:
# Every dataset below is generated with this many samples.
num_data_points = 400

# make_moons output with noise=0.2
moons_data = make_moons(
    n_samples=num_data_points,
    random_state=42,
    noise=0.2,
)

# Two-class, two-feature make_classification output (one cluster per class)
classification_data = make_classification(
    n_samples=num_data_points,
    random_state=42,
    n_classes=2,
    n_clusters_per_class=1,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    class_sep=1,
)

# make_blobs output with three centers and cluster_std=3
blobs_data = make_blobs(
    n_samples=num_data_points,
    random_state=42,
    n_features=2,
    centers=3,
    cluster_std=3,
)

# make_circles output with factor=0.5 and noise=0.1
circle_data = make_circles(
    n_samples=num_data_points,
    random_state=42,
    factor=0.5,
    noise=0.1,
)

# All four datasets, in the order they are introduced in Section 1.
datasets = [
    moons_data,
    classification_data,
    blobs_data,
    circle_data,
]

### Data Visualizing

In [None]:
# Visualize data as a scatter plot
def visualize_data(data):
    """Show a 5x5-inch scatter plot of a dataset.

    `data` is unpacked as an `(X, y)` pair. Columns 0 and 1 of `X` are used as
    the horizontal and vertical coordinates, and `y` sets each point's colour.
    Nothing is returned; the figure is displayed with `plt.show()`.
    """
    figure, axes = plt.subplots(figsize=(5, 5))
    points, labels = data

    axes.scatter(
        points[:, 0],
        points[:, 1],
        c=labels,
        cmap='viridis',
        s=50,
        edgecolor='k',
    )

    # Both axis labels share the same font size.
    for apply_label, text in ((axes.set_xlabel, "Feature 1"),
                              (axes.set_ylabel, "Feature 2")):
        apply_label(text, fontsize=12)

    plt.show()

In [None]:
# Visualize a model's decision boundary on top of a scatter plot of the data
def visualize_boundary(X, y, model, acc, grid_points=300):
    """Plot the class a model predicts across the plane, with the data drawn on top.

    Parameters
    ----------
    X : 2-D array
        Columns 0 and 1 are used as the horizontal and vertical coordinates.
    y : array
        Labels used to colour the scattered points.
    model : object
        Must provide `predict`, which is called once with an
        `(grid_points**2, 2)` array of grid coordinates.
    acc : float
        Score shown in the plot title, formatted to two decimals.
    grid_points : int, default 300
        Number of sample positions along each axis of the prediction grid.

    Raises
    ------
    ValueError
        If `grid_points` is smaller than 2.
    """
    if grid_points < 2:
        raise ValueError("grid_points must be at least 2")

    # Pad the data range by one unit on every side so no point sits on the edge.
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1

    # Sample a fixed number of positions per axis instead of stepping by a fixed
    # 0.01: with a fixed step the grid size grows with the square of the data
    # range, so widely spread data produced millions of predictions.
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, grid_points),
                         np.linspace(y_min, y_max, grid_points))

    # Predict one label per grid position, then restore the 2-D grid layout.
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    # Plot decision boundary
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.contourf(xx, yy, Z, alpha=0.3, cmap="viridis")
    ax.scatter(X[:, 0], X[:, 1], c=y, cmap='viridis', s=50, edgecolor='k')
    ax.set_title(f"Accuracy: {acc:.2f}")
    ax.set_xlabel("Feature 1", fontsize=12)
    ax.set_ylabel("Feature 2", fontsize=12)
    plt.show()

# Section 1: The Data

We've created 4 interestingly-shaped datasets for you: `moons_data`, `classification_data`, `blobs_data`, and `circle_data`. Use the `visualize_data` function provided above to display each dataset.

In [None]:
# TODO: Use the visualize_data function to show what each dataset looks like.
#
# HINT: If the code is not working make sure you have run all the cells above!
# ...

# Section 2: Splitting The Data

Let's see what one of these datasets actually looks like by printing out `moons_data`.

In [None]:
# TODO: Uncomment the line below to display moons_data
# moons_data

What do we notice? We have a tuple (a pair of items) containing two arrays (lists). The first list stores sub-lists of length 2, and the second list has a bunch of 1s and 0s in it.

Which list do you think holds the features, and which holds the target? Uncomment the appropriate line of code below.

In [None]:
# TODO: Uncomment the appropriate line of code
# If you think the first list holds the features and the second list holds the target, uncomment this line of code:
# X, y = moons_data

# If you think the first list holds the target and the second list holds the features, uncomment this line of code:
# y, X = moons_data

To split the data into training and test sets, we use the `train_test_split` function. Use [this documentation link](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) to get familiar with `train_test_split`. Scroll down on the page to see examples of the function being used in the code.

In [None]:
# TODO: Split moons_data into train and test sets. Use a random state of 123. 30% of the data should be in the test set
# Then print out X_train and y_train below
#
# HINT: Make sure that your X_train, X_test... etc variables are in the right order
# ...

In [None]:
#@title Test case (DO NOT EDIT)
# Checks that the train/test pieces add up to the full data and that the split is 70% / 30%.
# The percentage checks cross-multiply integers (10 * part == 7 * whole) instead of comparing
# against len(X)*0.7: that float product is not always exact (90*0.7 evaluates to
# 62.99999999999999), so a correct split could be rejected if the dataset size changed.
assert len(X_train) + len(X_test) == len(X), "Expected X_train and X_test to combine to X"
assert len(y_train) + len(y_test) == len(y), "Expected y_train and y_test to combine to y"
assert 10*len(X_train) == 7*len(X), "Expected X_train to be 70% of X's size, got " + str(100*len(X_train)/len(X)) + "%"
assert 10*len(X_test) == 3*len(X), "Expected X_test to be 30% of X's size, got " + str(100*len(X_test)/len(X)) + "%"
assert 10*len(y_train) == 7*len(y), "Expected y_train to be 70% of y's size, got " + str(100*len(y_train)/len(y)) + "%"
assert 10*len(y_test) == 3*len(y), "Expected y_test to be 30% of y's size, got " + str(100*len(y_test)/len(y)) + "%"

# Section 3: The Models

Let's test out some models to see which ones work best on the `moons_data` dataset. 
### Tree-Based Models
Use [this documentation link](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html) to familiarize yourself with `DecisionTreeClassifier` and try it out. Scroll down to see how to use `fit` and `score`.

In [None]:
# TODO: Create a DecisionTreeClassifier, fit it to the training data, and score it with the testing data. Use a random state of 42
#
# HINT: Don't forget to modify the hyperparameter(s)! Play with them to see if you can find the best value for each.
# HINT: If your code isn't working because of an array reshaping issue, you might have picked the wrong order for X and y in moons_data! 
#       Try changing that and running this cell again.
#

tree_model = ...

# use .fit(training data)
# ...

# use .score(testing data)
tree_score = ...

In [None]:
# TODO: Uncomment the code below to display the decision boundary
# visualize_boundary(X_test, y_test, tree_model, tree_score)

In [None]:
# TODO: Uncomment the code below to display the tree
# plot_tree(tree_model, feature_names=["Feature 1", "Feature 2"], class_names=["Purple", "Yellow"], label="none", filled=True)

### Linear Models
Use [this documentation link](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) to familiarize yourself with `LogisticRegression` and try it out. Scroll down to see how to use `fit` and `score`.

In [None]:
# TODO: Create a LogisticRegression model, fit it to the training data, and score it with the testing data. Use a random state of 42
#
# HINT: Don't forget to modify the hyperparameter(s)! Play with them to see if you can find the best value for each.
# HINT: If your code isn't working because of an array reshaping issue, you might have picked the wrong order for X and y in moons_data! 
#       Try changing that and running this cell again.
#

lr_model = ...

# use .fit(training data)
# ...

# use .score(testing data)
lr_score = ...

In [None]:
# TODO: Uncomment the code below to display the decision boundary
# visualize_boundary(X_test, y_test, lr_model, lr_score)

### Non-Linear Models
Use [this documentation link](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html) to familiarize yourself with `SVC` (SVM RBF) and try it out. You can also try setting the kernel to `linear` to try out Linear SVM. Scroll down to see how to use `fit` and `score`.

In [None]:
# TODO: Create an SVM RBF model (an SVC with the 'rbf' kernel), fit it to the training data, and score it with the testing data. Use a random state of 42
#
# HINT: Don't forget to modify the hyperparameter(s)! Play with them to see if you can find the best value for each.
# HINT: If your code isn't working because of an array reshaping issue, you might have picked the wrong order for X and y in moons_data! 
#       Try changing that and running this cell again.
#

svc_model = ...

# use .fit(training data)
# ...

# use .score(testing data)
svc_score = ...

In [None]:
# TODO: Uncomment the code below to display the decision boundary
# This will take a minute to run - this is normal!
# visualize_boundary(X_test, y_test, svc_model, svc_score)

Use [this documentation link](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html) to familiarize yourself with `KNeighborsClassifier` (K-Nearest Neighbours) and try it out. Scroll down to see how to use `fit` and `score`.

In [None]:
# TODO: Create a KNeighborsClassifier, fit it to the training data, and score it with the testing data
#
# HINT: Don't forget to modify the hyperparameter(s)! Play with them to see if you can find the best value for each.
# HINT: If your code isn't working because of an array reshaping issue, you might have picked the wrong order for X and y in moons_data! 
#       Try changing that and running this cell again.
# HINT: Make sure you're using the American spelling of "neighbors"
#

knn_model = ...

# use .fit(training data)
# ...

# use .score(testing data)
knn_score = ...

In [None]:
# TODO: Uncomment the code below to display the decision boundary
# This will take a minute to run - this is normal!
# visualize_boundary(X_test, y_test, knn_model, knn_score)

# Section 4: Picking a Good Model


You might have noticed that, no matter how much you change the hyperparameters for a model, sometimes the model just doesn't fit the data that well. For example, if the purple dots form a curve, a line can only do so well at making a decision boundary. This is why we try to pick a model that will be able to form a good decision boundary on the data we're using.

Let's go back to the `classification_data`, `blobs_data`, and `circle_data` datasets. Consider the shape of the data in each dataset, and try to pick a model from the ones you explored above that you think will fit each dataset well. Don't forget to use `train_test_split` on each new dataset!
### `classification_data`

In [None]:
# TODO: Set up X, y, X_train, X_test, y_train, and y_test like you did above, but using classification_data
# Use a random state of 123. 30% of the data should be in the test set
#
# HINT: Make sure that your X_train, X_test... etc variables are in the right order
# ...

In [None]:
#@title Test case (DO NOT EDIT)
# Checks that the train/test pieces add up to the full data and that the split is 70% / 30%.
# The percentage checks cross-multiply integers (10 * part == 7 * whole) instead of comparing
# against len(X)*0.7: that float product is not always exact (90*0.7 evaluates to
# 62.99999999999999), so a correct split could be rejected if the dataset size changed.
assert len(X_train) + len(X_test) == len(X), "Expected X_train and X_test to combine to X"
assert len(y_train) + len(y_test) == len(y), "Expected y_train and y_test to combine to y"
assert 10*len(X_train) == 7*len(X), "Expected X_train to be 70% of X's size, got " + str(100*len(X_train)/len(X)) + "%"
assert 10*len(X_test) == 3*len(X), "Expected X_test to be 30% of X's size, got " + str(100*len(X_test)/len(X)) + "%"
assert 10*len(y_train) == 7*len(y), "Expected y_train to be 70% of y's size, got " + str(100*len(y_train)/len(y)) + "%"
assert 10*len(y_test) == 3*len(y), "Expected y_test to be 30% of y's size, got " + str(100*len(y_test)/len(y)) + "%"

In [None]:
# TODO: Try out a model, fit it to the training data, and score it with the testing data. If a random state is needed, use 42
#
# HINT: Don't forget to modify the hyperparameter(s)! Play with them to see if you can find the best value for each.
# HINT: If your code isn't working because of an array reshaping issue, you might not have split the data correctly above.
#       Try changing that and running this cell again.
# TIP: You can train more than one model to see which one works best! Add a new cell or create new models below.
#

classification_model = ...

# use .fit(training data)
# ...

# use .score(testing data)
classification_score = ...

In [None]:
# TODO: Uncomment the code below to display the decision boundary
# This may take a minute to run, depending on the model you picked
# visualize_boundary(X_test, y_test, classification_model, classification_score)

### `blobs_data`

In [None]:
# TODO: Set up X, y, X_train, X_test, y_train, and y_test like you did above, but using blobs_data
# Use a random state of 123. 30% of the data should be in the test set
#
# HINT: Make sure that your X_train, X_test... etc variables are in the right order
# ...

In [None]:
#@title Test case (DO NOT EDIT)
# Checks that the train/test pieces add up to the full data and that the split is 70% / 30%.
# The percentage checks cross-multiply integers (10 * part == 7 * whole) instead of comparing
# against len(X)*0.7: that float product is not always exact (90*0.7 evaluates to
# 62.99999999999999), so a correct split could be rejected if the dataset size changed.
assert len(X_train) + len(X_test) == len(X), "Expected X_train and X_test to combine to X"
assert len(y_train) + len(y_test) == len(y), "Expected y_train and y_test to combine to y"
assert 10*len(X_train) == 7*len(X), "Expected X_train to be 70% of X's size, got " + str(100*len(X_train)/len(X)) + "%"
assert 10*len(X_test) == 3*len(X), "Expected X_test to be 30% of X's size, got " + str(100*len(X_test)/len(X)) + "%"
assert 10*len(y_train) == 7*len(y), "Expected y_train to be 70% of y's size, got " + str(100*len(y_train)/len(y)) + "%"
assert 10*len(y_test) == 3*len(y), "Expected y_test to be 30% of y's size, got " + str(100*len(y_test)/len(y)) + "%"

In [None]:
# TODO: Try out a model, fit it to the training data, and score it with the testing data. If a random state is needed, use 42
#
# HINT: Don't forget to modify the hyperparameter(s)! Play with them to see if you can find the best value for each.
# HINT: If your code isn't working because of an array reshaping issue, you might not have split the data correctly above.
#       Try changing that and running this cell again.
# TIP: You can train more than one model to see which one works best! Add a new cell or create new models below.
#

blobs_model = ...

# use .fit(training data)
# ...

# use .score(testing data)
blobs_score = ...

In [None]:
# TODO: Uncomment the code below to display the decision boundary
# This may take a minute to run, depending on the model you picked
# visualize_boundary(X_test, y_test, blobs_model, blobs_score)

### `circle_data`

In [None]:
# TODO: Set up X, y, X_train, X_test, y_train, and y_test like you did above, but using circle_data
# Use a random state of 123. 30% of the data should be in the test set
#
# HINT: Make sure that your X_train, X_test... etc variables are in the right order
# ...

In [None]:
#@title Test case (DO NOT EDIT)
# Checks that the train/test pieces add up to the full data and that the split is 70% / 30%.
# The percentage checks cross-multiply integers (10 * part == 7 * whole) instead of comparing
# against len(X)*0.7: that float product is not always exact (90*0.7 evaluates to
# 62.99999999999999), so a correct split could be rejected if the dataset size changed.
assert len(X_train) + len(X_test) == len(X), "Expected X_train and X_test to combine to X"
assert len(y_train) + len(y_test) == len(y), "Expected y_train and y_test to combine to y"
assert 10*len(X_train) == 7*len(X), "Expected X_train to be 70% of X's size, got " + str(100*len(X_train)/len(X)) + "%"
assert 10*len(X_test) == 3*len(X), "Expected X_test to be 30% of X's size, got " + str(100*len(X_test)/len(X)) + "%"
assert 10*len(y_train) == 7*len(y), "Expected y_train to be 70% of y's size, got " + str(100*len(y_train)/len(y)) + "%"
assert 10*len(y_test) == 3*len(y), "Expected y_test to be 30% of y's size, got " + str(100*len(y_test)/len(y)) + "%"

In [None]:
# TODO: Try out a model, fit it to the training data, and score it with the testing data. If a random state is needed, use 42
#
# HINT: Don't forget to modify the hyperparameter(s)! Play with them to see if you can find the best value for each.
# HINT: If your code isn't working because of an array reshaping issue, you might not have split the data correctly above.
#       Try changing that and running this cell again.
# TIP: You can train more than one model to see which one works best! Add a new cell or create new models below.
#

circle_model = ...

# use .fit(training data)
# ...

# use .score(testing data)
circle_score = ...

In [None]:
# TODO: Uncomment the code below to display the decision boundary
# This may take a minute to run, depending on the model you picked
# visualize_boundary(X_test, y_test, circle_model, circle_score)