## Setup stuff (don't edit)

### Imports

In [3]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_blobs, make_classification, make_circles, make_moons, make_regression

### Datasets

In [4]:
# Number of samples generated for every dataset below.
num_data_points = 400

# Each generator is called with random_state=42 so the same points are
# produced every time this cell is run.

# make_moons data with noise=0.2.
moons_data = make_moons(
    n_samples=num_data_points,
    noise=0.2,
    random_state=42,
)

# make_classification data: 2 classes, 2 informative features, no redundant
# features, one cluster per class.
classification_data = make_classification(
    n_samples=num_data_points,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_clusters_per_class=1,
    n_classes=2,
    class_sep=1,
    random_state=42,
)

# make_blobs data: 3 centers with cluster_std=3, in 2 features.
blobs_data = make_blobs(
    n_samples=num_data_points,
    n_features=2,
    centers=3,
    cluster_std=3,
    random_state=42,
)

# make_circles data with factor=0.5 and noise=0.1.
circle_data = make_circles(
    n_samples=num_data_points,
    factor=0.5,
    noise=0.1,
    random_state=42,
)

# All four datasets collected in one list, in the order they were created.
datasets = [moons_data, classification_data, blobs_data, circle_data]

### Model Initializing

In [None]:
def linear_model(c):
    """Build an unfitted LogisticRegression.

    `c` is passed straight through as the estimator's `C` argument; the
    random_state is fixed at 42.
    """
    classifier = LogisticRegression(C=c, random_state=42)
    return classifier

In [None]:
def linear_model_complex(c):
    """Build an unfitted SVC that uses the "linear" kernel.

    `c` is passed straight through as the estimator's `C` argument; the
    random_state is fixed at 42.
    """
    classifier = SVC(kernel="linear", C=c, random_state=42)
    return classifier

In [None]:
def tree_model(depth):
    """Build an unfitted DecisionTreeClassifier.

    `depth` is passed straight through as the estimator's `max_depth`
    argument; the random_state is fixed at 42.
    """
    classifier = DecisionTreeClassifier(max_depth=depth, random_state=42)
    return classifier

In [None]:
def non_linear(neighbours):
    """Build an unfitted KNeighborsClassifier.

    `neighbours` is passed straight through as the estimator's
    `n_neighbors` argument.
    """
    classifier = KNeighborsClassifier(n_neighbors=neighbours)
    return classifier

In [None]:
def non_linear_complex(gamma, c):
    """Build an unfitted SVC that uses the "rbf" kernel.

    `gamma` and `c` are passed straight through as the estimator's `gamma`
    and `C` arguments; the random_state is fixed at 42.
    """
    classifier = SVC(kernel="rbf", gamma=gamma, C=c, random_state=42)
    return classifier

### Data Visualizing

In [7]:
# Visualize data as a scatter plot
def visualize_data(data, title=None):
    """Show a dataset as a scatter plot of its first two feature columns.

    Parameters
    ----------
    data : pair (X, y)
        `X` is indexed as a 2-D array (columns 0 and 1 are plotted on the x
        and y axes); `y` supplies one colour value per point.
    title : str, optional
        Text shown above the plot, e.g. the dataset's name, so that several
        plots displayed one after another can be told apart. When omitted
        the plot has no title, exactly as before.

    Returns
    -------
    None
        The figure is displayed with `plt.show()`.
    """
    X, y = data

    fig, ax = plt.subplots(figsize=(5, 5))
    ax.scatter(X[:, 0], X[:, 1], c=y, cmap='viridis', s=50, edgecolor='k')
    if title is not None:
        ax.set_title(title)
    ax.set_xlabel("Feature 1", fontsize=12)
    ax.set_ylabel("Feature 2", fontsize=12)
    plt.show()

In [None]:
# Visualize feature boundary in scatter plot
def visualize_boundary(X, y, model=None, step=0.01):
    """Plot a fitted classifier's decision regions behind the points (X, y).

    The classifier predicts a label for every point of a regular grid that
    covers the data (padded by 1 on each side); the predictions are drawn as
    filled contours, the real points are drawn on top coloured by `y`, and
    the accuracy of the classifier on (X, y) is shown in the title.

    Parameters
    ----------
    X : 2-D array
        Feature rows; columns 0 and 1 are the two plotted features.
    y : 1-D array
        The TRUE labels for `X`. They are compared with `model.predict(X)`
        to compute the accuracy, so passing predictions here would always
        report an accuracy of 1.00.
    model : fitted classifier, optional
        Object with a `.predict` method. When omitted, the notebook-level
        variable named `model` is used, which is how this function has
        always been called.
    step : float, optional
        Spacing of the prediction grid. The number of grid points (and so
        the number of predictions) grows with 1 / step**2, so a larger step
        such as 0.05 is much faster for widely spread data or slow models.

    Raises
    ------
    NameError
        If no `model` argument is given and no global `model` exists.
    """
    if model is None:
        # Backward-compatible default: fall back to the global `model`
        # that the exercise cells create before calling this function.
        try:
            model = globals()["model"]
        except KeyError:
            raise NameError(
                "visualize_boundary needs a fitted classifier: create a variable "
                "called `model` first, or pass one with the `model` argument"
            ) from None

    # Accuracy on the points being displayed
    acc = accuracy_score(y, model.predict(X))

    # Grid covering the data range, padded by 1 unit on every side
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, step), np.arange(y_min, y_max, step))

    # One prediction per grid point, reshaped back onto the grid for contourf
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    Z = model.predict(grid_points).reshape(xx.shape)

    # Plot decision boundary and points
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.contourf(xx, yy, Z, alpha=0.3, cmap="viridis")
    ax.scatter(X[:, 0], X[:, 1], c=y, cmap='viridis', s=50, edgecolor='k')
    ax.set_title(f"Accuracy: {acc:.2f}")
    ax.set_xlabel("Feature 1", fontsize=12)
    ax.set_ylabel("Feature 2", fontsize=12)
    plt.show()

# Section 1: The Data

We've created 4 interestingly-shaped datasets for you: `moons_data`, `classification_data`, `blobs_data`, and `circle_data`. Use the `visualize_data` function provided above to display each dataset.

In [15]:
# TODO: Use the visualize_data function to show what each dataset looks like.
#
# HINT: If the code is not working make sure you have run all the cells above!
# ...

# Section 2: Splitting The Data

Let's see what one of these datasets actually looks like and print out `moons_data`.

In [39]:
# TODO: Uncomment the line below to display moons_data
# moons_data

What do we notice? We have a tuple (pair of two items) with two arrays (lists) inside it. The first list stores sub-lists of length 2, and the second list has a bunch of 1s and 0s in it. 

Which list do you think is the features and which do you think is the target? Uncomment the appropriate line of code below.

In [36]:
# TODO: Uncomment the appropriate line of code
# If you think the first list holds the features and the second list holds the target, uncomment this line of code:
# X, y = moons_data

# If you think the first list holds the target and the second list holds the features, uncomment this line of code:
# y, X = moons_data

To split the data into training and test sets, we use the `train_test_split` function. Use [this documentation link](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) to get familiar with `train_test_split`. Scroll down on the page to see examples of the function being used in the code.

In [37]:
# TODO: Split moons_data into train and test sets. Use a random state of 123. 30% of the data should be in the test set
# Then print out X_train and y_train below
#
# HINT: Make sure that your X_train, X_test... etc variables are in the right order
# ...

In [38]:
#@title Test case (DO NOT EDIT)
# Checks that the train/test split covers all of the data and is 70% / 30%.
# The percentage checks use whole-number arithmetic (count * 10 == total * 7)
# instead of comparing against len(X)*0.7, because 0.7 and 0.3 cannot be stored
# exactly as floats and an `==` against a float product can fail on rounding alone.
assert len(X_train) + len(X_test) == len(X), "Expected X_train and X_test to combine to X"
assert len(y_train) + len(y_test) == len(y), "Expected y_train and y_test to combine to y"
assert len(X_train) * 10 == len(X) * 7, "Expected X_train to be 70% of X's size, got " + str(100*len(X_train)/len(X)) + "%"
assert len(X_test) * 10 == len(X) * 3, "Expected X_test to be 30% of X's size, got " + str(100*len(X_test)/len(X)) + "%"
assert len(y_train) * 10 == len(y) * 7, "Expected y_train to be 70% of y's size, got " + str(100*len(y_train)/len(y)) + "%"
assert len(y_test) * 10 == len(y) * 3, "Expected y_test to be 30% of y's size, got " + str(100*len(y_test)/len(y)) + "%"

# Section 3: The Models

Let's test out some models to see how each one draws its decision boundary on our datasets.

### Tree Models

In [None]:
# Select and split the dataset here, like the other model cells do, so this
# cell always trains on moons_data no matter which cells were run before it.
X, y = moons_data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

depth = 2

model = DecisionTreeClassifier(max_depth=depth, random_state=42)

# use .fit(training data) -- the training features and the training labels

# use .predict(testing data) -- the testing features

# call the function visualize_boundary with the testing features and the TRUE
# testing labels (it runs the model's prediction itself to compute the accuracy,
# so do not pass it the predictions)

In [None]:
from sklearn.tree import plot_tree

# TODO: Once the tree model in the cell above has been trained with .fit,
# uncomment the line below to draw the tree.
# plot_tree(model, feature_names=["Feature 1", "Feature 2"], class_names=["Purple", "Yellow"], label="none", filled=True)

### Linear Models

In [None]:
# Logistic regression on classification_data, with a 70% / 30% train/test split.
X, y = classification_data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

# Value passed to the model as its C argument
c = 1

model = LogisticRegression(C=c, random_state=42)

# use .fit(training data) -- the training features and the training labels

# use .predict(testing data) -- the testing features

# call the function visualize_boundary with the testing features and the TRUE
# testing labels (it runs the model's prediction itself to compute the accuracy,
# so do not pass it the predictions)

In [None]:
# SVC with a linear kernel on classification_data, with a 70% / 30% train/test split.
X, y = classification_data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

# Value passed to the model as its C argument
c = 1

model = SVC(kernel="linear", C=c, random_state=42)

# use .fit(training data) -- the training features and the training labels

# use .predict(testing data) -- the testing features

# call the function visualize_boundary with the testing features and the TRUE
# testing labels (it runs the model's prediction itself to compute the accuracy,
# so do not pass it the predictions)

### Non-Linear Models

In [None]:
# SVC with an rbf kernel on circle_data, with a 70% / 30% train/test split.
X, y = circle_data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

# Values passed to the model as its gamma and C arguments
g = 0.1
c = 1

model = SVC(kernel="rbf", gamma=g, C=c, random_state=42)

# use .fit(training data) -- the training features and the training labels

# use .predict(testing data) -- the testing features

# call the function visualize_boundary with the testing features and the TRUE
# testing labels (it runs the model's prediction itself to compute the accuracy,
# so do not pass it the predictions)

In [None]:
# K-nearest-neighbours classifier on blobs_data, with a 70% / 30% train/test split.
X, y = blobs_data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

# Heads up! This model takes a long time to run (you will learn more about why tomorrow)
# Suggestion: run once all your other models are done

# Value passed to the model as its n_neighbors argument
neighbours = 5

model = KNeighborsClassifier(n_neighbors=neighbours)

# use .fit(training data) -- the training features and the training labels

# use .predict(testing data) -- the testing features

# call the function visualize_boundary with the testing features and the TRUE
# testing labels (it runs the model's prediction itself to compute the accuracy,
# so do not pass it the predictions)