In [52]:
# import the libraries
from sklearn.datasets import make_blobs # for creating random data
from sklearn.model_selection import train_test_split # for splitting the data
import matplotlib.pyplot as plt # for plotting the data
import numpy as np # to perform operations on arrays

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score # to compute the accuracy (we will also calculate the accuracy ourselves)

In [53]:
# Build a synthetic two-feature dataset made of four Gaussian clusters.
X, y = make_blobs(
    # one [feature_1, feature_2] centre per cluster
    centers=[[35, 30], [40, 60], [70, 40], [65, 80]],
    # samples drawn per cluster, in the same order as `centers` (20 + 40 + 40 + 100 = 200)
    n_samples=[20, 40, 40, 100],
    # per-cluster standard deviation; the last cluster is the most spread out
    cluster_std=[10, 10, 10, 15],
    # fixed seed so the generated points are reproducible
    random_state=2,
)

# make_blobs labels every sample with its cluster index (0-3). Collapse that into a
# binary target: 1 for samples drawn from cluster 3 (centred at [65, 80], i.e. the
# highest grades -> "pass") and 0 for samples from any other cluster ("fail").
y = (y == 3).astype(np.int16)

Here, $X$ is a 200-by-2 array representing 200 instances (students), each with two features (the grades for exams 1 and 2), and $y$ is a 1-D array of length 200 containing the class of each instance. Thus, for $0 \leq i < 200$,
$X[i,0]$ is the grade student $i$ obtained in exam 1, $X[i,1]$ is the grade for exam 2, and
$y[i]$ is student $i$'s final result: $y[i]=1$ means the student passed the course and $y[i]=0$ means they did not.

In [54]:
# Hold out half of the samples for evaluation; the other half is used for fitting.
# The fixed random_state makes the shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

## Logistic Regression

In [63]:
# Fit a logistic-regression classifier on the training half. fit() returns the
# fitted estimator, so construction and training can be chained.
logistic_regression = LogisticRegression().fit(X_train, y_train)

# Predict the pass/fail label for every held-out sample.
predictions = logistic_regression.predict(X_test)

# Accuracy = fraction of test samples whose predicted label equals the true label,
# i.e. np.mean(predictions == y_test). Reusing `predictions` here avoids the second
# predict() pass that logistic_regression.score(X_test, y_test) would run internally;
# both give the same value.
accuracy_score(y_test, predictions)


0.89

## K Nearest Neighbors

In [56]:
# 1-nearest-neighbour classifier: each sample takes the label of its single closest
# training point. weights="distance" makes closer neighbours count more when
# n_neighbors > 1; with a single neighbour it does not change the prediction.
# NOTE: there must be no trailing comma after the call -- it would bind a
# one-element tuple to the name instead of the estimator itself.
k_nearest_neighbors = KNeighborsClassifier(n_neighbors=1, weights="distance")



## Decision Tree

In [57]:
# Decision tree: recursively splits the data on feature thresholds; max_depth=3 caps
# the tree at three levels of splits. random_state is fixed because scikit-learn
# permutes the features at every split, so equally good splits could otherwise be
# resolved differently from one run to the next.
decision_tree = DecisionTreeClassifier(max_depth=3, random_state=0)