In [21]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
plt.style.use("ggplot")
%matplotlib inline

In [22]:
from pylab import rcParams
rcParams['figure.figsize'] = 12, 8

In [23]:
data = pd.read_csv("DMV_Written_Tests.csv")
data.head()

Unnamed: 0,DMV_Test_1,DMV_Test_2,Results
0,34.62366,78.024693,0
1,30.286711,43.894998,0
2,35.847409,72.902198,0
3,60.182599,86.308552,1
4,79.032736,75.344376,1


In [24]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   DMV_Test_1  100 non-null    float64
 1   DMV_Test_2  100 non-null    float64
 2   Results     100 non-null    int64  
dtypes: float64(2), int64(1)
memory usage: 2.5 KB


###  Visualize the Data
#### Creating a scatter Plot


In [25]:
passed = (result==1).reshape(100,1)
failed = (result==0).reshape(100,1)

ax = sns.scatterplot(x = scores[passed[:,0],0],
                     y = scores[passed[:,0],1],
                     marker = '^',
                     color = 'green',
                     s = 50)
sns.scatterplot(x = scores[failed[:,0],0],
                     y = scores[failed[:,0],1],
                     marker = 'X',
                     color = 'red',
                     s = 50)
ax.set(xlabel = 'Written Test Scores Paper 1',ylabel = 'Written Test Scores Paper 2')
ax.legend(['Passed','Failed'])
plt.show()

NameError: name 'result' is not defined

## Define the Logistic Sigmoid Function $\sigma(z)$
---

$$ \sigma(z) = \frac{1}{1+e^{-z}}$$

### $\sigma(z)$ squishes all the data --> [0,1]

In [26]:
def logistic_function(z):
    return 1/(1 + np.exp(-z))

In [27]:
logistic_function(0)

0.5

### Compute the Cost Function $J(\theta)$ and Gradient
---

The objective of logistic regression is to minimize the cost function

$$J(\theta) = -\frac{1}{m} \sum_{i=1}^{m} [ y^{(i)}log(h_{\theta}(x^{(i)})) + (1 - y^{(i)})log(1 - (h_{\theta}(x^{(i)}))]$$

where the gradient of the cost function is given by

$$ \frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m} \sum_{i=1}^{m} (h_{\theta}(x^{(i)}) - y^{(i)})x_j^{(i)}$$

In [28]:
def compute_cost(theta , x , y):
    m = len(y)
    y_pred = logistic_function(np.dot(x,theta))
    error = (y * np.log(y_pred)) + (1 - y) * np.log(1 - y_pred)
    cost = -1 / m * sum(error)
    gradient = 1 / m * np.dot(x.transpose() , (y_pred - y))
    return cost[0],gradient

In [29]:
mean_scores = np.mean(scores , axis = 0)
std_scores = np.std(scores , axis = 0)
scores = (scores - mean_scores) / std_scores

rows = scores.shape[0]
cols = scores.shape[1]

x = np.append(np.ones((rows,1)),scores , axis =1)
y = result.reshape(rows,1)

theta_init = np.zeros((cols + 1,1))
cost , gradient = compute_cost(theta_init, x ,y)

print("Cost at initialisation",cost)
print("Gradients at initialisation",gradient)

NameError: name 'scores' is not defined

In [30]:
def gradient_descent(x, y ,theta , alpha , iterations):
    costs = []
    for i in range(iterations):
        cost , gradient = compute_cost(theta , x, y)
        theta -= (alpha * gradient)
        costs.append(cost)
    return theta , cost
    

In [31]:
theta , costs = gradient_descent(x,y, theta_init , 1,200)



NameError: name 'x' is not defined

In [33]:
print("Theta after running gradient descent algorithm: ",theta)
print("Resulting cost :"costs[-1])

SyntaxError: invalid syntax (<ipython-input-33-25aa600f2bff>, line 2)

In [34]:
plt.plot(costs)
plt.xlabel("Iterations")
plt.ylabel("$J(\Theta)$")
plt.title("Values of Cost Function over Iterations of gradient Descent");

NameError: name 'costs' is not defined

### Task 9: Plotting the decision boundary
---

$h_\theta(x) = \sigma(z)$, where $\sigma$ is the logistic sigmoid function and $z = \theta^Tx$

When $h_\theta(x) \geq 0.5$ the model predicts class "1":

$\implies \sigma(\theta^Tx) \geq 0.5$

$\implies \theta^Tx \geq 0$ predict class "1" 

Hence, $\theta_1 + \theta_2x_2 + \theta_3x_3 = 0$ is the equation for the decision boundary, giving us 

$ x_3 = \frac{-(\theta_1+\theta_2x_2)}{\theta_3}$

In [38]:
ax = sns.scatterplot(x = X[passed[:,0],1],
                     y = X[passed[:,0],2]),
                     marker = "^",
                     color = 'green',
                     s = 60)
sns.scatterplot(x = X[failed[:,0],1],
                y = X[failed[:,0],2],
                marker = "X",
                color = 'red',
                s = 60)
ax.legend(['Passed','Failed'])
ax.set(xlabel = "DMV Written Test 1 Scores",ylable = "DMV Written Test 2 Scores")

x_boundary = np.array([np.mix(X[:,1]),np.max(X[:,1])])
y_boundary = -(theta[0] + theta[1]*x_boudnary) / theta[2]

sns.lineplot(x = x_boundary , y = y_boundary , color = "blue")
plt.show();

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 7)

### Task 10: Predictions using the optimized $\theta$ values
---

$h_\theta(x) = x\theta$

In [39]:
def predict(theta,x):
    results = x.dot(theta)
    return results>0

In [40]:
p = predict(theta,x)
print("Training Accuracy:",sum(p==y)[0],"%")

NameError: name 'theta' is not defined

In [41]:
test = np.array([50,79])
test = (test - mean_scores)/std_scores
test = np.append(np.ones(1) , test)
probability = logistic_function(test.dot(theta))
print("A person who scores 50 and 79 on their DMV written tests have a ",
      np.round(probability[0], 2),
      "probability of passing")


NameError: name 'mean_scores' is not defined