# Analyzing, Cleaning, Scaling the data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import copy, math

%matplotlib inline

In [2]:
# Load the raw dataset; the relative path is resolved against the kernel's
# working directory, so car_data.csv must be reachable from there.
df = pd.read_csv('car_data.csv')

df

Unnamed: 0,User ID,Gender,Age,AnnualSalary,Purchased
0,385,Male,35,20000,0
1,681,Male,40,43500,0
2,353,Male,49,74000,0
3,895,Male,40,107500,1
4,661,Male,25,79000,0
...,...,...,...,...,...
995,863,Male,38,59000,0
996,800,Female,47,23500,0
997,407,Female,28,138500,1
998,299,Female,48,134000,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   User ID       1000 non-null   int64 
 1   Gender        1000 non-null   object
 2   Age           1000 non-null   int64 
 3   AnnualSalary  1000 non-null   int64 
 4   Purchased     1000 non-null   int64 
dtypes: int64(4), object(1)
memory usage: 39.2+ KB


In [4]:
df.isnull().sum()

User ID         0
Gender          0
Age             0
AnnualSalary    0
Purchased       0
dtype: int64

In [5]:
# NOTE(review): duplicates are removed *after* 'User ID' is dropped, so distinct
# users who happen to share Age/AnnualSalary/Purchased collapse into one row
# (1000 -> 907 rows here). Confirm that this is intended.
#
# Reassign instead of mutating in place and tolerate already-missing columns,
# so re-running this cell does not raise KeyError.
df = df.drop(columns=['User ID','Gender'], errors='ignore').drop_duplicates()

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 907 entries, 0 to 999
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   Age           907 non-null    int64
 1   AnnualSalary  907 non-null    int64
 2   Purchased     907 non-null    int64
dtypes: int64(3)
memory usage: 28.3 KB


In [6]:
df

Unnamed: 0,Age,AnnualSalary,Purchased
0,35,20000,0
1,40,43500,0
2,49,74000,0
3,40,107500,1
4,25,79000,0
...,...,...,...
995,38,59000,0
996,47,23500,0
997,28,138500,1
998,48,134000,1


In [7]:
df['Age'].max()

63

In [8]:
df['AnnualSalary'].max()

152500

In [9]:
#max-min scaling

def max_min_scaler(X):
    """Rescale values linearly so the minimum maps to 0 and the maximum to 1.

    Parameters
    ----------
    X : pandas.Series or numpy.ndarray
        Numeric values to rescale.

    Returns
    -------
    Same container type as ``X``
        ``(X - X.min()) / (X.max() - X.min())``. If all values are equal the
        range is zero; the result is then all zeros instead of NaN.
    """
    lowest = X.min()
    value_range = X.max() - lowest

    # A constant input would divide 0 by 0. Adding the boolean mask turns a
    # zero range into 1 (0 + True == 1) and leaves any non-zero range as is,
    # so the already-zero numerator is returned unchanged.
    value_range = value_range + (value_range == 0)

    X_new = (X - lowest) / value_range

    return X_new

In [10]:
# Work on a feature-only copy: everything except the target column.
df_temp = df.drop(columns='Purchased')

# Names of the feature columns that will be rescaled.
columns = df_temp.columns.tolist()

columns

['Age', 'AnnualSalary']

In [11]:
# Apply the min-max scaler to each feature column in turn.
df_temp[columns] = df_temp[columns].apply(max_min_scaler)

df_temp

Unnamed: 0,Age,AnnualSalary
0,0.377778,0.036364
1,0.488889,0.207273
2,0.688889,0.429091
3,0.488889,0.672727
4,0.155556,0.465455
...,...,...
995,0.444444,0.320000
996,0.644444,0.061818
997,0.222222,0.898182
998,0.666667,0.865455


In [12]:
# Replace the raw feature columns in df with their scaled counterparts.
scaled_columns = ['Age', 'AnnualSalary']
df[scaled_columns] = df_temp[scaled_columns]

df

Unnamed: 0,Age,AnnualSalary,Purchased
0,0.377778,0.036364,0
1,0.488889,0.207273,0
2,0.688889,0.429091,0
3,0.488889,0.672727,1
4,0.155556,0.465455,0
...,...,...,...
995,0.444444,0.320000,0
996,0.644444,0.061818,0
997,0.222222,0.898182,1
998,0.666667,0.865455,1


# Logistic Regression with Gradient Descent

In [13]:
# Target vector: the Purchased labels as a 1-D NumPy array (one entry per row of df).
y_train = df['Purchased'].to_numpy()

y_train.shape

(907,)

In [14]:
# Feature matrix: every column of df except the Purchased label.
df_X = df.drop(columns=['Purchased'])

X_train = df_X.to_numpy()

X_train.shape

(907, 2)

# Sigmoid function:
    
    
    g(z) = 1 / (1 + e^(-z))

In [15]:
def sigmoid(z):
    """Logistic sigmoid ``g(z) = 1 / (1 + e^(-z))``, applied element-wise.

    Parameters
    ----------
    z : scalar, numpy.ndarray or pandas.Series
        Input value(s).

    Returns
    -------
    Same shape as ``z``
        Values in the range 0..1. For ``z >= -709`` the result is exactly what
        the plain formula gives; below that the input is floored so the
        exponential cannot overflow, yielding a tiny positive number
        (about 1e-308) instead of 0.0 plus a RuntimeWarning.
    """
    # exp(709) is about the largest exponential float64 can represent; beyond
    # roughly 709.78 np.exp overflows to inf. The bound is chosen for float64;
    # lower-precision inputs may still overflow earlier.
    z_floored = np.maximum(z, -709.0)

    g = 1 / (1 + np.exp(-z_floored))

    return g

# Logistic Cost Function:
    
        
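$$J(\mathbf{w}, b) = \frac{1}{m} \sum_{i=0}^{m-1} \mathrm{loss}\big(f_{\mathbf{w},b}(\mathbf{x}^{(i)}),\, y^{(i)}\big)$$

$$\mathrm{loss}\big(f_{\mathbf{w},b}(\mathbf{x}^{(i)}),\, y^{(i)}\big) = -y^{(i)} \log\big(f_{\mathbf{w},b}(\mathbf{x}^{(i)})\big) - \big(1 - y^{(i)}\big) \log\big(1 - f_{\mathbf{w},b}(\mathbf{x}^{(i)})\big)$$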

In [16]:
def compute_cost(X, y, w, b):
    """Mean logistic (cross-entropy) loss of the model ``sigmoid(X @ w + b)``.

    Parameters
    ----------
    X : array-like, shape (m, n)
        Feature matrix, one example per row.
    y : array-like, shape (m,)
        Target labels (0 or 1).
    w : array-like, shape (n,)
        Weights.
    b : scalar
        Bias.

    Returns
    -------
    total_cost : float
        ``(1/m) * sum_i [-y_i*log(f_i) - (1 - y_i)*log(1 - f_i)]`` with
        ``f_i = sigmoid(x_i . w + b)``.

    Raises
    ------
    ZeroDivisionError
        If ``X`` has no rows.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)
    w = np.asarray(w, dtype=float)

    m, _ = X.shape

    # Linear score of every example at once.
    z = X @ w + b

    # -log(sigmoid(z)) == log(1 + e^-z) and -log(1 - sigmoid(z)) == log(1 + e^z).
    # Evaluating them with logaddexp avoids log(0) = -inf (and 0 * inf = NaN)
    # when the sigmoid saturates to exactly 0 or 1 for large |z|.
    loss = y * np.logaddexp(0.0, -z) + (1.0 - y) * np.logaddexp(0.0, z)

    total_cost = (1 / m) * loss.sum()

    return total_cost

# Derivatives for gradient descent

In [17]:
def compute_gradient(X, y, w, b): 
    """Gradient of the logistic cost with respect to the bias and the weights.

    Parameters
    ----------
    X : numpy.ndarray, shape (m, n)
        Feature matrix, one example per row.
    y : numpy.ndarray, shape (m,)
        Target labels.
    w : numpy.ndarray, shape (n,)
        Current weights.
    b : scalar
        Current bias.

    Returns
    -------
    dj_db : float
        Mean of ``f_i - y_i`` over all examples.
    dj_dw : numpy.ndarray, shape (n,)
        Mean of ``(f_i - y_i) * x_i`` over all examples.

    Note the return order: bias gradient first, then weight gradient.
    """
    m, _ = X.shape

    # Prediction error f_wb(x^i) - y^i for every example at once; this replaces
    # the per-example / per-feature Python loops with two matrix products.
    error = sigmoid(X @ w + b) - y

    dj_dw = (X.T @ error) / m
    dj_db = error.sum() / m

    return dj_db, dj_dw

# Gradient Descent

In [18]:
def gradient_descent(X, y, w_in, b_in, compute_cost, compute_gradient, alpha, num_iters,
                     max_history=100001):
    """Run batch gradient descent and report progress about ten times.

    Parameters
    ----------
    X, y
        Training data, passed through unchanged to the two callbacks.
    w_in, b_in
        Initial weights and bias; ``w_in`` is copied, not modified.
    compute_cost : callable
        ``compute_cost(X, y, w, b) -> cost``.
    compute_gradient : callable
        ``compute_gradient(X, y, w, b) -> (dj_db, dj_dw)``.
    alpha : float
        Learning rate.
    num_iters : int
        Number of update steps.
    max_history : int, optional
        Maximum number of per-iteration costs kept in the returned history
        (bounds memory on very long runs).

    Returns
    -------
    w, b : final weights and bias.
    J_history : list
        Cost after each of the first ``max_history`` updates.
    """
    J_history = []
    w = copy.deepcopy(w_in)  # work on a copy so the caller's w_in stays untouched
    b = b_in

    # Report roughly every 10% of the run; max(1, ...) keeps the modulo valid.
    report_every = max(1, math.ceil(num_iters / 10))

    for i in range(num_iters):

        dj_db, dj_dw = compute_gradient(X, y, w, b)

        # Both parameters are updated from gradients taken at the same (w, b).
        w = w - alpha * dj_dw
        b = b - alpha * dj_db

        keep = i < max_history
        is_periodic = i % report_every == 0
        is_last = i == num_iters - 1

        # Evaluate the cost whenever it is stored or printed. Once the history
        # is full, reports would otherwise show the stale last stored cost and
        # the final report would never be printed.
        if keep or is_periodic or is_last:
            cost = compute_cost(X, y, w, b)

        if keep:
            J_history.append(cost)

        if is_periodic:
            print(f"Iteration {i}: Cost {cost}")
        elif is_last:
            print(f"Iteration {num_iters}: Cost {cost}")

    return w, b, J_history

In [19]:
# Start from all-zero parameters; one weight per feature column of X_train
# (derived from the data instead of a hard-coded 2).
w_init = np.zeros(X_train.shape[1])
b_init = 0
iterations = 10000
alpha = 0.1

w_final , b_final , J_hist = gradient_descent(
                X_train, y_train, w_init, b_init , compute_cost, compute_gradient, alpha, iterations)

print(f"b,w found by gradient descent: {b_final},{w_final} ")

Iteration 0: Cost 0.6924067368543315
Iteration 1000: Cost 0.46098198479483204
Iteration 2000: Cost 0.41463739351849654
Iteration 3000: Cost 0.3973493186497802
Iteration 4000: Cost 0.3890764045380374
Iteration 5000: Cost 0.3845734849093788
Iteration 6000: Cost 0.38192447991662554
Iteration 7000: Cost 0.3802837994042578
Iteration 8000: Cost 0.37923014454873927
Iteration 9000: Cost 0.37853523446954473
Iteration 10000: Cost 0.3780679746140845
b,w found by gradient descent: -6.437438470669399,[8.36678519 4.12556907] 


# Prediction, Accuracy

In [22]:
def predict(X, w, b): 
    """Classify every row of ``X`` as 0 or 1 with the logistic model.

    Parameters
    ----------
    X : numpy.ndarray, shape (m, n)
        Feature matrix, one example per row.
    w : numpy.ndarray, shape (n,)
        Weights.
    b : scalar
        Bias.

    Returns
    -------
    numpy.ndarray, shape (m,), dtype float
        1.0 where ``sigmoid(x . w + b) >= 0.5``, otherwise 0.0.
    """
    row_count, _ = X.shape

    # One label per row: score the row, squash it, then threshold at 0.5.
    labels = np.fromiter(
        (float(sigmoid(np.dot(row, w) + b) >= 0.5) for row in X),
        dtype=float,
        count=row_count,
    )

    return labels

In [23]:
# Predicted labels for the training set.
p = predict(X_train, w_final, b_final)

# Share of predictions that match the true labels, as a percentage.
train_accuracy = np.mean(p == y_train) * 100
print('Train Accuracy: %f' % train_accuracy)

Train Accuracy: 82.359427


    In conclusion, the model predicts whether a customer purchases a car from age and annual salary with about 82% accuracy.
    Note that this accuracy is measured on the same data the model was trained on, so it may overstate performance on unseen data.