# Implementing logistic regression using Python

**Importing Libraries**

In [82]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

**Importing the csv dataset using pandas**

In [83]:
df=pd.read_csv("C:\\Users\\91810\\OneDrive\\Pictures\\Desktop\\Intro to DS\\Social_Network_Ads.csv")

## Analysing the dataset

In [84]:
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [85]:
df.shape

(400, 5)

In [86]:
##datatype in each column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [87]:
#df.isnull()
df.isnull().sum()

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

In [88]:
df.nunique()

User ID            400
Gender               2
Age                 43
EstimatedSalary    117
Purchased            2
dtype: int64

## Preprocesssing data and Regression Models

### From Scratch

In [103]:
#Splitting dataset
X_train, X_test, Y_train, Y_test = train_test_split(df['Age'], df['Purchased'], test_size=0.30)

**Normalisation**

In [104]:
def normalize(X):
    return X - X.mean()

**Standardisation**

In [129]:
def standardize(X):
    mean_X = np.mean(X)
    std_X = np.std(X)

    standardized_X = (X - mean_X) / std_X
    return standardized_X

**Logistic Regression Model**

In [130]:
def predict(X, b0, b1):
    return np.array([1 / (1 + np.exp(-1*b0 + -1*b1*x)) for x in X])

In [131]:
def logistic_regression(X, Y):
    
    m=0
    c=0
    
    lr = 0.001
    epochs = 10000

    for epoch in range(epochs):
        y_pred = predict(X, c, m)
        
        dc = -2 * sum((Y - y_pred) * y_pred * (1 - y_pred))
        dm = -2 * sum(X * (Y - y_pred) * y_pred * (1 - y_pred))
        
        c = c - lr* dc
        m = m - lr * dm
    
    return c, m

**Accuracy of the model**

In [134]:
def accuracy_model(test_data_x, test_data_y, c, m):
    accuracy = 0
    
    y_pred = predict(test_data_x, c, m)
    y_pred = [1 if p >= 0.5 else 0 for p in y_pred]
    
    for i in range(len(y_pred)):
        if y_pred[i] == test_data_y.iloc[i]:
            accuracy += 1
    return accuracy*100 / len(y_pred)

**Training the model**

In [137]:
c1, m1= logistic_regression(X_train, Y_train)
c2, m2= logistic_regression(normalize(X_train), Y_train)
c3, m3= logistic_regression(standardize(X_train), Y_train)

**Accuracy Calculation**

In [142]:
print("Accuracies of models:\n")
print("Using raw data:", round(accuracy_model(X_test, Y_test, c1, m1), 4))
print("Using normalised data:", round(accuracy_model(normalize(X_test), Y_test, c2, m2),4))
print("Using standardised data:", round(accuracy_model(standardize(X_test), Y_test, c3, m3),4))

Accuracies of models:

Using raw data: 61.6667
Using normalised data: 84.1667
Using standardised data: 84.1667


### Using Scikit-learn

**Splitting the dataset into train and test set**

In [143]:
#original dataset
x = df[['Age', 'EstimatedSalary']].values 
y = df['Purchased']

#splitting the data set
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3, random_state=0)

**Normalisation of the dataset**

In [144]:
norm = MinMaxScaler().fit(train_x)

train_norm_x= norm.transform(train_x)
test_norm_x = norm.transform(test_x)

**Standardisation of the dataset**

In [145]:
scaler = StandardScaler() 

x_train = scaler.fit_transform(train_x) 
x_test = scaler.fit_transform(test_x)

## Logistic Regression Models

**Logistic regression model for raw data**

In [146]:
model_1 = LogisticRegression(max_iter=10000, random_state=0) 
model_1.fit(train_x, train_y)

predictions_1 = model_1.predict(test_x)

#accuracy of the model
accuracy_1 = accuracy_score(test_y, predictions_1)

#print("Accuracy:", f'{accuracy_1*100}%')

**Logistic regression model for normalised data**

In [147]:
model_2 = LogisticRegression(max_iter=10000, random_state=0) 
model_2.fit(train_norm_x, train_y)

predictions_2 = model_2.predict(test_norm_x)

accuracy_2 = accuracy_score(test_y, predictions_2)

#print("Accuracy:", f'{accuracy_2*100}%')

**Logistic regression model for standardised data**

In [148]:
model_3 = LogisticRegression(max_iter=10000, random_state=0) 
model_3.fit(x_train, train_y)

predictions_3 = model_3.predict(x_test)

accuracy_3 = accuracy_score(test_y, predictions_3)

#print("Accuracy:", f'{accuracy_3*100}%')

## Results

In [149]:
print("Accuracies of models:\n")
print("Using raw data:", f'{accuracy_1*100:.4f}%')
print("Using normalised data:", f'{accuracy_2*100:.4f}%')
print("Using standardised data:", f'{accuracy_3*100:.4f}%')

Accuracies of models:

Using raw data: 65.8333%
Using normalised data: 84.1667%
Using standardised data: 89.1667%
