In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the full path of every file below the Kaggle input directory, then print one per line.
input_file_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to every figure drawn after this point.
# set_theme() is the documented entry point; sns.set() is only an alias for it.
sns.set_theme()

import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the Social_Network_Ads CSV from the attached Kaggle input and preview the first rows.
ads_csv_path = '../input/logistic-regression/Social_Network_Ads.csv'
raw_data = pd.read_csv(ads_csv_path)
raw_data.head()

In [None]:
# Summary statistics for every column; include='all' also covers the non-numeric ones.
raw_data.describe(include='all')

In [None]:
# Print each column's dtype and non-null count.
raw_data.info()

## Remove User ID

In [None]:
# Drop the "User ID" column into a new frame so raw_data itself stays untouched.
# `columns=` already identifies the axis, so the redundant `axis=1` is not passed.
data_no_userid = raw_data.drop(columns=["User ID"])
data_no_userid.describe(include='all')

In [None]:
# Scatter Age against EstimatedSalary, with points coloured by the Purchased label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=data_no_userid, x="EstimatedSalary", y="Age", hue="Purchased", ax=ax)
plt.show()

In [None]:
# Histogram of Age with a kernel density estimate drawn on top.
sns.histplot(data_no_userid, x="Age", kde=True)
plt.show()

In [None]:
# Histogram of EstimatedSalary with a kernel density estimate drawn on top.
sns.histplot(data_no_userid, x="EstimatedSalary", kde=True)
plt.show()

## Map Gender Male to 1 and Female to 0

In [None]:
# List the distinct Gender labels before encoding them.
data_no_userid["Gender"].unique()

In [None]:
# Encode Gender as a numeric feature: Male -> 1, Female -> 0.
# One explicit mapping keeps the chosen codes visible in the notebook.
gender_codes = {'Male': 1, 'Female': 0}
data_with_dummies = data_no_userid.copy()
data_with_dummies["Gender"] = data_with_dummies["Gender"].map(gender_codes)

# .map() silently turns any label missing from gender_codes into NaN, so fail loudly here.
assert data_with_dummies["Gender"].notna().all(), "unexpected Gender label"
data_with_dummies.info()

In [None]:
# Total number of purchases per gender, shown as a bar chart.
groupbyGender = data_no_userid.groupby("Gender", as_index=False)["Purchased"].sum()
sns.barplot(data=groupbyGender, x="Gender", y="Purchased")
plt.show()

In [None]:
# Histogram of the natural log of EstimatedSalary.
# np.log keeps the Series name, so seaborn would label the axis "EstimatedSalary";
# the label is set explicitly so the plot states that the values are logged.
log_salary_ax = sns.histplot(x=np.log(data_with_dummies["EstimatedSalary"]), kde=True)
log_salary_ax.set_xlabel("log(EstimatedSalary)")
plt.show()

## Define Independent & Dependent Variables

In [None]:
# Target: the Purchased column. Features: every remaining column.
# `columns=` already identifies the axis, so the redundant `axis=1` is not passed.
y = data_with_dummies["Purchased"]
x1 = data_with_dummies.drop(columns=["Purchased"])

## Using statsmodels Logit Regression

In [None]:
# Fit a statsmodels logit model on the unscaled features plus a constant (intercept) column,
# then show the fitted model's summary.
x = sm.add_constant(x1)
reg_log = sm.Logit(endog=y, exog=x)
results_log = reg_log.fit()
results_log.summary()

## Check accuracy

In [None]:
# In-sample accuracy from the statsmodels prediction table.
# pred_table() is evaluated once and reused rather than recomputed for every lookup.
# Per the statsmodels docs, entry [i, j] counts rows observed as class i and predicted
# as class j, so the diagonal holds the correct predictions.
prediction_table = results_log.pred_table()
pred_corr = prediction_table[0, 0] + prediction_table[1, 1]
pred_incorr = prediction_table[0, 1] + prediction_table[1, 0]
total = prediction_table.sum()

accuracy = pred_corr / total * 100
print("Accuracy of the model is %.2f" % accuracy + '%')

## Check Logit Regression after Min-Max Scaling

In [None]:
# Rebuild the target and the unscaled feature table that the scaling step starts from.
# `columns=` already identifies the axis, so the redundant `axis=1` is not passed.
y = data_with_dummies["Purchased"]
x1 = data_with_dummies.drop(columns=["Purchased"])

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature column to the [0, 1] range; the scaler is fitted on all rows of x1.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(x1)
x1_scaled = scaler.transform(x1)

In [None]:
# Preview only the first rows of the scaled features instead of dumping the whole matrix.
x1_scaled[:5]

In [None]:
# Refit the logit model on the min-max scaled features.
# The scaled features are wrapped with x1's column names and index so the summary reports
# the original feature names instead of generic placeholders, and rows stay aligned with y.
x = sm.add_constant(pd.DataFrame(x1_scaled, columns=x1.columns, index=x1.index))
reg_log = sm.Logit(y, x)
results_log = reg_log.fit()
results_log.summary()

## Accuracy

In [None]:
# In-sample accuracy of the most recently fitted statsmodels logit model.
# pred_table() is evaluated once and reused rather than recomputed for every lookup.
# Per the statsmodels docs, entry [i, j] counts rows observed as class i and predicted
# as class j, so the diagonal holds the correct predictions.
prediction_table = results_log.pred_table()
pred_corr = prediction_table[0, 0] + prediction_table[1, 1]
pred_incorr = prediction_table[0, 1] + prediction_table[1, 0]
total = prediction_table.sum()

accuracy = pred_corr / total * 100
print("Accuracy of the model is %.2f" % accuracy + '%')

## Split dataset in Train & Test

In [None]:
# Hold out 20% of the rows, then fit the scaler on the training rows only.
# Splitting the already-scaled x1_scaled would let the test rows' min/max influence the
# scaling seen during training (data leakage). The split is therefore done on the unscaled
# x1, and the test rows are transformed with the scaling learned from the training rows.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x1, y, test_size=0.2, random_state=42
)

split_scaler = MinMaxScaler(feature_range=(0, 1))
x_train = split_scaler.fit_transform(x_train_raw)
x_test = split_scaler.transform(x_test_raw)

model = LogisticRegression()
model.fit(x_train, y_train)

In [None]:
# Predict a class label for each held-out test row and display the predictions.
y_pred = model.predict(x_test)
y_pred

In [None]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [None]:
# Test-set accuracy computed by hand from the confusion matrix of the held-out predictions.
cm = confusion_matrix(y_test, y_pred)
total = cm.sum()
pred_incorr = cm[0, 1] + cm[1, 0]
pred_corr = cm[0, 0] + cm[1, 1]

accuracy = pred_corr / total * 100
accuracy_message = "Accuracy of the model is %.2f" % accuracy + '%'
print(accuracy_message)

In [None]:
# Cross-check the accuracy with scikit-learn's accuracy_score on the same predictions.
acc_score = accuracy_score(y_true=y_test, y_pred=y_pred)
acc_percent = acc_score * 100
print("Accuracy score is %.2f" % acc_percent + '%')

In [None]:
# classification_report returns one multi-line string; print it so the table renders with
# real line breaks instead of the escaped repr a bare expression would show.
print(classification_report(y_test, y_pred))