### Import packages

In [63]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

### Import Data

In [93]:
data = pd.read_csv("data/Social_Network_Ads.csv")

In [94]:
data.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


### Exploratory Data Analysis

* Here target variable is `Purchased`.

In [95]:
data["Purchased"].value_counts()

0    257
1    143
Name: Purchased, dtype: int64

* It has 2 values, so indeed we are working with binary classification

#### Handle missing values

In [96]:
data.isna().sum()

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

* We are good, there are no missing values

#### Get useful features only

* User ID is not relevant feature for prediction, lets get rid of it.

In [97]:
data.drop(["User ID"],axis=1,inplace=True)

In [98]:
data.head()

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,Male,19,19000,0
1,Male,35,20000,0
2,Female,26,43000,0
3,Female,27,57000,0
4,Male,19,76000,0


In [99]:
gender = pd.get_dummies(data["Gender"], drop_first=True)

In [100]:
gender.head()

Unnamed: 0,Male
0,1
1,1
2,0
3,0
4,1


In [101]:
data.drop(["Gender"],axis=1, inplace=True)

In [102]:
data = pd.concat([data, gender], axis=1)

In [103]:
data.head()

Unnamed: 0,Age,EstimatedSalary,Purchased,Male
0,19,19000,0,1
1,35,20000,0,1
2,26,43000,0,0
3,27,57000,0,0
4,19,76000,0,1


In [104]:
X = data[["Age", "EstimatedSalary", "Male"]]

In [105]:
y = data["Purchased"]

In [106]:
X.shape

(400, 3)

In [107]:
y.shape

(400,)

In [108]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.25)

In [109]:
X_train.shape

(300, 3)

In [110]:
X_test.shape

(100, 3)

### Feature Scaling

In [111]:
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  This is separate from the ipykernel package so we can avoid doing imports until


In [112]:
X_train[:5]

array([[-0.09447558, -0.52143383,  1.01342342],
       [-1.25526769,  0.25254712,  1.01342342],
       [-0.19120826, -0.52143383,  1.01342342],
       [ 0.0022571 , -0.26344018,  1.01342342],
       [ 0.96958385,  2.0298367 ,  1.01342342]])

In [113]:
LogReg = LogisticRegression()

In [114]:
LogReg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [115]:
y_pred = LogReg.predict(X_test)

In [116]:
y_pred[:10]

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 0], dtype=int64)

In [117]:
confusion_matrix(y_test, y_pred)

array([[63,  8],
       [10, 19]], dtype=int64)