# Logistic Regression on Social Network Ads

## 1: Importing libraries and reading the dataset

In [None]:
# Importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, normalize, StandardScaler

from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the Social Network Ads CSV into a DataFrame and preview its first rows
DATA_PATH = "../input/logistic-regression/Social_Network_Ads.csv"
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Features: every column except "User ID" and the target column "Purchased"
X = df.drop(columns=["User ID", "Purchased"])

# Target as a 1-D Series (single brackets). scikit-learn estimators expect y with
# shape (n_samples,); a one-column DataFrame makes every fit emit a
# DataConversionWarning, which the global warnings filter would otherwise hide.
y = df["Purchased"]

## 2: EDA

Plotting a pairplot to visualize the distribution of each variable and the pairwise relationships between them

In [None]:
# Pairwise plots of every column except "User ID"
pairplot_data = df.drop(columns=["User ID"])
sns.pairplot(pairplot_data)

## 3: Feature Engineering

In [None]:
features_num = ["Age", "EstimatedSalary"]
features_cat = ["Gender"]

preprocessor = ColumnTransformer([("OneHotEncoder", OneHotEncoder(), features_cat),
                                 ("Normalization", StandardScaler(), features_num)], remainder="passthrough")

X = preprocessor.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## 4: Training and Fitting the model

In [None]:
# Baseline: logistic regression with default hyperparameters, fitted on the training split
model = LogisticRegression().fit(X_train, y_train)
model

In [None]:
# Mean accuracy of the baseline model on the held-out test split
test_accuracy = model.score(X_test, y_test)
test_accuracy

## 5: Hyperparameter Tuning


Hyperparameter tuning is an important step in model building: it searches over an estimator's hyperparameters for the combination that maximizes accuracy. We will perform two hyperparameter tuning operations, one after the other:
- RandomizedSearchCV - To arrive at a parameter combination that is likely close to the best one; this search runs quickly and helps us define the parameter grid for GridSearchCV
- GridSearchCV - Run once the RandomizedSearchCV is complete; this is an exhaustive search to find the best parameters


In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

### RandomizedSearchCV

In [None]:
# 1. Defining the search space
# Each dict only lists parameter combinations that are valid together:
#   - "l1" / "l2" are tuned over C
#   - "elasticnet" additionally requires the L1/L2 mixing ratio `l1_ratio`
#   - no penalty ignores C, so it is a single candidate
# NOTE: scikit-learn >= 1.2 spells "no penalty" as None; the string "none" was
# removed in 1.4. On older versions replace None with "none".
C_values = np.logspace(-3, 3, 7)
param_grid = [
    {"penalty": ["l1", "l2"], "C": C_values},
    {"penalty": ["elasticnet"], "C": C_values, "l1_ratio": [0.25, 0.5, 0.75]},
    {"penalty": [None]},
]

# 2. Defining the RandomizedSearchCV class
# The default "lbfgs" solver supports only "l2"/no penalty, so "l1" and "elasticnet"
# candidates would fail to fit and be scored as NaN. "saga" supports every penalty
# above; max_iter is raised so it can converge for weakly regularized candidates.
# random_state is fixed on both the solver and the search for reproducible results.
search_estimator = LogisticRegression(solver="saga", max_iter=5000, random_state=42)
model_rscv = RandomizedSearchCV(search_estimator, param_grid, cv=10, random_state=42)
model_rscv.fit(X_train, y_train)

# 3. Getting the best params and score
print(model_rscv.best_params_)
print(model_rscv.best_score_)

# 4. Getting the predictions for the testing data
# With the default refit=True the best estimator has already been refitted on the
# whole training split, so the search object can predict directly.
y_pred = model_rscv.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

Even after running RandomizedSearchCV, we don't see an improvement in the model's accuracy (~88%), so we will stick with these parameters.