# SUV Purchase Decision

## Goal

The goal of this notebook is to predict whether an SUV will be purchased or not, based on:
- gender (Male or Female)
- age
- estimated salary

## Setup

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os # file system handling
import seaborn as sns # plotting
import matplotlib.pyplot as plt # plots handling

# Walk the Kaggle input directory and print the full path of every file found,
# so the available datasets are visible in the notebook output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

## Load the data

In [None]:
# Read the SUV purchase CSV into a DataFrame (path is relative to the notebook's working directory)
data = pd.read_csv('../input/suv-purchase-decision/SUV_Purchase.csv')

In [None]:
# Preview the first rows to check the columns and their values
data.head()

## Basic insights

In [None]:
# Dataset size and completeness
n_rows, n_cols = data.shape
print(f"There are {n_rows} rows and {n_cols} columns.")
print(f"There are {data.isna().sum().sum()} missing values.")

# Class balance of the target.
# Purchased == 1 is taken to mean "bought the SUV" and Purchased == 0 "did not"
# (assumed from the column name); each share is paired with the matching
# wording so the two percentages are not reported the wrong way round.
purchased_pct     = data.query('Purchased == 1').shape[0] / n_rows * 100
not_purchased_pct = data.query('Purchased == 0').shape[0] / n_rows * 100
print(f"{purchased_pct}% of people purchased an SUV while {not_purchased_pct}% did not.")

___
# Data visualization

In [None]:
# Base figure dimensions passed as `figsize` by the plotting helpers in this cell
WIDTH  = 20
HEIGHT = 6

def plot_vs(frame, col1, col2, hue):
    """Show a scatter plot of two columns, coloured by a third one.

    Parameters
    ----------
    frame : DataFrame holding the columns to plot.
    col1  : name of the column drawn on the x axis.
    col2  : name of the column drawn on the y axis.
    hue   : name of the column used to colour the points.
    """
    # Scatter plots use a figure twice as tall as the base height
    figure_size = (WIDTH, HEIGHT * 2)
    heading = f"Scatter plot | {col1} vs {col2}"

    plt.figure(figsize=figure_size)
    axes = sns.scatterplot(x=col1, y=col2, hue=hue, data=frame)
    axes.set_title(heading)
    plt.show()
    
def boxplot_cat(frame, col1, hue):
    """Show horizontal box plots of `col1`, grouped by the `hue` column.

    Parameters
    ----------
    frame : DataFrame holding the columns to plot.
    col1  : name of the column whose distribution is drawn along the x axis.
    hue   : name of the grouping column, placed on the y axis.
    """
    heading = f"Box plot | {col1} by {hue}"

    plt.figure(figsize=(WIDTH, HEIGHT))
    axes = sns.boxplot(x=col1, y=hue, orient="h", data=frame)
    axes.set_title(heading)
    plt.show()

In [None]:
# Age vs. estimated salary, coloured first by gender and then by purchase outcome
for colour_by in ('Gender', 'Purchased'):
    plot_vs(data, 'Age', 'EstimatedSalary', colour_by)

In [None]:
# Distribution of each numeric column, split by gender and then by purchase outcome
for numeric_col in ('Age', 'EstimatedSalary'):
    for group_col in ('Gender', 'Purchased'):
        boxplot_cat(data, numeric_col, group_col)

___

# Modelling

## Quick pre-processing

In [None]:
# Rows treated as outliers: non-buyers (Purchased == 0) with a high salary or a high age
high_salary_non_buyer = (data["EstimatedSalary"] > 120000) & (data["Purchased"] == 0)
older_non_buyer       = (data["Age"] > 55) & (data["Purchased"] == 0)

# Keep everything else in a separate copy so that `data` itself stays untouched
train = data.loc[~(high_salary_non_buyer | older_non_buyer)].copy()

# Gender : Binary encoding
gender = {"Male":0, "Female":1}
train  = train.replace({"Gender":gender})

In [None]:
# Columns fed to the model as inputs
features = ['Gender', 'EstimatedSalary', 'Age']
# Column the model learns to predict
target   = 'Purchased'

## Model

In [None]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


# Prepare features and target
X = train[features]
y = train[target]

# Model training
# The features live on very different scales (a 0/1 gender flag, ages in the
# tens, salaries in the tens of thousands). The regularisation penalty of the
# logistic regression is scale-sensitive and the solver converges poorly on
# such inputs, so the features are standardised before the classifier.
# Keeping the scaler inside the pipeline means `clf.predict` applies the same
# transformation to whatever frame it is given later on.
# NOTE: the scaler is fitted on all of X before the internal 5-fold CV runs.
clf = make_pipeline(
    StandardScaler(),
    LogisticRegressionCV(cv=5, random_state=42),
)

clf = clf.fit(X, y)

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

# NOTE: X / y are the same rows the model was fitted on, so these are
# training scores, not an estimate of performance on unseen data.

# Hard class labels for the threshold-based metrics (accuracy, F1)
y_pred = clf.predict(X)

# ROC AUC ranks observations by score, so it needs the predicted probability
# of the positive class (second column) rather than the 0/1 labels; feeding
# it labels collapses the curve to a single operating point.
y_score = clf.predict_proba(X)[:, 1]

print(f"Accuracy  : {accuracy_score(y, y_pred)}")
print(f"ROC score : {roc_auc_score(y, y_score)}")
print(f"F1  score : {f1_score(y, y_pred)}")

# Prediction visualization

In [None]:
# Encode gender the same way as for training, then score every row of the
# full dataset (including the rows dropped from `train`).
encoded_data = data.replace({"Gender":gender})
data['prediction'] = clf.predict(encoded_data[features])

In [None]:
# Same scatter twice: actual purchase outcome first, model prediction second
for colour_by in ('Purchased', 'prediction'):
    plot_vs(data, 'Age', 'EstimatedSalary', colour_by)

# Conclusion

I hope you enjoyed this simple logistic regression on the SUV purchase data!