# Task for Today  

***

## Customer Behavior Prediction  

Given *data about customers*, let's try to predict whether a given customer will **purchase** the product being offered.

We will use a logistic regression model to make our predictions.

# Getting Started

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

In [None]:
# Load the customer behaviour CSV into a DataFrame
data = pd.read_csv('../input/customer-behaviour/Customer_Behaviour.csv')

In [None]:
# Preview the loaded DataFrame (rendered as this cell's output)
data

In [None]:
# Print a summary of the DataFrame's columns, dtypes and non-null counts
data.info()

# Preprocessing

In [None]:
def _add_threshold_features(X, income_threshold, old_age_threshold, young_age_threshold):
    """Return a copy of X with the three binary indicator columns appended."""
    return X.assign(**{
        'High Income': (X['EstimatedSalary'] >= income_threshold).astype(int),
        'Old Age': (X['Age'] >= old_age_threshold).astype(int),
        'Young Age': (X['Age'] <= young_age_threshold).astype(int),
    })


def preprocess_inputs(df, engineer_features=False):
    """Turn the raw customer DataFrame into scaled train/test splits.

    Steps: drop the 'User ID' column, binary-encode 'Gender', split into
    features and the 'Purchased' target, perform a 70/30 shuffled
    train/test split (random_state=1), optionally add indicator features,
    and standardize the features with a scaler fitted on the training
    split only.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'User ID', 'Gender', 'Age',
        'EstimatedSalary' and 'Purchased'. The input is not modified.
    engineer_features : bool, default False
        If truthy, add three 0/1 columns: 'High Income' (salary at or above
        the 95th percentile), 'Old Age' (age at or above the 75th
        percentile) and 'Young Age' (age at or below the 25th percentile).
        The percentiles are computed from the training split only.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Standardized feature frames that keep the original row index.
    y_train, y_test : pandas.Series
        The matching 'Purchased' targets.
    """
    df = df.copy()

    # Drop User ID column
    df = df.drop('User ID', axis=1)

    # Binary encode. map() yields a numeric column directly instead of
    # relying on replace()'s implicit downcasting; any value other than
    # 'Female'/'Male' becomes NaN.
    df['Gender'] = df['Gender'].map({'Female': 0, 'Male': 1})

    # Split df into X and y
    y = df['Purchased']
    X = df.drop('Purchased', axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.7, shuffle=True, random_state=1
    )

    # Feature engineering. Thresholds come from the training rows only so
    # that no information about the test set leaks into the features.
    if engineer_features:
        income_threshold = X_train['EstimatedSalary'].quantile(0.95)
        old_age_threshold = X_train['Age'].quantile(0.75)
        young_age_threshold = X_train['Age'].quantile(0.25)

        X_train = _add_threshold_features(
            X_train, income_threshold, old_age_threshold, young_age_threshold
        )
        X_test = _add_threshold_features(
            X_test, income_threshold, old_age_threshold, young_age_threshold
        )

    # Scale X (fit on the training split only, then apply to both splits)
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test

# Training/Results (No Feature Engineering)

In [None]:
# Build the scaled train/test splits without the engineered indicator columns
X_train, X_test, y_train, y_test = preprocess_inputs(data, engineer_features=False)
# Show the training features as this cell's output
X_train

In [None]:
# Fit a default logistic regression on the training split
# (fit() returns the estimator itself, so the calls can be chained).
model = LogisticRegression().fit(X_train, y_train)

# Accuracy on the held-out test split, as a fraction in [0, 1]
acc = model.score(X_test, y_test)

print("Test Accuracy: {:.3f}%".format(100 * acc))

# Training/Results (Feature Engineering)

In [None]:
# Rebuild the scaled train/test splits, this time adding the engineered
# 'High Income', 'Old Age' and 'Young Age' indicator columns
X_train, X_test, y_train, y_test = preprocess_inputs(data, engineer_features=True)
# Show the training features (now including the extra columns) as this cell's output
X_train

In [None]:
# Fit a default logistic regression on the training split
# (fit() returns the estimator itself, so the calls can be chained).
model = LogisticRegression().fit(X_train, y_train)

# Accuracy on the held-out test split, as a fraction in [0, 1]
acc = model.score(X_test, y_test)

print("Test Accuracy: {:.3f}%".format(100 * acc))

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/baRbSngFHKw