# Getting Started

In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

In [None]:
# Load the bank marketing dataset; fields in this file are separated by ';'.
data = pd.read_csv('../input/bank-direct-marketing/bank-full.csv', sep=';')

In [None]:
data

In [None]:
data.info()

# Preprocessing

In [None]:
# Separate the feature matrix from the target column 'y'.
X = data.drop(columns='y')
y = data['y']

In [None]:
def get_categorical_features(df):
    """Return the names of the columns of ``df`` whose dtype is ``object``.

    The names are listed in the same order as ``df.columns``.
    """
    object_columns = []
    for name in df.columns:
        if df[name].dtype == 'object':
            object_columns.append(name)
    return object_columns

In [None]:
get_categorical_features(X)

In [None]:
def get_uniques(df, columns):
    """Map each name in ``columns`` to a list of that column's distinct values.

    The values of each list are those returned by ``Series.unique()`` for the
    column, in the order it reports them.
    """
    uniques_by_column = {}
    for name in columns:
        distinct_values = df[name].unique()
        uniques_by_column[name] = list(distinct_values)
    return uniques_by_column

In [None]:
get_uniques(X, get_categorical_features(X))

## Missing Values

In [None]:
# Treat the literal string 'unknown' as a missing value.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
X = X.replace('unknown', np.nan)

In [None]:
X.isna().sum()

In [None]:
# Discard the 'poutcome' feature entirely.
# NOTE(review): presumably because most of its values are missing — confirm
# against the per-column missing counts printed by the previous cell.
X = X.drop(columns='poutcome')

## Encoding

In [None]:
get_uniques(X, get_categorical_features(X))

In [None]:
# Yes/no features, converted to 1/0 by binary_encode.
binary_features = ['default', 'housing', 'loan']

# Features with a natural order, converted to integer ranks by ordinal_encode.
ordinal_features = ['education', 'month']

# Unordered categorical features, expanded into indicator columns by onehot_encode.
nominal_features = ['job', 'marital', 'contact']

In [None]:
def binary_encode(df, columns, positive_label):
    """Return a copy of ``df`` with each listed column turned into 0/1 flags.

    A value becomes 1 when it equals ``positive_label`` and 0 otherwise, so
    missing values also end up as 0. The input frame is left unmodified.
    """
    def to_flag(value):
        if value == positive_label:
            return 1
        return 0

    encoded = df.copy()
    for name in columns:
        encoded[name] = encoded[name].apply(to_flag)
    return encoded

In [None]:
# Encode 'yes' as 1 and every other value (including NaN) as 0.
X = binary_encode(X, binary_features, 'yes')

In [None]:
def ordinal_encode(df, columns, orderings):
    """Return a copy of ``df`` with ordered categories replaced by their rank.

    Args:
        df: Source DataFrame; it is not modified.
        columns: Names of the columns to encode.
        orderings: One sequence per entry in ``columns`` (paired by position),
            giving that column's categories from lowest to highest.

    Returns:
        A new DataFrame in which every non-missing value of each listed
        column is replaced by its zero-based position in the matching
        ordering. Missing values (``NaN``, ``None``, ``pd.NA``) become ``NaN``.

    Raises:
        ValueError: If a non-missing value is absent from its ordering.
    """
    def encode_value(value, ordering):
        # pd.isna recognises every missing-value marker; comparing str(value)
        # against 'nan' would miss None/pd.NA and would also mistake a real
        # 'nan' category for a missing entry.
        if pd.isna(value):
            return float('nan')
        return ordering.index(value)

    df = df.copy()
    for column, ordering in zip(columns, orderings):
        df[column] = df[column].apply(encode_value, args=(ordering,))
    return df

In [None]:
# Category orderings from lowest to highest; a value's position in its list
# becomes its integer code.
education_ordering = ['primary', 'secondary', 'tertiary']

month_ordering = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']

# Must follow the same order as ordinal_features, because ordinal_encode pairs
# columns with orderings by position.
orderings = [education_ordering, month_ordering]


X = ordinal_encode(X, ordinal_features, orderings)

In [None]:
def onehot_encode(df, columns):
    """Return a copy of ``df`` with each listed column replaced by indicator columns.

    For every name in ``columns`` the indicator columns produced by
    ``pd.get_dummies`` (named after the category values, without a prefix)
    are appended on the right and the source column is removed. The input
    frame is left unmodified.
    """
    result = df.copy()
    for name in columns:
        indicators = pd.get_dummies(result[name])
        result = pd.concat([result, indicators], axis=1).drop(name, axis=1)
    return result

In [None]:
# Replace each nominal feature with one indicator column per category.
X = onehot_encode(X, nominal_features)

In [None]:
X

In [None]:
X.isna().sum()

In [None]:
# Impute the missing education ranks with the median of the column.
education_median = X['education'].median()
X['education'] = X['education'].fillna(education_median)

## Scaling

In [None]:
# Standardize every feature column with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split performed later in this notebook, so test-set statistics leak into
# training; consider fitting on the training portion only.
scaler = StandardScaler()

# Pass the original index so the scaled frame keeps X's row labels instead of
# silently receiving a fresh RangeIndex.
X = pd.DataFrame(scaler.fit_transform(X), index=X.index, columns=X.columns)

In [None]:
# Convert the target labels into integer class codes.
label_encoder = LabelEncoder().fit(y)

y = label_encoder.transform(y)

In [None]:
X

In [None]:
y

In [None]:
# Keep 70% of the rows for training and hold out the rest for evaluation.
# A fixed random_state makes the shuffle, and therefore the reported accuracy,
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

# Training

In [None]:
# Fit a logistic regression classifier, constructed with no arguments, on the
# training split.
model = LogisticRegression()

model.fit(X_train, y_train)

In [None]:
# Evaluate on the held-out test split; for classifiers, score() reports the
# mean accuracy.
model_acc = model.score(X_test, y_test)
print("Model Accuracy:", model_acc)

# Data Every Day

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.

***

Check it out!
https://youtu.be/_IJ99hSxny8