# Machine Learning Project #5 - Bank Marketing Campaign

The dataset records the phone calls made during the direct marketing campaigns of a Portuguese banking institution. It has 20 features and 1 target variable.

**The goal is to classify whether the client will subscribe to a term deposit.**

-------------------------

**Motivation:**

This solution uses **Logistic Regression**, a **binary classification** algorithm, to predict whether the client will subscribe to a term deposit.

-------------------

### Importing the required libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import FeatureHasher
# Remove the cap on displayed columns so wide DataFrames are shown in full.
pd.set_option('display.max_columns', None)

------------------------------

### Importing and viewing the dataset

In [None]:
# Load the campaign data; the CSV uses ';' as its field separator.
bank = pd.read_csv(
    r'bank+marketing (1)\bank-additional\bank-additional\bank-additional-full.csv',
    sep=';',
)

In [None]:
# Preview the first rows of the freshly loaded data.
bank.head()

In [None]:
# Summary statistics, rounded to two decimals for readability.
bank.describe().round(2)

In [None]:
# Print the column dtypes and non-null counts.
bank.info()

---------------

### Checking for outliers

In [None]:
# Flag rows in which at least one numeric column lies outside the
# 1.5 * IQR fences around the first and third quartiles.
numerical_cols = bank.select_dtypes(include='number')

Q1, Q3 = (numerical_cols.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Cell-wise mask first, then collapse it to one flag per row.
potential_outliers = numerical_cols.lt(lower_fence) | numerical_cols.gt(upper_fence)
outliers = potential_outliers.any(axis=1)

print(bank[outliers])

In [None]:
# One box per numeric column on a shared axis, to eyeball the outliers.
fig, ax = plt.subplots(figsize=(10, 6))
numerical_cols.boxplot(ax=ax)
plt.xticks(rotation=45)  # column names are long; tilt them so they do not overlap
ax.set_title('Boxplot of Numerical Columns')
ax.set_xlabel('Columns')
ax.set_ylabel('Values')
plt.show()

-----------------------

### Data Visualization

In [None]:
# Draw a separate histogram for every numeric column.
for column in numerical_cols:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.histplot(bank[column], bins=20, ax=ax)  # Adjust the number of bins as needed
    ax.set_title(f'Histogram of {column}')
    ax.set_xlabel('Values')
    ax.set_ylabel('Frequency')
    plt.show()

----------------

### Correlation between the numerical features

In [None]:
# Pairwise correlation of the numeric features only. At this point `bank`
# still contains string columns, and DataFrame.corr() raises on them in
# pandas >= 2.0 (numeric_only defaults to False), so select the numeric
# columns explicitly; this works on older pandas versions as well.
correlation_matrix = bank.select_dtypes(include='number').corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

---------------------

### Data Imputation & Encoding

##### Removing every row that contains the value 'unknown' in any column

In [None]:
# Keep only the rows in which no column holds the placeholder 'unknown'.
# drop=True discards the old row labels: a plain reset_index() would insert
# them as an extra 'index' column, which would then be carried into the
# feature matrix built from every column except the target.
bank = bank[bank.ne('unknown').all(axis=1)].reset_index(drop=True)

In [None]:
# Preview the data after the 'unknown' rows have been removed.
bank.head()

##### Encoding each categorical variable

In [None]:
# Names of the object-dtype (string) columns, i.e. the categorical variables.
object_columns = bank.select_dtypes(include='object').columns

In [None]:
# Map each categorical column to its number of distinct values.
unique_counts = {column: bank[column].nunique() for column in object_columns}

In [None]:
# Show the cardinality of every categorical column.
print(unique_counts)

In [None]:
# Hash-encode every categorical feature; the target 'y' is skipped here.
# The column list is fixed up front because `bank` is reassigned inside the loop.
categorical_features = [
    name for name in bank.select_dtypes(include='object').columns if name != 'y'
]
for i in categorical_features:
    # One hash bucket per distinct value of the column (distinct values can
    # still collide in the same bucket, as with any hashing scheme).
    h = FeatureHasher(n_features=bank[i].nunique(dropna=False), input_type='string')
    # FeatureHasher expects every sample to be an iterable of strings. A bare
    # string would be hashed character by character (and is rejected outright
    # by recent scikit-learn releases), so wrap each value in a one-item list.
    hashed_Feature = h.fit_transform([[value] for value in bank[i]]).toarray()
    # Name the buckets '<column>_<bucket>' and reuse bank's index so that the
    # concat below lines the new columns up row for row.
    bank1 = pd.DataFrame(
        hashed_Feature,
        index=bank.index,
        columns=[f'{i}_{j}' for j in range(hashed_Feature.shape[1])],
    )
    print(list(bank1.columns))
    bank = pd.concat([bank, bank1], axis=1)

In [None]:
# Encode the target column as integer class labels. The encoder has to be
# created here: LabelEncoder is imported but no instance is defined earlier.
label_encoder = LabelEncoder()
bank['y_encoded'] = label_encoder.fit_transform(bank['y'])

In [None]:
# Preview the data with the encoded columns appended.
bank.head()

In [None]:
# Discard the original string columns now that encoded versions exist,
# leaving a purely numeric frame.
bank = bank.select_dtypes(exclude='object')
bank.head()

---------------

### Splitting the dataset into training and test data

In [None]:
# Separate the encoded target from the feature matrix.
y = bank['y_encoded']
X = bank.drop(columns=['y_encoded'])

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

------------

### Training the model

In [None]:
# Standardise the features before the logistic regression. The raw columns
# live on very different scales, which slows solver convergence and makes the
# default L2 penalty weigh features unevenly. This also matches the scaled
# pipeline evaluated in the cross-validation section, so both scores describe
# the same kind of model. The pipeline exposes fit/predict like the bare estimator.
logreg = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
logreg.fit(X_train, y_train)  # the scaler statistics come from the training rows only

---------------

### Using the model to predict the test dataset

In [None]:
# Predict the class label for every row of the held-out test set.
predictions = logreg.predict(X_test)

In [None]:
# Compute both summaries first, then print them.
accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)

# Share of test rows classified correctly
print(f"Accuracy: {accuracy:.2f}")

# Per-class breakdown of the same predictions
print("\nClassification Report:")
print(report)

------------------

### Calculating the cross-validation score

In [None]:
# Chain scaling and classification so the scaler is re-fitted inside each
# cross-validation fold rather than on the full data.
scaler = StandardScaler()
classifier = LogisticRegression()
logistic_regression_pipeline = make_pipeline(scaler, classifier)

In [None]:
# 10-fold cross-validation with shuffled, seeded folds. Each fold's score is
# the negated mean squared error between predicted and true labels
# (scikit-learn negates error metrics so that higher is always better).
cv_folds = KFold(n_splits=10, shuffle=True, random_state=0)
mse_logistic = cross_val_score(
    logistic_regression_pipeline,
    X,
    y,
    cv=cv_folds,
    scoring='neg_mean_squared_error',
)

In [None]:
# Undo the sign flip of the 'neg_' scorer to report the mean squared error
# across the folds. NOTE(review): if the labels are 0/1, this equals the
# misclassification rate (1 - accuracy).
-(mse_logistic.mean())