In [1]:
import os
from pathlib import Path

import pandas as pd

from analysis_src.basic_analysis import BasicDataInspection

In [2]:
# Directory holding the CSV files. Defaults to the original location; set the
# BANK_MARKETING_DATA_DIR environment variable to run the notebook on another
# machine without editing this cell.
DATA_DIR = Path(os.environ.get('BANK_MARKETING_DATA_DIR', '/home/sarath_kumar/Bank_Marketing/data'))

# Project helper that bundles the inspection routines used in the cells below.
basic_info = BasicDataInspection()

# Load the raw bank-marketing data and show its column / dtype / non-null summary.
df = pd.read_csv(DATA_DIR / 'bank_marketing.csv')
basic_info.inspect_data(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   age          45211 non-null  int64 
 1   job          44923 non-null  object
 2   marital      45211 non-null  object
 3   education    43354 non-null  object
 4   default      45211 non-null  object
 5   balance      45211 non-null  int64 
 6   housing      45211 non-null  object
 7   loan         45211 non-null  object
 8   contact      32191 non-null  object
 9   day_of_week  45211 non-null  int64 
 10  month        45211 non-null  object
 11  duration     45211 non-null  int64 
 12  campaign     45211 non-null  int64 
 13  pdays        45211 non-null  int64 
 14  previous     45211 non-null  int64 
 15  poutcome     8252 non-null   object
 16  y            45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [3]:
# Show the number of missing values in each column (project helper).
basic_info.check_null_values(df)

age                0
job              288
marital            0
education       1857
default            0
balance            0
housing            0
loan               0
contact        13020
day_of_week        0
month              0
duration           0
campaign           0
pdays              0
previous           0
poutcome       36959
y                  0
dtype: int64

# Dataset Overview

The dataset contains **17 columns** with a total of **45,211 entries**. Below is a summary of the dataset:

## Numerical Columns
- <span style="color: orange;">age</span>, <span style="color: orange;">balance</span>, <span style="color: orange;">day_of_week</span>, <span style="color: orange;">duration</span>, <span style="color: orange;">campaign</span>, <span style="color: orange;">pdays</span>, <span style="color: orange;">previous</span>

## Categorical Columns
- <span style="color: orange;">job</span>, <span style="color: orange;">marital</span>, <span style="color: orange;">education</span>, <span style="color: orange;">default</span>, <span style="color: orange;">housing</span>, <span style="color: orange;">loan</span>, <span style="color: orange;">contact</span>, <span style="color: orange;">month</span>, <span style="color: orange;">poutcome</span>, <span style="color: orange;">y</span>

## Key Insights
- **Missing values** occur in four columns: <span style="color: orange;">job</span> (288), <span style="color: orange;">education</span> (1,857), <span style="color: orange;">contact</span> (13,020), and <span style="color: orange;">poutcome</span> (36,959).
- The target column is <span style="color: orange;">y</span>, which is categorical.
- Mixed data types make preprocessing essential for modeling.


In [4]:
# Show the duplicate count reported by the project helper (0 means none were found).
basic_info.check_duplicate_values(df)

np.int64(0)

#### There is no duplicate data in the dataset

In [5]:
# Show count / unique / top / freq for every categorical (object) column.
basic_info.categorical_statistical_summary(df)

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome,y
count,44923,45211,43354,45211,45211,45211,32191,45211,8252,45211
unique,11,3,3,2,2,2,2,12,3,2
top,blue-collar,married,secondary,no,yes,no,cellular,may,failure,no
freq,9732,27214,23202,44396,25130,37967,29285,13766,4901,39922


# Summary of Categorical Columns

| Column      | Count  | Unique | Most Frequent Value | Frequency of Top Value |
|-------------|--------|--------|----------------------|-------------------------|
| <span style="color: orange;">job</span>      | 44,923 | 11     | blue-collar           | 9,732                   |
| <span style="color: orange;">marital</span>  | 45,211 | 3      | married               | 27,214                  |
| <span style="color: orange;">education</span>| 43,354 | 3      | secondary             | 23,202                  |
| <span style="color: orange;">default</span>  | 45,211 | 2      | no                    | 44,396                  |
| <span style="color: orange;">housing</span>  | 45,211 | 2      | yes                   | 25,130                  |
| <span style="color: orange;">loan</span>     | 45,211 | 2      | no                    | 37,967                  |
| <span style="color: orange;">contact</span>  | 32,191 | 2      | cellular              | 29,285                  |
| <span style="color: orange;">month</span>    | 45,211 | 12     | may                   | 13,766                  |
| <span style="color: orange;">poutcome</span> | 8,252  | 3      | failure               | 4,901                   |
| <span style="color: orange;">y</span>        | 45,211 | 2      | no                    | 39,922                  |

## Insights
- The **month** column has the highest number of unique values among the categorical features (12), followed by **job** (11).
- Most customers are **married** and have **secondary education**.
- **May** is the most common contact month, with 13,766 of the 45,211 interactions (about 30%).
- The target column <span style="color: orange;">y</span> has two categories: "yes" and "no," with "no" being significantly more frequent.


In [6]:
# Show count / mean / std / min / quartiles / max for every numerical column.
basic_info.numerical_statistical_summary(df)

Unnamed: 0,age,balance,day_of_week,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


# Summary of Numerical Columns

| Column             | Count      | Mean        | Std Dev    | Min       | 25%       | 50%       | 75%       | Max       |
|---------------------|------------|-------------|------------|-----------|-----------|-----------|-----------|-----------|
| <span style="color: orange;">age</span>             | 45,211     | 40.94      | 10.62      | 18        | 33        | 39        | 48        | 95        |
| <span style="color: orange;">balance</span>         | 45,211     | 1,362.27   | 3,044.77   | -8,019    | 72        | 448       | 1,428     | 102,127   |
| <span style="color: orange;">day_of_week</span>     | 45,211     | 15.81      | 8.32       | 1         | 8         | 16        | 21        | 31        |
| <span style="color: orange;">duration</span>        | 45,211     | 258.16     | 257.53     | 0         | 103       | 180       | 319       | 4,918     |
| <span style="color: orange;">campaign</span>        | 45,211     | 2.76       | 3.10       | 1         | 1         | 2         | 3         | 63        |
| <span style="color: orange;">pdays</span>           | 45,211     | 40.20      | 100.13     | -1        | -1        | -1        | -1        | 871       |
| <span style="color: orange;">previous</span>        | 45,211     | 0.58       | 2.30       | 0         | 0         | 0         | 0         | 275       |

## Distribution Analysis
1. **<span style="color: orange;">age</span>:** 
   - The age distribution has a mean of ~40.94 years with a standard deviation of ~10.62, indicating most values are clustered around the mean. 
   - The minimum is 18, and the maximum is 95, with the middle 50% of values ranging from 33 to 48.

2. **<span style="color: orange;">balance</span>:**
   - The average account balance is ~1,362.27 with a very high standard deviation of ~3,044.77, suggesting the presence of significant outliers (e.g., maximum of 102,127 and minimum of -8,019). 
   - The middle 50% of values fall between 72 (25th percentile) and 1,428 (75th percentile).

3. **<span style="color: orange;">day_of_week</span>:**
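   - Despite its name, this column takes values from 1 to 31, so it appears to hold the day of the month rather than the day of the week. The mean of ~15.81 and standard deviation of ~8.32 are consistent with a fairly uniform spread across the month.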

4. **<span style="color: orange;">duration</span>:**
   - Call duration varies widely with a mean of ~258 seconds and a standard deviation almost equal to the mean (~257). The data shows significant variation, with some extremely long calls (up to 4,918 seconds).

5. **<span style="color: orange;">campaign</span>:**
   - The number of contacts per campaign has a mean of ~2.76 and ranges from 1 to 63. The standard deviation (~3.10) suggests a right-skewed distribution with a few customers contacted many times.

6. **<span style="color: orange;">pdays</span>:**
   - This column has a mean of ~40.20, but the 25th, 50th, and 75th percentiles are all -1, indicating that at least three quarters of the entries are ones where the customer was not previously contacted. The range is wide, from -1 to 871.

7. **<span style="color: orange;">previous</span>:**
   - The distribution shows most customers had no previous contacts (median is 0). A small number of customers were contacted many times (up to 275).

## Key Observations
- **Outliers:** Significant outliers are present in <span style="color: orange;">balance</span>, <span style="color: orange;">duration</span>, and <span style="color: orange;">previous</span>, which may impact model performance.
- **Skewness:** <span style="color: orange;">campaign</span>, <span style="color: orange;">pdays</span>, and <span style="color: orange;">previous</span> appear to have skewed distributions.
- **Scale:** The high standard deviations and extreme values suggest that normalization or scaling may be required for some features.


### Data Preprocessing

In [7]:
# Missing values per column before imputation.
df.isna().sum()

age                0
job              288
marital            0
education       1857
default            0
balance            0
housing            0
loan               0
contact        13020
day_of_week        0
month              0
duration           0
campaign           0
pdays              0
previous           0
poutcome       36959
y                  0
dtype: int64

In [8]:

# Fill the gaps in these categorical columns with each column's most frequent
# value (the first entry returned by `mode()`).
for column in ('job', 'education', 'contact'):
    most_frequent = df[column].mode()[0]
    df[column] = df[column].fillna(most_frequent)

In [9]:
# Missing values per column after imputation; only `poutcome` should remain.
df.isna().sum()

age                0
job                0
marital            0
education          0
default            0
balance            0
housing            0
loan               0
contact            0
day_of_week        0
month              0
duration           0
campaign           0
pdays              0
previous           0
poutcome       36959
y                  0
dtype: int64

In [11]:
from sklearn.preprocessing import LabelEncoder
# Encoder for text columns; it is fitted in the encoding cell below.
le = LabelEncoder()

In [12]:

# Encode the remaining text columns as integers and drop `poutcome`
# (36,959 of its 45,211 values are missing).

# Explicit value -> code tables for the low-cardinality columns.
yes_no_codes = {'no': 0, 'yes': 1}
value_codes = {
    'marital': {'single': 0, 'married': 1, 'divorced': 2},
    'default': yes_no_codes,
    'housing': yes_no_codes,
    'loan': yes_no_codes,
    'contact': {'cellular': 0, 'telephone': 1},
    'month': {'jan': 0, 'feb': 1, 'mar': 2, 'apr': 3, 'may': 4, 'jun': 5,
              'jul': 6, 'aug': 7, 'sep': 8, 'oct': 9, 'nov': 10, 'dec': 11},
    'y': yes_no_codes,
}

# `Series.map` silently turns every value that is missing from its table into
# NaN, which is also what happens when this cell is run a second time on data
# that is already encoded. Build and check all mapped columns before touching
# `df`, so an unexpected value raises instead of corrupting the frame.
mapped_columns = {}
for column, codes in value_codes.items():
    mapped = df[column].map(codes)
    # Only values that were present before mapping count as unmapped.
    unmapped = df.loc[mapped.isna() & df[column].notna(), column].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Column {column!r} contains values with no code: {unmapped.tolist()}"
        )
    mapped_columns[column] = mapped

# `education` gets its own encoder so its fitted classes are not overwritten
# when `le` is re-fitted on `job`; `le` still ends up fitted on `job`.
education_encoder = LabelEncoder()
df['education'] = education_encoder.fit_transform(df['education'])
df['job'] = le.fit_transform(df['job'])

for column, mapped in mapped_columns.items():
    df[column] = mapped

# Reassign instead of mutating in place; the resulting columns are the same.
df = df.drop(columns=['poutcome'])

In [14]:
# Write the encoded frame (without the index column) to the data directory.
# The directory defaults to the original location and can be overridden with
# the BANK_MARKETING_DATA_DIR environment variable.
PREPROCESSED_CSV = (
    Path(os.environ.get('BANK_MARKETING_DATA_DIR', '/home/sarath_kumar/Bank_Marketing/data'))
    / 'bank_marketing_preprocessed.csv'
)
df.to_csv(PREPROCESSED_CSV, index=False)

In [29]:
# Final check: missing values per column in the preprocessed frame.
df.isna().sum()

age            0
job            0
marital        0
education      0
default        0
balance        0
housing        0
loan           0
contact        0
day_of_week    0
month          0
duration       0
campaign       0
pdays          0
previous       0
y              0
dtype: int64