## IMPORTING THE REQUIRED LIBRARIES

In [1]:
import numpy as np
import pandas as pd
from IPython import display

## REMOVING THE PANDAS LIMIT ON THE NUMBER OF DISPLAYED COLUMNS

In [2]:
# Show every column when a DataFrame is rendered (None removes the limit).
# The fully qualified key is required: the short form 'max_columns' is treated
# as a pattern and also matches 'styler.render.max_columns' on recent pandas
# versions, which makes set_option raise an OptionError.
pd.set_option('display.max_columns', None)

## IMPORTING THE DATA

In [3]:
# Load the raw dataset from the working directory; low_memory is disabled so
# the file is parsed in a single pass instead of in internal chunks.
data = pd.read_csv(filepath_or_buffer = 'data.csv', low_memory = False)

## CHECKING NULL VALUES IN ALL THE COLUMNS

In [4]:
# Fraction of missing values per column, computed once for the whole frame
# instead of twice per column inside a Python loop.
nan_fraction = data.isna().mean()

# Keep only the columns that actually contain missing values.
nan_fraction = nan_fraction[nan_fraction > 0]

# The plain lists are kept under their original names so any cell that reads
# them continues to work.
columns = nan_fraction.index.tolist()
nan_values = nan_fraction.tolist()

# Summary table: one row per affected column with its missing-value fraction.
columns_nan_values = pd.DataFrame({'column': columns, 'nan_values': nan_values})

## DISPLAYING THE COLUMNS WITH NAN VALUES

In [5]:
# Plain-text banner, followed by the summary table through IPython's rich display.
print('COLUMNS WITH NAN VALUES', '-----------------------', sep = '\n')
display.display(columns_nan_values)

COLUMNS WITH NAN VALUES
-----------------------


Unnamed: 0,column,nan_values
0,AGE,0.248030
1,NUMCHLD,0.870184
2,INCOME,0.223096
3,WEALTH1,0.468830
4,MBCRAFT,0.553955
...,...,...
87,RAMNT_24,0.814090
88,NEXTDATE,0.104526
89,TIMELAG,0.104526
90,CLUSTER2,0.001383


## CREATING A LIST THAT IS GOING TO HAVE THE COLUMNS THAT WE ARE GOING TO DROP

In [6]:
# Names of the columns that will be removed from the dataset; filled in by the cells below.
to_drop_columns = []

## ADDING THE ZIP AND OSOURCE COLUMNS TO THE LIST OF COLUMNS TO DROP

In [7]:
# Queue the hand-picked columns for removal, in the same order as before.
to_drop_columns.extend(['ZIP', 'OSOURCE'])

## ADDING THE COLUMNS WITH MORE THAN 85% NAN VALUES TO THE LIST OF COLUMNS TO DROP

In [8]:
# Share of missing values for every column, computed in one pass.
nan_share = data.isna().mean()

# Queue each column whose missing share exceeds 0.85, skipping names that are
# already queued so the list never holds duplicates.
for column in nan_share[nan_share > 0.85].index:
    if column in to_drop_columns:
        continue
    to_drop_columns.append(column)

## REMOVING ALL THE COLUMNS INSIDE THE COLUMNS TO DROP LIST FROM THE DATAFRAME

In [9]:
# Remove every queued column; drop returns a new frame, which replaces `data`.
# Re-running this cell without reloading `data` raises a KeyError because the
# columns are already gone.
data = data.drop(labels = to_drop_columns, axis = 'columns')

## FILLING THE NAN VALUES OF THE GENDER COLUMN WITH THE F VALUE

In [10]:
# Missing GENDER entries are imputed with the value 'F'.
data['GENDER'] = data['GENDER'].fillna(value = 'F')

## REDUCING THE NUMBER OF CATEGORIES IN THE GENDER COLUMN

In [11]:
def not_male_female(gender):
    """Collapse a gender value into one of three categories.

    Parameters
    ----------
    gender : object
        A single value from the GENDER column.

    Returns
    -------
    str
        'F' or 'M' when the value equals that code exactly, otherwise 'OTHER'.
    """
    # Only the two exact codes are kept; everything else shares one bucket.
    for known_code in ('F', 'M'):
        if gender == known_code:
            return known_code

    return 'OTHER'

# Map every GENDER value through not_male_female, leaving only 'F', 'M' and 'OTHER'.
data['GENDER'] = data['GENDER'].apply(func = not_male_female)

## CHECKING THE NAN VALUES IN THE NUMERICAL COLUMNS

In [12]:
# Subset of the dataset holding only the numeric columns.
numerical = data.select_dtypes(include = np.number)

# Missing-value count per numeric column, as a one-column frame indexed by column name.
numerical_nan = numerical.isna().sum().to_frame(name = 'nan_values')

# Show only the columns that have at least one missing value, laid out horizontally.
numerical_nan[numerical_nan['nan_values'] > 0].transpose()

Unnamed: 0,AGE,INCOME,WEALTH1,MBCRAFT,MBGARDEN,MBBOOKS,MBCOLECT,MAGFAML,MAGFEM,MAGMALE,PUBGARDN,PUBCULIN,PUBHLTH,PUBDOITY,PUBNEWFN,PUBPHOTO,PUBOPP,WEALTH2,MSA,ADI,DMA,ADATE_3,ADATE_4,ADATE_5,ADATE_6,ADATE_7,ADATE_8,ADATE_9,ADATE_10,ADATE_11,ADATE_12,ADATE_13,ADATE_14,ADATE_15,ADATE_16,ADATE_17,ADATE_18,ADATE_19,ADATE_20,ADATE_21,ADATE_22,ADATE_23,ADATE_24,RDATE_8,RDATE_9,RDATE_11,RDATE_12,RDATE_14,RDATE_16,RDATE_18,RDATE_19,RDATE_22,RDATE_24,RAMNT_8,RAMNT_9,RAMNT_11,RAMNT_12,RAMNT_14,RAMNT_16,RAMNT_18,RAMNT_19,RAMNT_22,RAMNT_24,NEXTDATE,TIMELAG,CLUSTER2
nan_values,23665,21286,44732,52854,52854,52854,52914,52854,52854,52854,52854,52854,52854,52854,52854,52854,52854,43823,132,132,132,1950,2191,33590,3557,8874,3511,11245,32748,10422,8923,40219,18867,65477,20364,27650,21263,24480,50200,35212,25648,56270,36973,73940,78678,80672,69712,72095,68418,75634,79535,74539,77674,73940,78678,80672,69712,72095,68418,75634,79535,74539,77674,9973,9973,132


## CLEANING THE FOLLOWING GEOCODE2 COLUMN

In [13]:
# Treat single-space placeholders as missing values. np.nan is used because the
# np.NaN alias was removed in NumPy 2.0 and raises an AttributeError there.
data['GEOCODE2'] = data['GEOCODE2'].replace({' ': np.nan})

# Impute the missing entries with the most frequent value of the column
# (the first one when several values tie).
data['GEOCODE2'] = data['GEOCODE2'].fillna(data['GEOCODE2'].mode()[0])

## CLEANING THE FOLLOWING WEALTH1 COLUMN

In [14]:
# Treat single-space placeholders as missing values. np.nan is used because the
# np.NaN alias was removed in NumPy 2.0 and raises an AttributeError there.
data['WEALTH1'] = data['WEALTH1'].replace({' ': np.nan})

# Impute the missing entries with the most frequent value of the column
# (the first one when several values tie).
data['WEALTH1'] = data['WEALTH1'].fillna(data['WEALTH1'].mode()[0])

## CLEANING THE FOLLOWING ADI COLUMN

In [16]:
# Impute missing ADI entries with the column mean rounded up to a whole number.
# NOTE(review): if ADI is an identifier code rather than a quantity, a mean-based fill may not be meaningful — confirm.
data['ADI'] = data['ADI'].fillna(
    value = np.ceil(data['ADI'].mean())
)

## CLEANING THE FOLLOWING DMA COLUMN

In [17]:
# Impute missing DMA entries with the column mean rounded up to a whole number.
# NOTE(review): if DMA is an identifier code rather than a quantity, a mean-based fill may not be meaningful — confirm.
data['DMA'] = data['DMA'].fillna(
    value = np.ceil(data['DMA'].mean())
)

## CLEANING THE FOLLOWING MSA COLUMN

In [18]:
# Impute missing MSA entries with the column mean rounded up to a whole number.
# NOTE(review): if MSA is an identifier code rather than a quantity, a mean-based fill may not be meaningful — confirm.
data['MSA'] = data['MSA'].fillna(
    value = np.ceil(data['MSA'].mean())
)