# Preliminary EDA

## Imports 

In [1]:
import pandas as pd
import numpy as np
import altair as alt
import altair_ally as aly
import matplotlib.pyplot as plt

## Data Cleaning

In [18]:
# Load the raw data. The first line of the file is read as data (not as a header),
# so the column names are supplied explicitly, in file order.
df = pd.read_csv(
    '../data/raw/cmc_data.txt',
    header=None,
    names=[
        'age',
        'education',
        'spouse_education',
        'children',
        'religion',
        'work',
        'spouse_occupation',
        'living_standard',
        'media_exposure',
        'contraceptive_method',
    ],
)
df.head()

Unnamed: 0,age,education,spouse_education,children,religion,work,spouse_occupation,living_standard,media_exposure,contraceptive_method
0,24,2,3,3,1,1,2,3,0,1
1,45,1,3,10,1,1,3,4,0,1
2,43,2,3,7,1,1,3,4,0,1
3,42,3,2,9,1,1,3,3,0,1
4,36,3,3,8,1,1,3,2,0,1


In [19]:
# Check each column's dtype and non-null count to spot missing values or mis-typed columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1473 entries, 0 to 1472
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   age                   1473 non-null   int64
 1   education             1473 non-null   int64
 2   spouse_education      1473 non-null   int64
 3   children              1473 non-null   int64
 4   religion              1473 non-null   int64
 5   work                  1473 non-null   int64
 6   spouse_occupation     1473 non-null   int64
 7   living_standard       1473 non-null   int64
 8   media_exposure        1473 non-null   int64
 9   contraceptive_method  1473 non-null   int64
dtypes: int64(10)
memory usage: 115.2 KB


All the regressors and the target are stored as integers. We will wrangle the data so that each variable has an appropriate data type. According to the data description on the UCI website, the variables should have the following types:
- `age` : numeric type (already encoded as such)
- `education` : ordinal (already encoded as such)
- `spouse_education` : ordinal (already encoded as such)
- `children` : numeric type as integers (already encoded as such)
- `religion` : binary (Islam or not - already encoded as such)
- `work` : binary (yes or no - already encoded as such)
- `spouse_occupation` : categorical (needs proper encoding)
- `living_standard` : ordinal (already encoded as such)
- `media_exposure` : binary (already encoded as such)
- `contraceptive_method` : categorical (needs proper encoding)


In [20]:
# check which codes occur in spouse_occupation and how often
df['spouse_occupation'].value_counts()

3    585
1    436
2    425
4     27
Name: spouse_occupation, dtype: int64

In [21]:
# Cast the occupation codes to strings so they are treated as category labels
# rather than numbers; astype is the vectorised equivalent of apply(str).
df['spouse_occupation'] = df['spouse_occupation'].astype(str)

In [42]:
# Recode the target: swap the integer codes for descriptive class labels
df['contraceptive_method'] = df['contraceptive_method'].replace(
    {1: 'No use', 2: 'Long term', 3: 'Short term'}
)
df.head()

Unnamed: 0,age,education,spouse_education,children,religion,work,spouse_occupation,living_standard,media_exposure,contraceptive_method
0,24,2,3,3,1,1,2,3,0,No use
1,45,1,3,10,1,1,3,4,0,No use
2,43,2,3,7,1,1,3,4,0,No use
3,42,3,2,9,1,1,3,3,0,No use
4,36,3,3,8,1,1,3,2,0,No use


In [43]:
# Summary statistics for the columns that are still numeric after the recoding above
df.describe()

Unnamed: 0,age,education,spouse_education,children,religion,work,living_standard,media_exposure
count,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0
mean,32.538357,2.958588,3.429735,3.261371,0.850645,0.749491,3.133741,0.073999
std,8.227245,1.014994,0.816349,2.358549,0.356559,0.433453,0.976161,0.261858
min,16.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
25%,26.0,2.0,3.0,1.0,1.0,0.0,3.0,0.0
50%,32.0,3.0,4.0,3.0,1.0,1.0,3.0,0.0
75%,39.0,4.0,4.0,4.0,1.0,1.0,4.0,0.0
max,49.0,4.0,4.0,16.0,1.0,1.0,4.0,1.0


### Effect of `work` on `contraceptive_method`

In [89]:
# Respondent counts per `work` value, drawn as one narrow panel per contraceptive method.
# NOTE(review): `work` is shown as its raw 0/1 codes; confirm against the UCI data
# description which code means "working" before interpreting the bars.
alt.Chart(df).properties(width=50).mark_bar().encode(
    column=alt.Column('contraceptive_method:N', title='Contraception'),
    x=alt.X('work:N', title=None),
    y='count()',
    color=alt.Color('work:N', title='Work'),
)

### Effect of `age` on `contraceptive_method`

In [79]:
# Age distribution for each contraceptive method; whiskers span the full min-max range.
# The target is a nominal category, so it is typed `:N` on both the y axis and the color
# channel (the color legend is hidden because the y axis already labels each class).
alt.Chart(df).mark_boxplot(extent='min-max').encode(
    y=alt.Y('contraceptive_method:N', title='Contraception'),
    x=alt.X('age:Q', title='Age'),
    color=alt.Color('contraceptive_method:N', legend=None),
).properties(height=100)

### Effect of `children` on `contraceptive_method`

In [82]:
# Number of children for each contraceptive method; whiskers span the full min-max range.
# The target is a nominal category, so it is typed `:N` on both the y axis and the color
# channel (the color legend is hidden because the y axis already labels each class).
alt.Chart(df).mark_boxplot(extent='min-max').encode(
    y=alt.Y('contraceptive_method:N', title='Contraception'),
    x=alt.X('children:Q', title='Children'),
    color=alt.Color('contraceptive_method:N', legend=None),
).properties(height=100)

### Effect of `living_standard` on `contraceptive_method`

In [96]:
# Counts of each contraceptive method, one narrow panel per living-standard level.
# living_standard is an ordered scale, so the facet is typed ordinal (`:O`) to keep
# the panels in level order by construction.
alt.Chart(df).mark_bar().encode(
    x=alt.X('contraceptive_method:N', title=None),
    y='count()',
    color=alt.Color('contraceptive_method:N', title='Contraception'),
    column=alt.Column('living_standard:O', title='Living Standard'),
).properties(width=50)

### Analysis of correlations

In [97]:
# Correlation overview of df via altair_ally
# (presumably restricted to the numeric columns; confirm against the altair_ally docs)
aly.corr(df)

### Regressor distributions for multiple target classes

In [92]:
# Per-column distributions drawn as bars, grouped by the contraceptive_method class
# (presumably the second argument is the color/grouping column; confirm against the altair_ally docs)
aly.dist(df, 'contraceptive_method', mark='bar')