# About this notebook

This is a brief exploratory analysis of Brazilian society using the 2015 PNAD dataset (Pesquisa Nacional por Amostra de Domicílios: National Household Sample Survey).

Part 1: Data analysis (this notebook)

Part 2: [Modeling](https://www.kaggle.com/hinepo/pnad-income-prediction?scriptVersionId=74289035)

Part 3: [Lazy Predict](https://www.kaggle.com/hinepo/pnad-lazy-predict?scriptVersionId=74288711)

# Imports

[matplotlib styles](https://matplotlib.org/stable/gallery/style_sheets/style_sheets_reference.html)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('bmh')
import seaborn as sns

# Load data

In [None]:
# Read the raw survey extract from the Kaggle input directory and preview it.
raw_csv_path = '../input/testes/dados.csv'
df_raw = pd.read_csv(raw_csv_path)
df_raw

In [None]:
# True if at least one cell anywhere in the raw frame is missing.
df_raw.isna().to_numpy().any()

In [None]:
# Number of distinct (non-null) values held by each column.
for column_name, distinct_count in df_raw.nunique().items():
    print(column_name, ": ", distinct_count)

In [None]:
# Column dtypes as inferred by read_csv (no explicit dtypes were requested on load).
df_raw.dtypes

# Preprocess

In [None]:
# Numeric state (UF) code -> state name.
# Entries are grouped by the leading digit of the code, which in the IBGE
# coding scheme identifies the macro-region.
State_dict = {
    # 1x: North
    11: 'Rondônia',
    12: 'Acre',
    13: 'Amazonas',
    14: 'Roraima',
    15: 'Pará',
    16: 'Amapá',
    17: 'Tocantins',
    # 2x: Northeast
    21: 'Maranhão',
    22: 'Piauí',
    23: 'Ceará',
    24: 'Rio Grande do Norte',
    25: 'Paraíba',
    26: 'Pernambuco',
    27: 'Alagoas',
    28: 'Sergipe',
    29: 'Bahia',
    # 3x: Southeast
    31: 'Minas Gerais',
    32: 'Espírito Santo',
    33: 'Rio de Janeiro',
    35: 'São Paulo',
    # 4x: South
    41: 'Paraná',
    42: 'Santa Catarina',
    43: 'Rio Grande do Sul',
    # 5x: Center-West
    50: 'Mato Grosso do Sul',
    51: 'Mato Grosso',
    52: 'Goiás',
    53: 'Distrito Federal',
}

In [None]:
# Numeric sex code -> label.
Sex_dict = {0: 'Male', 1: 'Female'}

In [None]:
# Numeric color/race code -> label (codes are the even numbers 0-8, plus 9 for 'N/A').
Color_dict = {
    0: 'Indigenous',
    2: 'White',
    4: 'Black',
    6: 'Yellow',
    8: 'Brown',
    9: 'N/A',
}

In [None]:
# Survey code -> years of study. Codes 1..16 are simply offset by one
# (code 1 -> 0 years, ..., code 16 -> 15 years).
Years_of_study_dict = {code: code - 1 for code in range(1, 17)}

# Code 17 is folded into 0 years.
# NOTE(review): presumably 17 is an "undetermined" category; mapping it to 0
# makes it indistinguishable from "no schooling" - confirm this is intended.
Years_of_study_dict[17] = 0

**Note:** a value of 15 in `Years_of_study` means "15 years or more".

In [None]:
%%time
def preprocess(dataframe):
    """Return a decoded, English-labelled copy of the raw survey frame.

    The input frame is not modified. The function:

    * renames the Portuguese column names to English ones;
    * decodes State, Sex, Color and Years_of_study through the module-level
      lookup dictionaries (codes absent from a dictionary become NaN, which
      is how ``Series.map`` treats unmapped values);
    * rounds Height to two decimals.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Raw frame holding the columns UF, Sexo, Idade, Cor, Anos de Estudo,
        Renda and Altura.

    Returns
    -------
    pandas.DataFrame
        A new frame with the translated and decoded columns.
    """
    english_names = {
        "UF": "State",
        "Sexo": "Sex",
        "Idade": "Age",
        "Cor": "Color",
        "Anos de Estudo": "Years_of_study",
        "Renda": "Income",
        "Altura": "Height",
    }
    df = dataframe.copy().rename(columns=english_names)

    # Coded column -> dictionary used to decode it.
    lookups = {
        "State": State_dict,
        "Sex": Sex_dict,
        "Color": Color_dict,
        "Years_of_study": Years_of_study_dict,
    }
    for column, lookup in lookups.items():
        df[column] = df[column].map(lookup)

    df['Height'] = df['Height'].round(2)

    return df

df = preprocess(df_raw)

In [None]:
# Show the frame returned by preprocess().
df

# Univariate Analysis

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

## Histograms

In [None]:
# One histogram per numeric column; the trailing ';' hides the returned array of Axes.
df.hist(figsize=(10,8));

## Boxplots

In [None]:
%%time
# One boxplot per numeric column, laid out on a 2x2 grid.
fig = plt.figure(figsize=(10, 8))
fig.suptitle('Boxplots')

# Every column except the categorical ones. A descriptive name is used
# instead of `vars` so the Python builtin is not shadowed.
numeric_columns = [col for col in df.columns if col not in ['State', 'Sex', 'Color']]

for position, column in enumerate(numeric_columns, start=1):
    # Draw on an explicit Axes rather than on pyplot's implicit "current" one.
    ax = fig.add_subplot(2, 2, position)
    ax.set_title(column)
    sns.boxplot(data=df[column], ax=ax)

## Histograms & Boxplots

In [None]:
def plot_together(df, var, bins=10):
    """Draw a slim horizontal boxplot stacked above a histogram of one column.

    Both panels share the x-axis, so the box and the distribution line up.
    The function creates a new figure and returns nothing; note that it also
    switches the global seaborn style to "ticks".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to plot.
    var : str
        Name of the column to plot.
    bins : int, default 10
        Number of histogram bins.
    """
    sns.set(style="ticks")

    x = df[var]

    f, (ax_box, ax_hist) = plt.subplots(2, sharex=True,
                                        gridspec_kw={"height_ratios": (.15, .85)})

    # Pass the vector by keyword: a positional vector is deprecated in
    # seaborn 0.11 and is interpreted as `data` from 0.12 on, which draws the
    # box vertically and breaks the shared x-axis.
    sns.boxplot(x=x, ax=ax_box)

    # `distplot` is deprecated; `histplot` is its replacement. Density
    # normalisation plus a KDE overlay mirrors what distplot drew here
    # (distplot normalised the histogram whenever the KDE was shown).
    sns.histplot(x=x, ax=ax_hist, bins=bins, kde=True, stat="density")

    ax_box.set(yticks=[])
    sns.despine(ax=ax_hist)
    sns.despine(ax=ax_box, left=True)

In [None]:
%%time
# Age
plot_together(df, var='Age', bins=20)

# Income: many narrow bins, then zoom the x-axis of the figure just drawn
# to the 0-10000 range.
plot_together(df, var='Income', bins=1000)
plt.xlim(0, 10000)

# Years of study
plot_together(df, var='Years_of_study', bins=20)

# Height (default number of bins)
plot_together(df, var='Height')

## Value counts

In [None]:
%%time
# Bar chart of value counts for every column except the continuous ones,
# laid out on a 2x2 grid.
fig = plt.figure(figsize=(14, 8))

# Named `count_columns` rather than `vars` so the Python builtin is not shadowed.
count_columns = [col for col in df.columns if col not in ['Income', 'Age', 'Height']]

for position, column in enumerate(count_columns, start=1):
    ax = fig.add_subplot(2, 2, position)
    ax.set_title(column)
    df[column].value_counts().plot(kind='bar', ax=ax)

# Lay the panels out once, after all of them exist.
fig.tight_layout()

In [None]:
%%time
# Bar chart of value counts for Age and Height, one panel per row.
fig = plt.figure(figsize=(16, 10))

# Named `count_columns` rather than `vars` so the Python builtin is not shadowed.
count_columns = ['Age', 'Height']

for position, column in enumerate(count_columns, start=1):
    ax = fig.add_subplot(2, 1, position)
    ax.set_title(column)
    df[column].value_counts().plot(kind='bar', ax=ax)

# Lay the panels out once, after all of them exist.
fig.tight_layout()

In [None]:
# Absolute and relative frequency of each sex.
sex = df["Sex"]
print(sex.value_counts())
print("\n")
print(sex.value_counts(normalize=True))

In [None]:
# Number of respondents younger than 20
# (the author's run reported 423 out of 76840 samples).
int((df["Age"] < 20).sum())

In [None]:
# Range of the Age column.
age = df["Age"]
print("Maximum value for column Age:", age.max())
print("Minimum value for column Age:", age.min())

# Bivariate Analysis

# Years of study

## Years of study vs Color

In [None]:
# Distribution of years of study within each color group.
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='Color', y='Years_of_study').set_title("Years of study x Color");

## Years of study vs Sex

In [None]:
# Distribution of years of study for each sex.
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='Sex', y='Years_of_study').set_title("Years of study x Sex");

- Women tend to study more years than men.

## Years of study vs State

In [None]:
# Average years of study per state.
# Select the column *before* aggregating: the frame also holds text columns
# (Sex, Color), and pandas >= 2.0 raises a TypeError when asked for their mean.
# The double brackets keep the result a DataFrame, so the bar chart still
# carries a legend entry.
df.groupby('State')[['Years_of_study']].mean().plot(figsize=(14,6), kind='bar')
plt.title("Years of study (Average) x State");

# Income

In [None]:
# Some insights
total_samples = len(df)

# (label shown in the message, boolean mask selecting the bracket)
income_brackets = [
    ("< 20 k", df["Income"] < 20000),
    (">= 20 k", df["Income"] >= 20000),
    (">= 40 k", df["Income"] >= 40000),
]

for bracket_label, in_bracket in income_brackets:
    bracket_count = int(in_bracket.sum())
    # Multiply by 100 so the number really is the percentage its "%" suffix claims.
    bracket_share = round(100 * bracket_count / total_samples, 2)
    print("Number of samples that have Income " + bracket_label + " :", bracket_count, ":", bracket_share, "%")

print("\nAverage Salary (Income) :", round(df['Income'].mean(), 2))
print("Maximum value for Income :", df["Income"].max())
print("Minimum value for Income :", df["Income"].min())

In [None]:
# income histogram with zoom (less than 40k)
# Four panels, each restricted to a progressively lower income ceiling.

fig, ax = plt.subplots(1, 4, figsize=(16, 5))

for panel, ceiling_k in zip(ax, (40, 15, 10, 5)):
    below_ceiling = df.loc[df["Income"] < ceiling_k * 1000, "Income"]
    panel.hist(below_ceiling, bins=100)
    panel.set_title(f'Frequency x Income (<{ceiling_k}k)')

In [None]:
# income histogram with zoom (more than 40k)
high_income = df.loc[df["Income"] > 40000, "Income"]
high_income.plot(kind='hist', bins=300, figsize=(10, 4))
plt.title('Frequency x Income (>40k)');

In [None]:
%%time
# Income distribution (capped at max_value) for each categorical column,
# one panel per row.
fig = plt.figure(figsize=(14, 12))

max_value = 5000

# Named `category_columns` rather than `vars` so the Python builtin is not shadowed.
category_columns = [col for col in df.columns if col not in ['Income', 'Height', 'Age', 'State']]

# Filter once and plot from that single frame, so x and y always have the
# same rows instead of relying on index alignment between a full-length x
# and a filtered y.
low_income = df[df['Income'] < max_value]

for position, column in enumerate(category_columns, start=1):
    ax = fig.add_subplot(3, 1, position)
    sns.boxplot(x=column, y='Income', data=low_income, ax=ax)

# Lay the panels out once, after all of them exist.
fig.tight_layout()

- Although women usually study more than men (as seen in the item Years of study vs. Sex), they generally have lower income than men.
- There is a clear relationship between years of schooling and earnings.

In [None]:
%%time
# Income distribution (capped at max_value) for each Age and Height value,
# one panel per row.
fig = plt.figure(figsize=(16, 10))

max_value = 5000

# Filter once and plot from that single frame, so x and y always have the
# same rows instead of relying on index alignment between a full-length x
# and a filtered y.
low_income = df[df['Income'] < max_value]

# Loop variable is named `column` (the list is inlined) so the builtin
# `vars` is not shadowed.
for position, column in enumerate(['Age', 'Height'], start=1):
    ax = fig.add_subplot(2, 1, position)
    ax.set_title(column)
    sns.boxplot(x=column, y='Income', data=low_income, ax=ax)

    if column == 'Height':
        # Height has many distinct values; rotate the tick labels so they stay legible.
        plt.xticks(rotation=90)

# Lay the panels out once, after all of them exist.
fig.tight_layout()

- As expected, height does not affect income.
- As people get older, they tend to have lower incomes.

## Income x Age x Color

In [None]:
# Scatter a fixed random subsample so individual points stay readable.
n = 500
sample = df.sample(n=n, random_state=4)  # plot only n samples

plt.figure(figsize=(16, 6))
# NOTE(review): this switches the global matplotlib style; it stays active
# for later cells until another plt.style.use call replaces it.
plt.style.use('fivethirtyeight')
plt.ylim(0, 20000)
plt.title("Income x Age x Color")

age_ticks = list(range(0, 90, 10))
plt.xticks(age_ticks, labels=age_ticks)

sns.scatterplot(data=sample, x='Age', y='Income', hue='Color');

In [None]:
# Density of (Age, Income) pairs, restricted to incomes below `limit`.
limit = 10_000

below_limit = df.loc[df['Income'] < limit]
below_limit.plot.hexbin(x='Age', y='Income', gridsize=15);

## Income x Years of study

In [None]:
# Split the respondents into four years-of-study brackets.
# The actual bounds are: <= 5, 6-9, 10-14 and >= 15 (the variable names are
# looser than the conditions they hold).
years_of_study = df["Years_of_study"]

less_than_five_years = df[years_of_study <= 5]
five_nine_years = df[(years_of_study > 5) & (years_of_study < 10)]
nine_fourteen_years = df[(years_of_study >= 10) & (years_of_study < 15)]
more_than_fifteen_years = df[years_of_study >= 15]

study_brackets = (
    ("0-5", less_than_five_years),
    ("6-9", five_nine_years),
    ("10-14", nine_fourteen_years),
    ("15+", more_than_fifteen_years),
)

for bracket_label, bracket_rows in study_brackets:
    print(f"Average income for {bracket_label} years of study :", round(bracket_rows['Income'].mean(), 2))

In [None]:
# plot averages
# Mean income of each years-of-study bracket, in ascending bracket order.
year_avgs = np.array([
    round(bracket_rows['Income'].mean(), 2)
    for bracket_rows in (
        less_than_five_years,
        five_nine_years,
        nine_fourteen_years,
        more_than_fifteen_years,
    )
])

# Labels state the real bracket bounds (<= 5, 6-9, 10-14, >= 15); the
# previous '<5' / '5-9' labels did not match how the groups are filtered.
categories = np.array(['0-5', '6-9', '10-14', '15+'])

plt.figure(figsize=(8,6))
plt.style.use('bmh')
sns.barplot(x=categories, y=year_avgs)
plt.title("Income x Years of study");

In [None]:
# Density of (Years_of_study, Income) pairs, restricted to incomes below `limit`.
limit = 10_000

below_limit = df.loc[df['Income'] < limit]
below_limit.plot.hexbin(x='Years_of_study', y='Income', gridsize=15);

## Income x State

In [None]:
# Average income per state, with the overall mean drawn as a dashed reference line.
# Select the column *before* aggregating: the frame also holds text columns
# (Sex, Color), and pandas >= 2.0 raises a TypeError when asked for their mean.
df.groupby('State')[['Income']].mean().plot(figsize=(16,6), kind='bar');
plt.axhline(y = df['Income'].mean(), color = 'r', linestyle = 'dashed', label = 'mean value');
plt.legend();

# Height x Sex

In [None]:
# Distribution of height for each sex.
plt.figure(figsize=(8, 6))
sns.boxplot(data=df, x='Sex', y='Height').set_title("Height x Sex");

In [None]:
# Mean height of each sex, rounded to three decimals.
male_heights = df.loc[df['Sex'] == 'Male', 'Height']
female_heights = df.loc[df['Sex'] == 'Female', 'Height']

print("Average height for men :", round(male_heights.mean(), 3))
print("Average height for women :", round(female_heights.mean(), 3))

# Multivariate Analysis

Now let's check some correlations on the data.

Reminder: Correlation is not causation!

In [None]:
%%time

# Pairwise relationships on a fixed random subsample, colored by Color.
n = 800
sample = df.sample(n=n, random_state=0)

sns.pairplot(data=sample, hue='Color');

In [None]:
%%time

# Pairwise relationships on a fixed random subsample, colored by Sex.
n = 800
sample = df.sample(n=n, random_state=0)

sns.pairplot(data=sample, hue='Sex');

In [None]:
# heatmap for correlations
# Restrict to numeric columns first: the frame also holds text columns
# (State, Sex, Color), and pandas >= 2.0 raises when DataFrame.corr meets them.
corr = df.select_dtypes(include='number').corr()
plt.figure(figsize=(8,6))
# NOTE(review): the color scale spans 0..1, so any negative correlation is
# drawn with the lightest color; the annotation still shows its value.
sns.heatmap(corr, annot = True, vmin = 0, vmax = 1, cmap = 'Purples');

# Export clean PNAD df

In [None]:
# Write the preprocessed frame to the Kaggle working directory, without the index column.
clean_csv_path = '/kaggle/working/pnad_clean.csv'
df.to_csv(clean_csv_path, index=False)