In [None]:
%config IPCompleter.greedy=True
%matplotlib inline
import pandas as pd
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
from matplotlib import style
from matplotlib import rcParams

import seaborn as sns
import numpy as np

# Plot styling.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and newer releases no longer accept the old names, so use whichever name
# this installation provides.
poster_style = 'seaborn-v0_8-poster' if 'seaborn-v0_8-poster' in style.available else 'seaborn-poster'
style.use(poster_style)  # sets the size of the charts
style.use('ggplot')
rcParams['font.family'] = "serif"
sns.set_context('talk')

import sys
import os
from pathlib import Path

# Every figure in this notebook is saved into 'sampleImages', located in the
# parent of the current working directory. Create the folder up front because
# savefig does not create missing directories.
IMAGES_FOLDER = Path.cwd().parent / 'sampleImages'
IMAGES_FOLDER.mkdir(parents=True, exist_ok=True)
# Load the "flights" example dataset through seaborn.
# NOTE(review): no cell visible in this notebook reads `flights` — confirm it is still needed.
flights = sns.load_dataset("flights")

 ### Read data

In [None]:
# Load the pre-cleaned expenses pickle from the sibling data folder and preview it.
clean_pickle_path = os.path.join('..', 'data/despesas56_clean.pkl')
df_clean = pd.read_pickle(clean_pickle_path)
df_clean.head(n=3)

 ### Fit data for analysis

In [None]:
# Work on a copy so df_clean stays untouched and this cell can be re-run safely.
df = df_clean.copy()

# Add a month-year period column derived from the full expense date.
# The original 'dataGasto' column is kept; later cells still use it.
df['periodo'] = df['dataGasto'].dt.to_period('M')

# Keep the twelve months from 2019-07 through 2020-06.
in_window = (df['periodo'] > '2019-06') & (df['periodo'] <= '2020-06')
df = df.loc[in_window]

# Remove compensatory values: rows whose document number starts with 'Comp'.
# na=False treats a missing document number as "not compensatory" instead of
# leaving NaN in the mask, which the `~` operator cannot invert.
is_compensation = df['numDocumento'].str.match('Comp', na=False)
df = df[~is_compensation].reset_index(drop=True)

# Random subset for plotting: capped at the frame size so small frames do not
# raise, and seeded so re-running the notebook yields the same sample.
df_plot = df.sample(n=min(10000, len(df)), random_state=42)

 ### Expenses by month

In [None]:
# Total amount spent on each expense date, drawn as a single line.
total_by_date = df.groupby('dataGasto')['valor'].sum().reset_index()

fig, ax = plt.subplots(figsize=(18, 7))
ax.margins(x=0, y=0)  # no padding between the line and the axes edges
sns.lineplot(data=total_by_date, x='dataGasto', y='valor', ax=ax)

ax.set_xlabel('Data', fontsize=13)
ax.set_ylabel('Valor (R$)', fontsize=13)
ax.set_title('Total gasto em cotas parlamentares', fontsize=15)

save_fig = IMAGES_FOLDER.joinpath('despesas-total-data.png')
fig.savefig(save_fig, bbox_inches='tight')

 ### Expenditure by Education Level

In [None]:
# Load the model pickle used below for the education-level ('escolaridade') analysis.
model_pickle_path = os.path.join('..', 'data/despesas56_model.pkl')
df_escolaridade = pd.read_pickle(model_pickle_path)

In [None]:
# Distinct 'escolaridade' values recorded for each 'nome', as an array with one entry per name.
df_escolaridade.groupby('nome')['escolaridade'].unique().values

In [None]:
# Mean expense value per (name, education level) pair.
# df_escolaridade is already loaded by an earlier cell and has not been modified
# since, so the pickle is not read a second time here.
media_escolaridade = (
    df_escolaridade
    .groupby(['nome', 'escolaridade'])['valor']
    .mean()
    .reset_index()
)

# Bar chart of those per-name means grouped by education level; rows are sorted
# by value before plotting, as before.
fig, ax = plt.subplots(figsize=(18, 7))
sns.barplot(ax=ax, x="escolaridade", y="valor", data=media_escolaridade.sort_values(by='valor'))
ax.set_title('Gasto Medio por Escolaridade')

save_fig = IMAGES_FOLDER / 'despesas-por-educacao.png'
fig.savefig(save_fig, bbox_inches='tight')

 ### Heat map: plane ticket value by month and state (UF)

In [None]:
# Mean value of expenses whose type starts with 'PASSAGEM', per month and per
# 'ufPartido', drawn as a heat map.
features = ['ufPartido', 'dataGasto', 'valor', 'tipo', 'fornecedor']

# na=False keeps the mask strictly boolean even if some rows have no 'tipo'.
index_heatmap_passage = df.tipo.str.match('PASSAGEM', na=False)

# Build the frame in one chain (no in-place mutation) so the cell is re-runnable:
# replace the full date by its month period and use that period as the index.
df_heatmap = (
    df.loc[index_heatmap_passage, features]
    .reset_index(drop=True)
    .assign(periodo=lambda frame: frame['dataGasto'].dt.to_period('M'))
    .drop(columns='dataGasto')
    .set_index('periodo')
)

# aggfunc="mean" is the spelling pandas recommends; passing np.mean is deprecated.
d1 = pd.pivot_table(df_heatmap, index="periodo", columns="ufPartido", values="valor", aggfunc="mean", fill_value=0)

# Column order for the heat map.
estados = ['RS', 'SC', 'PR',
           'SP', 'MG', 'RJ', 'ES',
           'MS', 'GO', 'DF', 'MT',
           'BA', 'SE', 'AL', 'PE', 'PB', 'RN', 'CE', 'PI', 'MA',
           'TO', 'PA', 'AP', 'RO', 'AC', 'AM', 'RR']

fig, ax = plt.subplots(figsize=(18, 7))
# reindex instead of d1[estados]: a state without any matching expense has no
# pivot column, and plain column selection would raise KeyError. Missing states
# are filled with 0, matching the pivot's own fill_value.
sns.heatmap(ax=ax, data=d1.reindex(columns=estados, fill_value=0))

ax.set_xlabel('Estado')
ax.set_ylabel('Data')
ax.set_title('Gasto médio com passagens aéreas')

save_fig = IMAGES_FOLDER / 'heat-map-despesas-uf.png'
fig.savefig(save_fig, bbox_inches='tight')

 ### Bar plot by Partido

In [None]:
# Number of distinct names per party, split by sex, as a stacked bar chart.
features = ['partido', 'sexo']

# One row per name (first occurrence). Series.replace matches whole values only,
# so the 'M'/'F' codes are relabelled without rewriting letters inside other text.
df_sex_proportion = (
    df.drop_duplicates(subset='nome')[features]
    .assign(sexo=lambda frame: frame['sexo'].replace({'M': 'Homem', 'F': 'Mulher'}))
)

# Rows: party; columns: sex; cells: head count (0 where a combination is absent).
df_sex_proportion = df_sex_proportion.groupby(['partido', 'sexo']).size().unstack(fill_value=0)

# Draw on an explicit Axes. The former bare plt.figure() call left an empty
# extra figure behind, because DataFrame.plot opens its own when given no ax.
fig, ax = plt.subplots(figsize=(18, 8))
df_sex_proportion.plot(kind='bar', stacked=True, xlabel='Partido', ylabel='Deputados do Partido', ax=ax)

save_fig = IMAGES_FOLDER / 'distribuicao-genero-partido.png'
fig.savefig(save_fig, bbox_inches='tight')


 ### Bar plot by UF

In [None]:
# Number of distinct names per state ('ufPartido'), split by sex, as a stacked bar chart.
features = ['ufPartido', 'sexo']

# One row per name (first occurrence). Series.replace matches whole values only,
# so the 'M'/'F' codes are relabelled without rewriting letters inside other text.
df_sex_proportion = (
    df.drop_duplicates(subset='nome')[features]
    .assign(sexo=lambda frame: frame['sexo'].replace({'M': 'Homem', 'F': 'Mulher'}))
)

# Rows: state; columns: sex; cells: head count (0 where a combination is absent).
df_sex_proportion = df_sex_proportion.groupby(['ufPartido', 'sexo']).size().unstack(fill_value=0)

# Plot and save through an explicit figure rather than pyplot's "current figure".
fig, ax = plt.subplots(figsize=(18, 8))
df_sex_proportion.plot(kind='bar', stacked=True, xlabel='Estado', ylabel='Deputados do Estado', ax=ax)

save_fig = IMAGES_FOLDER / 'distribuicao-genero-uf.png'
fig.savefig(save_fig, bbox_inches='tight')

### Relation between type of expenses and document emitted

In [None]:
def insert_every_n(n, string):
    """Break a space-separated string into lines of roughly ``n`` letters.

    Words are collected into the current line until the summed word lengths
    (spaces are not counted) exceed ``n``; the line is then closed and a new
    one is started. Words left over after the last closed line are appended to
    that line, separated by a space, rather than forming a line of their own.

    Args:
        n: Letter budget per line; a line is closed once its words exceed it.
        string: Text to wrap. It is split on single spaces.

    Returns:
        The wrapped text with lines joined by a newline, or ``string`` itself
        when no line ever exceeded the budget.
    """
    words = string.split(' ')
    lines = []
    pending = []  # words of the line currently being filled
    width = 0     # summed length of the words in `pending`

    for word in words:
        pending.append(word)
        width += len(word)

        if width > n:
            lines.append(' '.join(pending))
            pending = []
            width = 0

    # Nothing overflowed: hand the input back untouched.
    if not lines:
        return string

    # Trailing words stay on the last closed line.
    if pending:
        lines[-1] = ' '.join([lines[-1], *pending])

    return '\n'.join(lines)

In [None]:
# Strip plot of document value per expense type, coloured by the kind of document issued.
features = ['tipo', 'tipoDocumento', 'valor']

# Fix the category order explicitly and reuse it for the tick labels, so the
# hand-written labels below always line up with the positions seaborn draws
# each expense type at. Missing types are skipped; they cannot be lower-cased.
tipo_order = list(df['tipo'].dropna().unique())

chart = sns.catplot(
    data=df[features], x='tipo', y='valor', hue='tipoDocumento', order=tipo_order,
    s=4, jitter=True, height=6, aspect=4, legend_out=False,
)

# Lower-case the expense types and wrap long names onto several lines.
labels = [insert_every_n(15, tipo.lower()) for tipo in tipo_order]
labels_ticks = np.arange(len(labels))

chart.set(xticks=labels_ticks, xticklabels=labels, xlabel=None, ylabel='Valor do Documento (R$)')
chart.set_xticklabels(rotation=90, horizontalalignment='center')

plt.legend(title='Documento Emitido')

save_fig = IMAGES_FOLDER / 'relacao-tipo-documento.png'
plt.savefig(save_fig, bbox_inches='tight')