# Start digging into your data and generating visualizations to spot strange patterns or interesting statistics.

## Think of questions that data can answer. For example:
- Can you see which senator declared the most expenses?
- What percentage of the quota did the senators spend?
- Did any senator spend more than 90% of the quota?
- In an election year, do senators spend more?

# Load libraries

In [79]:
import pandas as pd
import numpy as np
import plotly.offline as py
import plotly.graph_objs as go

# Set up plotly's offline notebook mode before any figure is drawn with py.iplot.
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of
# embedding it in the notebook — confirm against the plotly.offline docs.
py.init_notebook_mode(connected=True)

# Load the dataset

In [2]:
# Read the ';'-delimited preprocessed CSV into a DataFrame and echo it as the
# cell output.
dataset = pd.read_csv(
    filepath_or_buffer='../day1/datasets/dataset_preprocessed.csv',
    sep=';',
)
dataset

Unnamed: 0,ANO,MES,SENADOR,TIPO_DESPESA,CNPJ_CPF,FORNECEDOR,DATA,VALOR_REEMBOLSADO,COD_DOCUMENTO
0,2019,1,ACIR GURGACZ,"Aluguel de imóveis para escritório político, c...",05.914.650/0001-66,ENERGISA,2019-04-01,66.02,2116543
1,2019,1,ACIR GURGACZ,"Aluguel de imóveis para escritório político, c...",05.914.650/0001-66,ENERGISA,2019-04-01,139.98,2116546
2,2019,1,ACIR GURGACZ,"Aluguel de imóveis para escritório político, c...",004.948.028-63,GILBERTO PISELO DO NASCIMENTO,2019-07-01,6000.00,2113817
3,2019,1,ACIR GURGACZ,"Aluguel de imóveis para escritório político, c...",05.423.963/0001-11,OI MÓVEL S.A.,2018-12-25,316.39,2116541
4,2019,2,ACIR GURGACZ,"Aluguel de imóveis para escritório político, c...",05.914.650/0001-66,ENERGISA,2019-04-02,99.45,2116550
...,...,...,...,...,...,...,...,...,...
52527,2021,12,ZEQUINHA MARINHO,"Passagens aéreas, aquáticas e terrestres nacio...",22.052.777/0001-32,Exceller Tour,2021-01-12,1715.45,2170983
52528,2021,12,ZEQUINHA MARINHO,"Passagens aéreas, aquáticas e terrestres nacio...",22.052.777/0001-32,Exceller Tour,2021-12-15,2489.29,2172263
52529,2021,12,ZEQUINHA MARINHO,"Passagens aéreas, aquáticas e terrestres nacio...",22.052.777/0001-32,Exceller Tour,2021-12-17,1486.66,2172717
52530,2021,12,ZEQUINHA MARINHO,"Passagens aéreas, aquáticas e terrestres nacio...",22.052.777/0001-32,Exceller Tour,2021-12-27,2515.56,2173239


# Can you see which senator declared the most expenses?

In [20]:
# Count how many expense records each senator declared.
# A single groupby(...).size() replaces the loop that filtered the whole dataset
# once per senator and grew the frame with pd.concat. sort=False keeps the
# senators in order of first appearance (the order .unique() produced), and
# reset_index gives every row a distinct 0..n-1 label instead of the repeated 0
# the concat loop left behind. The count column is now an integer dtype.
quantity_of_expensive = (
    dataset.groupby('SENADOR', sort=False)
    .size()
    .reset_index(name='QUANTIDADE_GASTOS')
)
quantity_of_expensive

Unnamed: 0,SENADOR,QUANTIDADE_GASTOS
0,ACIR GURGACZ,557
0,AÉCIO NEVES,15
0,ALESSANDRO VIEIRA,703
0,ALOYSIO NUNES FERREIRA,22
0,ALVARO DIAS,106
...,...,...
0,JOSÉ ANÍBAL,89
0,MARIA ELIZA DE AGUIAR E SILVA,26
0,NAILDE PANTA,1
0,NILDA GONDIM,155


In [24]:
# Scatter trace: one marker per row of quantity_of_expensive, senator name on
# the x axis and number of declared expenses on the y axis.
trace = go.Scatter(
    x=quantity_of_expensive['SENADOR'],
    y=quantity_of_expensive['QUANTIDADE_GASTOS'],
    mode='markers',
    marker=dict(
        color='#e74c3c',
        line=dict(width=1, color='#c0392b'),
    ),
    opacity=0.8,
)
# Chart title and axis captions.
layout = go.Layout(
    title='Quantidade de gastos declarados por Senador',
    xaxis=dict(title='Nome Senador'),
    yaxis=dict(title='Quantidade de gastos'),
)
# The figure takes its traces as a list, even when there is only one.
data = [trace]
fig = go.Figure(data=data, layout=layout)
# Render the figure as the cell output.
py.iplot(fig)

## Generating a box plot for the 4 senators who spent the most

In [31]:
def _reimbursement_box(senator, color):
    """Return a box trace of the ``VALOR_REEMBOLSADO`` values of one senator.

    Args:
        senator: Value matched against the ``SENADOR`` column of ``dataset``;
            also used as the trace name.
        color: Marker colour for the box.
    """
    return go.Box(
        y=dataset.loc[dataset['SENADOR'] == senator, 'VALOR_REEMBOLSADO'],
        name=senator,
        marker={'color': color},
    )


# One box per senator. The four names are hard-coded here, not derived from
# the dataset.
trace1 = _reimbursement_box('CARLOS VIANA', '#f39c12')
trace2 = _reimbursement_box('WELLINGTON FAGUNDES', '#e67e22')
trace3 = _reimbursement_box('JAQUES WAGNER', '#d35400')
trace4 = _reimbursement_box('PAULO PAIM', '#e74c3c')
data = [trace1, trace2, trace3, trace4]
layout = go.Layout(
    # The title font is nested under ``title``: the flat ``titlefont`` layout
    # attribute is deprecated in favour of ``title.font`` and is no longer
    # accepted by recent plotly releases.
    title={
        'text': 'Dispersão de valores reembolsado para 4 senadores que mais reembolsaram',
        'font': {'family': 'Arial', 'size': 22, 'color': '#7f7f7f'},
    },
    xaxis={'title': 'Senadores'},
    yaxis={'title': 'Valor de reembolso'},
    paper_bgcolor='rgb(243, 243, 243)',
    plot_bgcolor='rgb(243, 243, 243)',
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

# What percentage of the quota did the senators spend?

In [92]:
# Reference value that each senator's yearly total is compared against.
average_annual_quota = 483073.99

# Total reimbursed per senator and year.
# The grouped sum is reindexed onto the full senator x year grid so that a
# senator with no records in a given year still gets a row with 0.0, as the
# previous nested loop produced. Building the grid from .unique() keeps both
# senators and years in order of first appearance.
# This replaces the concat-in-a-loop version, which rescanned the dataset for
# every senator, masked the per-senator slice with a boolean Series built from
# the full dataset, and left every output row with index label 0.
senator_year_grid = pd.MultiIndex.from_product(
    [dataset['SENADOR'].unique(), dataset['ANO'].unique()],
    names=['SENADOR', 'ANO'],
)
total_refund_per_year = (
    dataset.groupby(['SENADOR', 'ANO'])['VALOR_REEMBOLSADO']
    .sum()
    .reindex(senator_year_grid, fill_value=0.0)
    .reset_index(name='TOTAL_REEMBOLSO')
)
total_refund_per_year

Unnamed: 0,SENADOR,ANO,TOTAL_REEMBOLSO
0,ACIR GURGACZ,2019,251714.02
0,ACIR GURGACZ,2020,415364.91
0,ACIR GURGACZ,2021,415374.0
0,AÉCIO NEVES,2019,19739.01
0,AÉCIO NEVES,2020,0.0
...,...,...,...
0,NILDA GONDIM,2020,0.0
0,NILDA GONDIM,2021,206458.96
0,VIRGINIO DE CARVALHO,2019,0.0
0,VIRGINIO DE CARVALHO,2020,0.0


In [95]:
def _refund_bar(year, color=None):
    """Return a bar trace of every senator's ``TOTAL_REEMBOLSO`` for one year.

    x and y are taken from the same filtered rows of ``total_refund_per_year``,
    so each bar is paired with its own senator rather than relying on the
    position of names in a separate ``.unique()`` array.

    Args:
        year: Value matched against the ``ANO`` column.
        color: Optional marker colour; when omitted no marker colour is set.
    """
    rows = total_refund_per_year.loc[total_refund_per_year['ANO'] == year]
    return go.Bar(
        x=rows['SENADOR'],
        y=rows['TOTAL_REEMBOLSO'],
        name=f'Ano {year}',
        marker=None if color is None else {'color': color},
    )


trace1 = _refund_bar(2019, '#feca57')
trace2 = _refund_bar(2020, '#ff9f43')
# No marker colour is set for 2021.
trace3 = _refund_bar(2021)

# Constant reference line at the average annual quota.
# np.full builds a float array; filling an np.arange (integer) array with the
# quota truncated 483073.99 to 483073.
quota_array = np.full(len(total_refund_per_year['SENADOR']), average_annual_quota)
trace4 = go.Scatter(
    x=total_refund_per_year['SENADOR'],
    y=quota_array,
    mode='lines',
    name='Média da cota anual',
    line={
        'color': '#341f97',
        'dash': 'dot',
    },
)
data = [trace1, trace2, trace3, trace4]
py.iplot(data)