In [245]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

from plotly.subplots import make_subplots

In [2]:
import warnings
warnings.filterwarnings('ignore')

### Define functions

In [292]:
def get_program_dict():
  program = pd.read_csv('program_data.csv')
  oktmo_str = program['oktmo'].unique()
  new_col = []
  for oktmo in program['oktmo']:
    try:
      oktmo_in = int(oktmo.replace(' ', ''))
      new_col.append(oktmo_in)
    except:
      new_col.append(-1)
  program['oktmo'] = new_col
  oktmo_in = list(set(new_col))
  oktmo_in.remove(-1)
  oktmo_region = {}
  for oktmo in oktmo_in:
    select_oktmo = program[program['oktmo'] == oktmo]
    oktmo_region[oktmo] = (select_oktmo['Регион'].iloc[0]).replace(' ', '')
  return oktmo_region

In [285]:
def plot_description(df, year):
    for reg in df['region_name'].unique():
        fig = make_subplots(rows=1, cols=2, subplot_titles=['не участники', 'участники'], shared_yaxes=True)
        for in_pr_par in [False, True]:
            segm = df[(df['region_name'] == reg) & (df['in_program'] == in_pr_par)]
            col=1
            if in_pr_par:
                col = 2
            fig.add_trace(go.Box(y=segm['value'], name = year, marker=dict(color='royalblue')), row=1, col=col)
            fig.update_layout(title=reg + ', ' + year, showlegend=False)
            fig.update_layout({'plot_bgcolor': 'rgba(0, 0, 0, 0)',
                                'paper_bgcolor': 'rgba(0, 0, 0, 0)',
                                })
            
            fig.update_xaxes(showline=True, linewidth=2, linecolor='black')
            fig.update_yaxes(showline=True, linewidth=2, linecolor='black', gridcolor='grey')
        fig.show()

In [333]:
def plot_diff(region, df):
    reg_df = df[df['region_name'] == region]
    value_2014_in = reg_df[(reg_df['in_program'] == True) & (reg_df['year'] == 2014)]['value'].median()
    value_2018_in = reg_df[(reg_df['in_program'] == True) & (reg_df['year'] == 2018)]['value'].median()
    value_2014_out = reg_df[(reg_df['in_program'] == False) & (reg_df['year'] == 2014)]['value'].median()
    value_2018_out = reg_df[(reg_df['in_program'] == False) & (reg_df['year'] == 2018)]['value'].median()

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=['2014', '2018'], y=[value_2014_out, value_2018_out], name='не участники',
                             line=dict(color='purple', width=4)))
    fig.add_trace(go.Scatter(x=['2014', '2018'], y=[value_2014_in, value_2018_in], name='участники',
                             line=dict(color='royalblue', width=4)))        
    fig.add_trace(go.Scatter(x=['2014', '2018'] , y=[value_2014_in, value_2014_in + (value_2018_out - value_2014_out)],
                             name='предположение',
                             line=dict(color='royalblue', width=4,
                                  dash='dash')))#dash options include 'dash', 'dot', and 'dashdot'
    fig.update_layout({
    'plot_bgcolor': 'rgba(0, 0, 0, 0)',
    'paper_bgcolor': 'rgba(0, 0, 0, 0)',
    })

    fig.update_layout(
        title=region,
        xaxis_title="год",
        yaxis_title="Население, человек"
    )

    fig.update_xaxes(showline=True, linewidth=2, linecolor='black', gridcolor='grey', zerolinecolor='grey')
    fig.update_yaxes(showline=True, linewidth=2, linecolor='black', gridcolor='grey', zerolinecolor='grey')

    fig.show()

### Read & Format

In [277]:
population = pd.read_excel('fd_8112027.xlsx')
pop14 = pd.read_csv('population_final.csv')
popl17 = pd.read_csv('pop18cleaner.csv')

In [278]:
pop14 = pop14[:3092]
popl17 = popl17[:2283]
pop14 = pop14.fillna(False)
in_program_col = list()
for oktmo in pop14['октмо']:
    in_program = False
    if oktmo != False:
        in_program=True
    in_program_col.append(in_program)
pop14['in_program'] = in_program_col
#pop13 = pop13[['регион', '2013', '2014', 'in_program']]
pop14 = pop14.drop(columns=['название ', '2013   ', 'октмо'])
pop14 = pop14.rename({'регион': 'region_name', '2014   ': 'value'},
                     axis=1)
pop14 = pop14[pop14['region_name'] != 'Воронежская область']

In [279]:
popl17 = popl17.fillna(False)
in_program_col = list()
for oktmo in popl17['октмо']:
    in_program = False
    if oktmo != False:
        in_program=True
    in_program_col.append(in_program)
popl17['in_program'] = in_program_col
#pop13 = pop13[['регион', '2013', '2014', 'in_program']]
popl17 = popl17.drop(columns=['МО', '2017', 'октмо'])
popl17 = popl17.rename({'регион': 'region_name', '2018': 'value',}, axis=1)

In [280]:
pop14['year'] = [2014]*len(pop14)
popl17['year'] = [2018]*len(popl17)
population = pd.concat([pop14, popl17])
population

Unnamed: 0,region_name,value,in_program,year
0,Архангельская область,1186.0,False,2014
1,Архангельская область,1642.0,False,2014
2,Архангельская область,667.0,False,2014
3,Архангельская область,659.0,False,2014
4,Архангельская область,681.0,False,2014
...,...,...,...,...
2278,Красноярский край,266.0,False,2018
2279,Красноярский край,87.0,False,2018
2280,Красноярский край,3292.0,False,2018
2281,Красноярский край,2914.0,False,2018


In [None]:
population.to_csv('population.csv')

### Descriptive Statisctics

#### Variance

#### Plots

In [286]:
plot_description(popl17, '2018')

In [287]:
plot_description(pop14, '2014')

### Difference in Differences Plots

In [328]:
# all - 1/7
# noth - 3/7
# partly - 3/7
worked = [1/7, 3/7, 3/7]

res = pd.DataFrame({'type': ['обе гипотезы', 'одна гипотеза', 'ни однa'], 'count': worked})
fig = px.pie(res,values='count', names='type', color='type',
             color_discrete_map={'обе гипотезы': 'limegreen', 'одна гипотеза': 'yellow', 'ни однa': 'coral'}, hole=.3)
fig.show()

In [334]:
for region in population['region_name'].unique():
    plot_diff(region, population)

In [299]:
def read_astr(name):
    df = pd.read_csv(name)
    df = df[(df['region_name'] == 'Астраханская область') & (df['god'].isin([2014, 2018]))]
    df = df[['region_name', 'oktmo']]
    df['oktmo'] = pd.to_numeric(df['oktmo']).astype(int)
    df['in_program'] = [(oktmo in program_data.keys()) for oktmo in df_inp['oktmo']]
    return df

In [300]:
program_data = get_program_dict()

In [302]:
sport = read_astr('fd_8003001.xlsx')
roads = read_astr('fd_8006005')
hosp = read_astr('fd_8018100')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8f in position 22: invalid start byte