In [None]:
pip install xlrd

In [None]:
pip install researchpy

In [None]:
pip install openpyxl

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing
import plotly.express as px
import datetime
import calendar
import researchpy as rp

# The input data files live in the read-only "../input/" directory.
# Walk /kaggle/input recursively and print the full path of every file found.
import os
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# Economic activity labels for statistical reports.
# Maps each FAMILIA code found in the sales extract to its descriptive name.
sae_dict = {
    'A': 'AGRICULTURA',
    'B': 'MINAS Y CANTERAS',
    'C': 'MANUFACTURERAS',
    'D': 'SUMINISTRO DE ELECTRICIDAD GAS Y VAPOR',
    'E': 'AGUA POTABLE',
    'F': 'CONSTRUCCIÓN',
    'G': 'COMERCIO',
    'H': 'TRANSPORTE',
    'I': 'TURISMO',
    'J': 'INFORMACIÓN Y COMUNICACIÓN',
    'K': 'FINANCIERAS Y SEGUROS',
    'L': 'INMOBILIARIAS',
    'M': 'ACTIVIDADES PROFESIONALES',
    'N': 'SERVICIOS ADMINISTRATIVOS',
    'O': 'ADMINISTRACIÓN PÚBLICA',
    'P': 'ENSEÑANZAS',
    'Q': 'ACTIVIDADES DE SALUD',
    'R': 'ARTES',
    'S': 'SERVICIOS',
    'T': 'HOGARES',
    'U': 'EXTRATERRITORIALES',
    'V': 'SIN ACTIVIDAD ECONÓMICA - CIUU',
    'W': 'RLDP PRIVADO',
    'X': 'RLDP PÚBLICO',
    '9': 'VERIFICAR',
    'NO TIENE': 'NO TIENE',
}

In [None]:
# Number of economic-activity families defined in the mapping.
len(sae_dict)

In [None]:
# Load the 'CIIU 6C' sheet of the CIIU cross-reference workbook.
df_cods = pd.read_excel("../input/sri-saiku/CRUCES CIIU.xls", sheet_name='CIIU 6C')

# Preview the rows whose CODIGO is a single character.
is_single_char_code = df_cods['CODIGO'].str.len() == 1
df_cods.loc[is_single_char_code].head()

In [None]:
# Load the 2018-2020 sales extract and map FAMILIA codes to their
# descriptive names through sae_dict.
df = pd.read_csv("../input/sri-saiku/saiku ventas totales 2018 - 2020.csv").replace({'FAMILIA': sae_dict})

# Build FECHA as a pandas datetime holding the last calendar day of each
# fiscal year/month pair. The date is assembled column-wise (first day of the
# month) and then rolled forward to the month end, which avoids a slow
# row-by-row Python apply.
df["FECHA"] = pd.to_datetime(
    pd.DataFrame({'year': df["ANIO FISCAL"],
                  'month': df["MES FISCAL"],
                  'day': 1})
) + pd.offsets.MonthEnd(0)

# Shorten the sales column names so they can be used as identifiers in eval().
df = df.rename(columns={'VENTAS LOCALES 12% (411)': 'Ventas_12pct',
                        'VENTAS LOCALES 0% (413)': 'Ventas_0pct',
                        'EXPORTACIONES DE BIENES (417)': 'Exp_bienes',
                        'EXPORTACIONES DE SERVICIOS (418)': 'Exp_servicios'})

# Total sales = local sales (12% and 0%) plus exports of goods and services.
df = df.eval("VENTAS_TOTALES = Ventas_12pct + Ventas_0pct + Exp_bienes + Exp_servicios")

In [None]:
# Preview the first rows of the sales frame, including the derived
# FECHA and VENTAS_TOTALES columns.
df.head()

In [None]:
# Aggregate the sales frame per (FECHA, FAMILIA) pair by summing the remaining
# columns. groupby does not modify df, so no defensive copy is needed, and
# FAMILIA was already mapped through sae_dict when df was loaded, so mapping
# it a second time would be a no-op.
dfg_family = df.groupby(by=['FECHA','FAMILIA']).sum().reset_index()
dfg_family.head()

In [None]:
# Distinct FAMILIA values present in the data after mapping through sae_dict.
df['FAMILIA'].unique()

In [None]:
# Run an independent-samples t-test (2019 vs 2020 total sales) for every
# family except 'NO TIENE', collecting the family name, the descriptive
# summary table and the test results table in three parallel lists.
index_list, summary_list, results_list = [], [], []

is_2019 = df['ANIO FISCAL'] == 2019
is_2020 = df['ANIO FISCAL'] == 2020

for family in sae_dict.values():
    if family == 'NO TIENE':
        continue

    in_family = df.FAMILIA == family
    sales_2019 = df.loc[is_2019 & in_family, 'VENTAS_TOTALES']
    sales_2020 = df.loc[is_2020 & in_family, 'VENTAS_TOTALES']

    family_summary, family_results = rp.ttest(group1=sales_2019, group1_name="Grupo 2019",
                                              group2=sales_2020, group2_name="Grupo 2020")

    index_list.append(family)
    summary_list.append(family_summary)
    results_list.append(family_results)

In [None]:
# Inspect the t-test results table of the first family that was tested.
results_list[0]

In [None]:
# Mean of the first summary row minus mean of the second one
# (g2019 - g2020) for the first family tested:
#   result > 0  ->  the 2019 mean is larger than the 2020 mean
#   result < 0  ->  the 2019 mean is smaller than the 2020 mean
first_summary = summary_list[0]
first_summary.iloc[0].Mean - first_summary.iloc[1].Mean

In [None]:
# Column labels for the first six entries of each t-test results table.
cols = ['(Grupo 2019 - Grupo 2020)', 'DOF', 't_val', 'Ts_p_value', 'Diff_<_0_p_value', 'Diff_>_0_p_value']

# One single-row frame per family holding the first six values of its
# 'results' column, then stacked into one table indexed by family name.
single_row_frames = [
    pd.DataFrame([family_results['results'].iloc[:6].to_list()], columns=cols)
    for family_results in results_list
]
result_table = pd.concat(single_row_frames, keys=index_list).droplevel(1)

def hipotesisRes(diff,alpha,ts_p,low_p,great_p):
    """Decide whether the null hypothesis is rejected by a one-sided test.

    The direction of the test is chosen from the sign of the observed
    difference: a positive difference is checked against the
    "difference > 0" p-value, a negative one against the "difference < 0"
    p-value.

    Parameters
    ----------
    diff : float
        Observed difference between the two group means.
    alpha : float
        Significance level.
    ts_p : float
        Two-sided p-value. Not used in the decision; kept so existing
        callers do not need to change.
    low_p : float
        p-value of the one-sided test "difference < 0".
    great_p : float
        p-value of the one-sided test "difference > 0".

    Returns
    -------
    bool
        True when the relevant one-sided p-value is below ``alpha``.
        False otherwise, including when ``diff`` is exactly zero or NaN
        (no direction can be tested, so the null hypothesis is not rejected).
    """
    if diff > 0:
        return bool(great_p < alpha)
    if diff < 0:
        return bool(low_p < alpha)
    # diff == 0 or NaN: previously fell through and returned None, which
    # left a non-boolean value in the resulting column.
    return False

def _reject_null_for_row(row):
    """Apply hipotesisRes to one row of result_table at a 0.05 significance level."""
    return hipotesisRes(row['(Grupo 2019 - Grupo 2020)'],
                        0.05,
                        row['Ts_p_value'],
                        row['Diff_<_0_p_value'],
                        row['Diff_>_0_p_value'])

# Flag, family by family, whether the null hypothesis is rejected.
result_table['Ho Rejected'] = result_table.apply(_reject_null_for_row, axis=1)
result_table

In [None]:
# Inspect the descriptive summary table of the first family that was tested.
summary_list[0]

In [None]:
# Descriptive columns kept from each t-test summary table.
descg_cols = ['Variable', 'N', 'Mean', 'SD', 'SE']

# Keep the first five columns of every family's summary, stack them with the
# family name as key, and expose that key as a regular 'index' column.
summary_frames = [
    pd.DataFrame(family_summary[family_summary.columns[:5]], columns=descg_cols)
    for family_summary in summary_list
]
desc_data = pd.concat(summary_frames, keys=index_list).droplevel(1).reset_index()

# Drop the 'combined' rows so only the per-group rows remain.
desc_data = desc_data.loc[desc_data['Variable'] != 'combined']

In [None]:
# Export the per-family descriptive statistics to an Excel workbook
# in the current working directory.
desc_data.to_excel('Descriptive data.xlsx')

In [None]:
# 'Ho Rejected' only says that the one-sided test was significant; it is True
# for significant drops and for significant rises alike. The direction comes
# from the sign of the mean difference (2019 - 2020):
#   difference > 0 -> 2020 sales are lower  -> decreasing family
#   difference < 0 -> 2020 sales are higher -> growing family
# Families whose test is not significant belong to neither list.
ho_rejected = result_table['Ho Rejected'] == True
mean_diff = result_table['(Grupo 2019 - Grupo 2020)']

familias_decrecientes = result_table[ho_rejected & (mean_diff > 0)].index.to_list()
familias_crecientes = result_table[ho_rejected & (mean_diff < 0)].index.to_list()
desc_data[desc_data['index'].isin(familias_decrecientes)]

In [None]:
import plotly.express as px

# Grouped bar chart of the group means for the families listed in
# familias_decrecientes, one bar colour per 'Variable' value.
selected_rows = desc_data[desc_data['index'].isin(familias_decrecientes)]
fig = px.bar(selected_rows, x="index", y="Mean", color='Variable', barmode='group')
fig.show()

In [None]:
# Grouped bar chart of the group means for the families listed in
# familias_crecientes, one bar colour per 'Variable' value.
selected_rows = desc_data[desc_data['index'].isin(familias_crecientes)]
fig = px.bar(selected_rows, x="index", y="Mean", color='Variable', barmode='group')
fig.show()