In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [18]:
path = "base_rentismo_final_217.xlsx" # Nombre del archivo
data = pd.read_excel(path)

In [25]:
def filtrar_df(data, condicion):
    """
    Filtra un DataFrame según una condición dada y realiza cálculos adicionales.
    
    Args:
        data (pd.DataFrame): El DataFrame a procesar.
        condicion (str): La condición a aplicar como cadena. Por ejemplo, "merged_df['dummy_issue'] == 0".
    
    Returns:
        pd.DataFrame: El DataFrame con el resultado de los cálculos.
    """
    # Crear columna 'dummy_issue' basada en la condición de 'issue_count'
    data['dummy_issue'] = np.where(data['issue_count'] > 0, 1, 0)
    data = data.groupby('NAICS_Sector_Name').filter(lambda x: (x['dummy_issue'] ==1).any())
    data = data[data['NAICS_Sector_Name'] != 'Finance and Insurance']
    data = data[~data['NAICS_Sector_Name'].isin(["Finance and Insurance", "Real Estate and Rental and Leasing","Financial Services","Real Estate Management & Development","Insurance"])]
    #data = data[data['NAICS_Sector_Name'] != 'Transportation and Warehousing']
    data = data[data['Issuer'] != 'LATAM Airlines Group SA']
    # Extraer el año de 'Period_End_Date'
    data['Año'] = pd.to_datetime(data['Period_End_Date']).dt.year
    # Función para categorizar el periodo basado en el año
    def categorize_period(year):
        if 2013 <= year <= 2023:
            return '2013-2023'
        else:
            return 'Fuera de rango'
    """"    
    def categorize_period(year):
        if 2013 <= year <= 2019:
            return '2013-2019'
        elif 2020<= year <= 2023:
            return '2020-2023'
        else:
            return 'Fuera de rango'
    """
    # Crear la columna 'Period' categorizada
    data['Period'] = data['Año'].apply(categorize_period)
# Filtrar los sector 'Educational Services'

    # Agrupar por Sector y Año para calcular la mediana de ROA
    grouped = data.groupby(['NAICS_Sector_Name', 'Año']).agg(
        median_NIBT=('Net_Income_Before_Taxes', 'median')
    ).reset_index()
    
    # Agregar el periodo al DataFrame agrupado
    grouped['Period'] = grouped['Año'].apply(categorize_period)
    # Agrupar por Sector y Periodo para calcular el promedio de las medianas
    benchmark = grouped.groupby(['NAICS_Sector_Name', 'Period']).agg(
    avg_median_roa=('median_NIBT', 'mean')
    ).reset_index()
        # Merge del DataFrame original con el benchmark
    merged_df = pd.merge(data, benchmark, on=['NAICS_Sector_Name', 'Period'], how='inner')
        # Calcular las utilidades típicas y el excedente de la firma
    #merged_df['tipical_profits'] = merged_df['avg_median_roa']
    # Filtrar el DataFrame usando la condición proporcionada
    merged_df = merged_df.loc[eval(condicion)]
    #print(merged_df.to_markdown(index=False, numalign="left", stralign="left"))
    merged_df['avg_median_roa_y'] = merged_df['avg_median_roa_y']*(merged_df['Total_Assets,_Reported']/np.log(merged_df['Total_Assets,_Reported']))
    merged_df['Net_Income_Before_Taxes'] = merged_df['Net_Income_Before_Taxes']/np.log(merged_df['Total_Assets,_Reported'])
    # Agrupar por Periodo para calcular el numerador y denominador del excedente
    surplus = merged_df.groupby(['NAICS_Sector_Name','Period']).agg(
        sum_tipical=('avg_median_roa_y', 'sum'),
        sum_net=('Net_Income_Before_Taxes', 'sum')
    ).reset_index()
    return surplus
    

In [27]:




#surplus1 =filtrar_df(data, "merged_df['dummy_issue'] == 0")
#surplus2 =filtrar_df(data, "merged_df['dummy_issue'] == 1")

surplus1 =filtrar_df(data, "merged_df['decil'] < 10")
surplus2 =filtrar_df(data, "merged_df['decil'] == 10")


merge_surplus = pd.merge(surplus1,surplus2, on=['NAICS_Sector_Name', 'Period'], how='inner')

merge_surplus['total_net'] = merge_surplus['sum_net_x'] +  merge_surplus['sum_net_y']
merge_surplus['total_tipical'] = merge_surplus['sum_tipical_x'] +  merge_surplus['sum_tipical_y']

merge_surplus['proporcion_net_x'] = merge_surplus['sum_net_x'] / merge_surplus.groupby(['Period', 'NAICS_Sector_Name'])['total_net'].transform('sum')
merge_surplus['proporcion_net_y'] = merge_surplus['sum_net_y'] / merge_surplus.groupby(['Period', 'NAICS_Sector_Name'])['total_net'].transform('sum')

merge_surplus['proporcion_tipical_x'] = merge_surplus['sum_tipical_x'] / merge_surplus.groupby(['Period', 'NAICS_Sector_Name'])['total_tipical'].transform('sum')
merge_surplus['proporcion_tipical_y'] = merge_surplus['sum_tipical_y'] / merge_surplus.groupby(['Period', 'NAICS_Sector_Name'])['total_tipical'].transform('sum')

merge_surplus = merge_surplus.drop(columns=['total_net', 'total_tipical','sum_net_x','sum_net_y','sum_tipical_x','sum_tipical_y'])
# Lista de las variables a multiplicar
columnas_a_multiplicar = ['proporcion_net_x', 'proporcion_net_y', 'proporcion_tipical_x', 'proporcion_tipical_y']
# Multiplicar por 100
merge_surplus[columnas_a_multiplicar] = merge_surplus[columnas_a_multiplicar] * 100
merge_surplus= merge_surplus.sort_values(by='Period')
latex_code = merge_surplus.to_latex(index=False, escape=False,float_format="%.2f")
print(latex_code)

\begin{tabular}{llrrrr}
\toprule
NAICS_Sector_Name & Period & proporcion_net_x & proporcion_net_y & proporcion_tipical_x & proporcion_tipical_y \\
\midrule
Information & 2013-2023 & 22.63 & 77.37 & 17.14 & 82.86 \\
Manufacturing & 2013-2023 & 46.14 & 53.86 & 55.55 & 44.45 \\
Mining, Quarrying, and Oil and Gas Extraction & 2013-2023 & 15.27 & 84.73 & 10.90 & 89.10 \\
Retail Trade & 2013-2023 & 3.74 & 96.26 & 11.11 & 88.89 \\
Transportation and Warehousing & 2013-2023 & 18.29 & 81.71 & 41.44 & 58.56 \\
Utilities & 2013-2023 & 23.62 & 76.38 & 22.38 & 77.62 \\
\bottomrule
\end{tabular}



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['avg_median_roa_y'] = merged_df['avg_median_roa_y']*(merged_df['Total_Assets,_Reported']/np.log(merged_df['Total_Assets,_Reported']))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['Net_Income_Before_Taxes'] = merged_df['Net_Income_Before_Taxes']/np.log(merged_df['Total_Assets,_Reported'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable