## Task 1: **Basic Industry Analysis**

Analyze the data at the industry level and examine its year-over-year trends.

In [0]:
import os
import re
import sys
from collections import defaultdict
import pandas as pd
import numpy as np
from scipy.stats import norm
import matplotlib.pyplot as plt

In [3]:
# Mount Google Drive into the Colab runtime at /content/drive. Running this
# cell triggers an interactive authorization prompt; the project paths
# configured further down live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
def read_data(root_dir, data_folder, csv_file):
    """Load one CSV file stored at ``root_dir/data_folder/csv_file``.

    A progress line naming the file is printed before reading.

    Args:
        root_dir: Base directory of the project data.
        data_folder: Sub-folder of ``root_dir`` that holds the CSV files.
        csv_file: File name of the CSV to load.

    Returns:
        The parsed file as a pandas DataFrame (``pd.read_csv`` defaults).
    """
    print("\nReading data from " + csv_file)
    return pd.read_csv(os.path.join(root_dir, data_folder, csv_file))

In [0]:
def compute_corelation(columns, src_dataframe):
    """Return the correlation-coefficient matrix of the selected columns.

    Args:
        columns: Column labels to select from ``src_dataframe``.
        src_dataframe: DataFrame holding the values.

    Returns:
        The array produced by ``np.corrcoef`` with ``rowvar=False``, i.e.
        every selected column is one variable and every row one observation.
    """
    print("Computing Correlation matrix")
    selected_values = src_dataframe[columns].to_numpy()
    return np.corrcoef(selected_values, rowvar = False)

In [6]:
# ---------------------------------------------------------------------------
# Basic environment parameters
# ---------------------------------------------------------------------------
# Earlier variant that derived the root from this file's location, kept for
# reference:
# root_dir = "\\".join(os.path.dirname(__file__).split('\\')[:-1])
root_dir = '/content/drive/My Drive/Penn Inequality Project'
data_folder = "reduced_paynet_data"
task_folder = "task1"
print('----------Analysis Starts----------\n')

data_dir = os.path.join(root_dir, data_folder)

# Placeholders for the data frames; the data-import cell assigns them.
all_df = all_annual_df = all_annual_plus_df = all_data_df = cash_df = None

# ---------------------------------------------------------------------------
# Adjustable parameters
# ---------------------------------------------------------------------------
# Each analysis cell runs only while its flag is False; set a flag to True to
# skip that analysis.
total_analyzed = year_analyzed = sector_analyzed = sector_annual_analyzed = False

----------Analysis Starts----------



In [0]:
# Data import: read the reclassified data set once, then derive the working
# frames consumed by the analysis cells.
all_df = pd.read_csv(
    os.path.join(root_dir, 'reclassified_all_data.csv')
)

# Complete-case frame: remove the listed columns, then drop every row that
# still contains a missing value.
all_data_df = all_df.drop(
    columns = [
        'Benefit Values',
        'Fixed Annual Remuneration',
        'Total Earnings',
        'Long Term Incentive Values',
        'Short Term Variable Payments',
        'Target Incentive Payment (%)',
    ]
).dropna()

# Narrow frames: year, industry and a single metric, keeping only the rows in
# which all three are present.
all_annual_df = all_df.loc[:, ['CalendarYear', 'IndustryName', 'Total Annual Remuneration']].dropna()
all_annual_plus_df = all_df.loc[:, ['CalendarYear', 'IndustryName', 'Total Remuneration Plus']].dropna()
cash_df = all_df.loc[:, ['CalendarYear', 'IndustryName', 'Total Cash']].dropna()

In [11]:
# Total analysis: correlation matrix of the compensation columns plus a log10
# histogram (with a fitted normal curve) for three metrics over the whole data
# set. Skipped when total_analyzed is True.
if not total_analyzed:
    # Everything below is written under root_dir/task_folder. Create it up
    # front: neither np.save nor plt.savefig creates missing directories.
    output_dir = os.path.join(root_dir, task_folder)
    os.makedirs(output_dir, exist_ok=True)

    # correlation analysis
    all_columns = ['Base Salary', 'Total Annual Remuneration', 'Total Cash', 'Total Direct Compensation', 'Total Remuneration Plus']
    corelation_matrix = compute_corelation(all_columns, all_data_df)
    print(corelation_matrix)
    np.save(os.path.join(output_dir, 'cor_mat.npy'), corelation_matrix)

    # integrity analysis: (metric column, frame it is read from).
    # NOTE(review): the two remuneration metrics are read from all_data_df
    # (rows complete in every kept column) while the per-year and per-sector
    # cells read them from all_annual_df / all_annual_plus_df. Kept as is --
    # confirm this difference is intended.
    distribution_sources = (
        ('Total Annual Remuneration', all_data_df),
        ('Total Remuneration Plus', all_data_df),
        ('Total Cash', cash_df),
    )

    for metric, metric_df in distribution_sources:
        plt.title(metric + " for all integrated data")
        plt.ylabel("Percentage per range")
        plt.xlabel('Range in logarithmic dollar')

        # NOTE(review): assumes every amount is strictly positive; log10 of
        # zero or a negative value gives -inf/nan and the histogram call
        # would fail -- confirm against the upstream cleaning.
        log_values = np.log10(metric_df[metric].to_numpy(dtype=float))
        log_mean = np.mean(log_values)
        log_std = np.std(log_values)

        # density=True normalises the histogram so the fitted normal density
        # can be drawn on the same axis.
        _, bins, _ = plt.hist(log_values, bins=1000, density=True)
        fitted_density = norm.pdf(bins, log_mean, log_std)
        plt.plot(bins, fitted_density, label = "Mean: " + str(log_mean) + " , Std: " + str(log_std))

        plt.xlim(left = 0, right = 10)
        plt.grid(True)
        plt.legend()
        plt.savefig(os.path.join(output_dir, "all_data_" + metric))
        # Reset the shared pyplot figure before the next metric is drawn.
        plt.clf()

Computing Correlation matrix
[[1.         0.9184407  0.91587359 0.69832747 0.73210701]
 [0.9184407  1.         0.99241205 0.83236358 0.86259628]
 [0.91587359 0.99241205 1.         0.83807289 0.85956991]
 [0.69832747 0.83236358 0.83807289 1.         0.99624224]
 [0.73210701 0.86259628 0.85956991 0.99624224 1.        ]]


<Figure size 432x288 with 0 Axes>

In [13]:
# Year-level analysis: for each calendar year and each metric, save a log10
# histogram with a fitted normal curve, then plot the yearly means as a trend.
# Skipped when year_analyzed is True.
if not year_analyzed:
    # Cover every year between the first and last one seen, including years
    # for which no rows exist.
    all_year = pd.unique(all_data_df['CalendarYear']).tolist()
    all_year = range(min(all_year), max(all_year) + 1)
    year_mean = defaultdict(list)

    # (metric column, frame it is read from)
    yearly_sources = (
        ('Total Annual Remuneration', all_annual_df),
        ('Total Remuneration Plus', all_annual_plus_df),
        ('Total Cash', cash_df),
    )

    for curr_year in all_year:
        # Create the year folder for every year, even one without data: the
        # per-industry/per-year cell writes its sub-folders underneath it.
        # makedirs also creates root_dir/task_folder when it is missing.
        year_dir = os.path.join(root_dir, task_folder, str(curr_year))
        os.makedirs(year_dir, exist_ok=True)

        for metric, metric_df in yearly_sources:
            year_data_df = metric_df[metric_df['CalendarYear'] == curr_year]
            print(str(year_data_df.shape[0]) + ' data pieces chosen for year ' + str(curr_year) + ' on ' + metric)

            if year_data_df.shape[0] == 0:
                # No rows for this year. Record NaN rather than 0 so the
                # trend plot shows a gap instead of a fake drop to $0, and
                # apply the same rule to all three metrics (previously the
                # 'Total Cash' section had no empty-year guard at all).
                year_mean[metric].append(np.nan)
                continue

            values = year_data_df[metric].to_numpy(dtype=float)
            year_mean[metric].append(np.mean(values))

            plt.title(metric + " for yearly integrated data")
            plt.ylabel("Percentage per range")
            plt.xlabel('Range in logarithmic dollar')

            # NOTE(review): assumes strictly positive amounts; log10 of zero
            # or negatives gives -inf/nan and the histogram call would fail.
            log_values = np.log10(values)
            log_mean = np.mean(log_values)
            log_std = np.std(log_values)
            _, bins, _ = plt.hist(log_values, bins=1000, density=True)
            fitted_density = norm.pdf(bins, log_mean, log_std)
            plt.plot(bins, fitted_density, label = "Mean: " + str(log_mean) + " , Std: " + str(log_std))

            plt.xlim(left = 0, right = 10)
            plt.grid(True)
            plt.legend()
            plt.savefig(os.path.join(year_dir, "year_data_" + metric))
            # Reset the shared pyplot figure before the next plot.
            plt.clf()

        print(str(curr_year) + ' year analysis finished\n')

    # Trend of the yearly means (linear dollars) for the three metrics.
    plt.title('Total year trend')
    plt.xlabel('Year span')
    plt.ylabel('Range in linear dollar')
    plt.xlim(min(all_year) - 1, max(all_year) + 1)
    for metric, _ in yearly_sources:
        # Markers keep a year visible even when both neighbours are NaN gaps.
        plt.plot(all_year, year_mean[metric], marker = 'o', label = metric)
    plt.legend()
    plt.savefig(os.path.join(root_dir, task_folder, 'Year Trend'))
    plt.clf()

436733 data pieces chosen for year 2008 on Total Annual Remuneration
436733 data pieces chosen for year 2008 on Total Remuneration Plus
800165 data pieces chosen for year 2008 on Total Cash
2008 year analysis finished

338409 data pieces chosen for year 2009 on Total Annual Remuneration
338409 data pieces chosen for year 2009 on Total Remuneration Plus
891611 data pieces chosen for year 2009 on Total Cash
2009 year analysis finished

371154 data pieces chosen for year 2010 on Total Annual Remuneration
371154 data pieces chosen for year 2010 on Total Remuneration Plus
991692 data pieces chosen for year 2010 on Total Cash
2010 year analysis finished

354197 data pieces chosen for year 2011 on Total Annual Remuneration
354197 data pieces chosen for year 2011 on Total Remuneration Plus
993702 data pieces chosen for year 2011 on Total Cash
2011 year analysis finished

0 data pieces chosen for year 2012 on Total Annual Remuneration
0 data pieces chosen for year 2012 on Total Remuneration Plu

<Figure size 432x288 with 0 Axes>

In [17]:
# Sector-level analysis: for each industry, save the correlation matrix of
# the compensation columns and a log10 histogram (with a fitted normal curve)
# for each metric. Skipped when sector_analyzed is True.
if not sector_analyzed:
    all_industry = pd.unique(cash_df['IndustryName']).tolist()

    # Loop-invariant, so defined once instead of once per industry.
    all_columns = ['Base Salary', 'Total Annual Remuneration', 'Total Cash', 'Total Direct Compensation', 'Total Remuneration Plus']

    # (metric column, frame it is read from)
    sector_sources = (
        ('Total Annual Remuneration', all_annual_df),
        ('Total Remuneration Plus', all_annual_plus_df),
        ('Total Cash', cash_df),
    )

    for curr_industry in all_industry:
        # Collapse whitespace runs so the name is usable as a folder name.
        industry = re.sub(r'\s+', ' ', curr_industry)

        # makedirs (unlike mkdir) also creates root_dir/task_folder when it
        # does not exist yet and is a no-op when the folder is already there.
        industry_dir = os.path.join(root_dir, task_folder, industry)
        os.makedirs(industry_dir, exist_ok=True)

        industry_data_df = all_data_df[all_data_df['IndustryName'] == curr_industry]
        if industry_data_df.shape[0] > 0:
            corelation_matrix = compute_corelation(all_columns, industry_data_df)
            print(corelation_matrix)
            np.save(os.path.join(industry_dir, 'cor_mat.npy'), corelation_matrix)

        for metric, metric_df in sector_sources:
            industry_metric_df = metric_df[metric_df['IndustryName'] == curr_industry]
            if industry_metric_df.shape[0] == 0:
                # Nothing to plot for this metric in this industry.
                continue

            print(str(industry_metric_df[metric].shape[0]) + ' data pieces chosen for ' + industry + ' on ' + metric)

            plt.title(metric + " for all data in " + industry)
            plt.ylabel("Percentage per range")
            plt.xlabel('Range in logarithmic dollar')

            # NOTE(review): assumes strictly positive amounts; log10 of zero
            # or negatives gives -inf/nan and the histogram call would fail.
            log_values = np.log10(industry_metric_df[metric].to_numpy(dtype=float))
            log_mean = np.mean(log_values)
            log_std = np.std(log_values)
            _, bins, _ = plt.hist(log_values, bins=1000, density=True)
            fitted_density = norm.pdf(bins, log_mean, log_std)
            plt.plot(bins, fitted_density, label = "Mean: " + str(log_mean) + " , Std: " + str(log_std))

            plt.xlim(left = 0, right = 10)
            plt.grid(True)
            plt.legend()
            plt.savefig(os.path.join(industry_dir, "all_data_" + metric))
            # Reset the shared pyplot figure before the next plot.
            plt.clf()

        # Industries are taken from cash_df, so each one has cash rows; the
        # message is therefore emitted once per industry, as before.
        print(curr_industry + ' sector analysis finished\n')

    

Computing Correlation matrix
[[1.         0.90395786 0.90291244 0.66753586 0.69395849]
 [0.90395786 1.         0.99283417 0.81362939 0.83725524]
 [0.90291244 0.99283417 1.         0.81297715 0.82938614]
 [0.66753586 0.81362939 0.81297715 1.         0.9973469 ]
 [0.69395849 0.83725524 0.82938614 0.9973469  1.        ]]
5051392 data pieces chosen for specialty retail on Total Annual Remuneration
5004378 data pieces chosen for specialty retail on Total Remuneration Plus
18032842 data pieces chosen for specialty retail on Total Cash
specialty retail sector analysis finished

Computing Correlation matrix
[[1.         0.8890719  0.89707538 0.77917896 0.79936239]
 [0.8890719  1.         0.99236901 0.89335251 0.91981227]
 [0.89707538 0.99236901 1.         0.89911944 0.91633806]
 [0.77917896 0.89335251 0.89911944 1.         0.99584021]
 [0.79936239 0.91981227 0.91633806 0.99584021 1.        ]]
69511 data pieces chosen for food product on Total Annual Remuneration
68866 data pieces chosen for fo

<Figure size 432x288 with 0 Axes>

In [18]:
# Sector x year analysis: for each industry and calendar year, save a log10
# histogram (with a fitted normal curve) per metric, then plot the industry's
# yearly means as a trend. Skipped when sector_annual_analyzed is True.
if not sector_annual_analyzed:
    all_industry = pd.unique(cash_df['IndustryName']).tolist()
    # Cover every year between the first and last one seen, including years
    # for which no rows exist.
    all_year = pd.unique(all_data_df['CalendarYear']).tolist()
    all_year = range(min(all_year), max(all_year) + 1)

    # (metric column, frame it is read from)
    sector_year_sources = (
        ('Total Annual Remuneration', all_annual_df),
        ('Total Remuneration Plus', all_annual_plus_df),
        ('Total Cash', cash_df),
    )

    for curr_industry in all_industry:
        year_mean = defaultdict(list)
        # Collapse whitespace runs so the name is usable as a folder name.
        industry = re.sub(r'\s+', ' ', curr_industry)

        # Restrict every source frame to this industry once, outside the
        # year loop.
        industry_frames = [
            (metric, metric_df[metric_df['IndustryName'] == curr_industry])
            for metric, metric_df in sector_year_sources
        ]

        for curr_year in all_year:
            # makedirs creates the whole <task>/<year>/<industry> chain, so
            # this cell no longer depends on the year-level cell having
            # created the <year> folder beforehand.
            year_industry_dir = os.path.join(root_dir, task_folder, str(curr_year), industry)
            os.makedirs(year_industry_dir, exist_ok=True)

            for metric, industry_metric_df in industry_frames:
                industry_year_df = industry_metric_df[industry_metric_df['CalendarYear'] == curr_year]

                if industry_year_df[metric].size == 0:
                    # No rows for this industry/year. Record NaN rather than
                    # 0 so the trend plot shows a gap instead of a fake drop
                    # to $0.
                    year_mean[metric].append(np.nan)
                    continue

                print(str(industry_year_df[metric].shape[0]) + ' data pieces chosen for ' + curr_industry + ' of year ' + str(curr_year) + ' on ' + metric)

                values = industry_year_df[metric].to_numpy(dtype=float)
                year_mean[metric].append(np.mean(values))

                plt.title(metric + " for all data in " + industry + " of year " + str(curr_year))
                plt.ylabel("Percentage per range")
                plt.xlabel('Range in logarithmic dollar')

                # NOTE(review): assumes strictly positive amounts; log10 of
                # zero or negatives gives -inf/nan and the histogram call
                # would fail.
                log_values = np.log10(values)
                log_mean = np.mean(log_values)
                log_std = np.std(log_values)
                _, bins, _ = plt.hist(log_values, bins=1000, density=True)
                fitted_density = norm.pdf(bins, log_mean, log_std)
                plt.plot(bins, fitted_density, label = "Mean: " + str(log_mean) + " , Std: " + str(log_std))

                plt.xlim(left = 0, right = 10)
                plt.grid(True)
                plt.legend()
                plt.savefig(os.path.join(year_industry_dir, "all_data_" + metric))
                # Reset the shared pyplot figure before the next plot.
                plt.clf()

        # The trend figure is stored in <task>/<industry>; create that folder
        # here so this cell also works when the sector-level cell was skipped.
        industry_dir = os.path.join(root_dir, task_folder, industry)
        os.makedirs(industry_dir, exist_ok=True)

        plt.title(curr_industry + ' year trend')
        plt.xlabel('Year span')
        plt.ylabel('Range in linear dollar')
        plt.xlim(min(all_year) - 1, max(all_year) + 1)
        for metric, _ in sector_year_sources:
            # Markers keep a year visible even when both neighbours are NaN.
            plt.plot(all_year, year_mean[metric], marker = 'o', label = metric)
        plt.legend()
        plt.savefig(os.path.join(industry_dir, 'Year Trend'))
        plt.clf()
        print(curr_industry + ' sector trend analysis finished\n')


print('\n\n-----------Analysis Ends-----------')

248967 data pieces chosen for specialty retail of year 2008 on Total Annual Remuneration
248967 data pieces chosen for specialty retail of year 2008 on Total Remuneration Plus
305307 data pieces chosen for specialty retail of year 2008 on Total Cash
131779 data pieces chosen for specialty retail of year 2009 on Total Annual Remuneration
131779 data pieces chosen for specialty retail of year 2009 on Total Remuneration Plus
317705 data pieces chosen for specialty retail of year 2009 on Total Cash
136813 data pieces chosen for specialty retail of year 2010 on Total Annual Remuneration
136813 data pieces chosen for specialty retail of year 2010 on Total Remuneration Plus
325976 data pieces chosen for specialty retail of year 2010 on Total Cash
93682 data pieces chosen for specialty retail of year 2011 on Total Annual Remuneration
93682 data pieces chosen for specialty retail of year 2011 on Total Remuneration Plus
326677 data pieces chosen for specialty retail of year 2011 on Total Cash
35

<Figure size 432x288 with 0 Axes>