In [1]:
from src.core import DataProcessor

In [2]:
# --- Output naming ---------------------------------------------------------
# Target folder plus extension-less base names; later cells append ".html" /
# ".csv" to the base names when saving.
results_folder = "results/demo"
corr_name = "real_data_correlation"
conf_matrix = "real_data_conf_matrix"

# --- Column groups ---------------------------------------------------------
# Columns handed to the processor as its `demo_cols`.
demo_cols = "ESI_Key RaterType Race Gender".split()

# Columns handed to the processor as its `items_cols`:
# S01-S11 first, then D01-D05.
items_cols = [
    "BMK_S01_Strategic",
    "BMK_S02_Quick",
    "BMK_S03_Decisive",
    "BMK_S04_Change",
    "BMK_S05_Leading",
    "BMK_S06_Confront",
    "BMK_S07_Participative",
    "BMK_S08_Build",
    "BMK_S09_Compassion",
    "BMK_S10_Putting",
    "BMK_S11_Respect",
    "BMK_D01_Interper",
    "BMK_D02_DiffBuild",
    "BMK_D03_DiffChange",
    "BMK_D04_Failure",
    "BMK_D05_Narrow",
]

In [3]:
# Build the processor for the "BMK_2018.csv" file.
data_processor = DataProcessor(data="BMK_2018.csv")

# Register the item and demographic column lists from the configuration cell
# on the processor. NOTE(review): how DataProcessor consumes these attributes
# is not visible here — confirm they must be set before get_data() is called.
data_processor.items_cols = items_cols
data_processor.demo_cols = demo_cols

In [4]:
# Fetch the data from the processor.
# NOTE(review): `df` is not referenced again in the visible cells (later cells
# read `data_processor.df`) — confirm whether this call is kept only for a
# side effect on the processor.
df = data_processor.get_data()

In [5]:
# Display the processor's median rater counts (the recorded output is indexed
# by RaterType) before any filtering is applied.
data_processor.median_rater_counts()

RaterType
boss             1.0
direct report    3.0
other            1.0
peer             4.0
superior         1.0
dtype: float64

In [6]:
# Apply the processor's "all raters" filter. The return value is discarded, so
# this presumably updates `data_processor.df` in place — TODO confirm against
# DataProcessor, and note that re-running this cell may therefore not be idempotent.
data_processor.filter_data_with_all_raters()
# Show (rows, columns) of the processor's frame after filtering.
data_processor.df.shape

(166814, 20)

In [7]:
# List the distinct RaterType values present in the processor's frame after filtering.
data_processor.df.RaterType.unique()

array(['boss', 'direct report', 'peer', 'self'], dtype=object)

In [8]:
# Display the item columns currently registered on the processor.
data_processor.items_cols

['BMK_S01_Strategic',
 'BMK_S02_Quick',
 'BMK_S03_Decisive',
 'BMK_S04_Change',
 'BMK_S05_Leading',
 'BMK_S06_Confront',
 'BMK_S07_Participative',
 'BMK_S08_Build',
 'BMK_S09_Compassion',
 'BMK_S10_Putting',
 'BMK_S11_Respect',
 'BMK_D01_Interper',
 'BMK_D02_DiffBuild',
 'BMK_D03_DiffChange',
 'BMK_D04_Failure',
 'BMK_D05_Narrow']

In [9]:
# Pivot the rater data into `real_data`, the frame every later cell works on.
# The column names printed by this cell carry the rater type as a suffix
# (e.g. 'BMK_S01_Strategic_boss', 'BMK_S01_Strategic_direct report').
real_data = data_processor.pivot_rater_data()

Index(['BMK_S01_Strategic_boss', 'BMK_S01_Strategic_direct report',
       'BMK_S01_Strategic_peer', 'BMK_S02_Quick_boss',
       'BMK_S02_Quick_direct report', 'BMK_S02_Quick_peer',
       'BMK_S03_Decisive_boss', 'BMK_S03_Decisive_direct report',
       'BMK_S03_Decisive_peer', 'BMK_S04_Change_boss',
       'BMK_S04_Change_direct report', 'BMK_S04_Change_peer',
       'BMK_S05_Leading_boss', 'BMK_S05_Leading_direct report',
       'BMK_S05_Leading_peer', 'BMK_S06_Confront_boss',
       'BMK_S06_Confront_direct report', 'BMK_S06_Confront_peer',
       'BMK_S07_Participative_boss', 'BMK_S07_Participative_direct report',
       'BMK_S07_Participative_peer', 'BMK_S08_Build_boss',
       'BMK_S08_Build_direct report', 'BMK_S08_Build_peer',
       'BMK_S09_Compassion_boss', 'BMK_S09_Compassion_direct report',
       'BMK_S09_Compassion_peer', 'BMK_S10_Putting_boss',
       'BMK_S10_Putting_direct report', 'BMK_S10_Putting_peer',
       'BMK_S11_Respect_boss', 'BMK_S11_Respect_direct report

In [10]:
# Number of columns in the pivoted frame.
len(real_data.columns)

68

In [11]:
import numpy as np

# Recode the raw Race labels into a small set of categories.
#
# Every key except NaN is a regular expression (the call below uses
# regex=True); NaN maps missing values to 'Unknown'. The pairs are listed in
# the order they must keep: the broad 'Asian|...' pattern comes before the more
# specific 'Asian (Other|Indian)' one.
#
# NOTE(review): regex replacement substitutes the matched portion of a value,
# not necessarily the whole cell — confirm no raw label carries extra text
# around a match. Also, the parentheses in 'Asian (Other|Indian)' form a regex
# group (matching "Asian Other" / "Asian Indian"), unlike the escaped literal
# parentheses in the 'Other \(please specify\)' pattern — confirm that is intended.
replacement_map = dict([
    (np.nan, 'Unknown'),
    (r'White|Caucasian', 'Caucasian'),
    (r'African American|Black', 'African American'),
    (r'Asian|Chinese|Filipino|Korean|Vietnamese|Japanese', 'Asian'),
    (r'Asian (Other|Indian)', 'Asian'),
    (r'Hispanic', 'Hispanic'),
    (r'Multiracial', 'Multiracial'),
    (r'American Indian or Alaskan Nat|Indigenous', 'Indigenous'),
    (r'Other \(please specify\)|Other', 'Other'),
    (r'Native Hawaiian|Samoan', 'Pacific Islander'),
])

# Overwrite the Race column with its recoded version.
real_data['Race'] = real_data['Race'].replace(
    to_replace=replacement_map,
    regex=True,
)

In [12]:
# Recode the raw Gender labels. This rebinds `replacement_map`, replacing the
# mapping used for the Race column.
#
# NaN maps missing values to 'Unknown'; the remaining keys are regular
# expressions (regex=True below). 'Male' and 'Female' map to themselves, so
# those two entries leave matching values unchanged.
replacement_map = dict([
    (np.nan, 'Unknown'),
    (r'UnSpec|NonBin|NotSay|Other', 'Other'),
    (r'Male', 'Male'),
    (r'Female', 'Female'),
])

# Overwrite the Gender column with its recoded version.
real_data['Gender'] = real_data['Gender'].replace(
    to_replace=replacement_map,
    regex=True,
)

In [13]:
# Sanity check: frequency of each Race category after recoding.
real_data['Race'].value_counts()

Race
Unknown             10104
Caucasian            4942
African American      649
Asian                 490
Hispanic              312
Other                 121
Multiracial           117
Indigenous             14
Pacific Islander        3
Name: count, dtype: int64

In [14]:
# Sanity check: frequency of each Gender category after recoding.
real_data['Gender'].value_counts()

Gender
Unknown    7120
Other      4381
Male       3314
Female     1937
Name: count, dtype: int64

In [15]:
# Column-name list that a later cell persists with %store.
#
# Layout (66 names):
#   1. the two demographic columns,
#   2. the 16 un-suffixed item columns (S01-S11, then D01-D05),
#   3. every item suffixed with a rater type, grouped item by item in
#      boss / direct_report / peer order.
#
# NOTE(review): the rater suffix here is 'direct_report' (underscore), whereas
# the pivoted column names printed for `real_data` use 'direct report' (space),
# and `real_data` reports 68 columns versus the 66 names here — confirm this
# difference is intentional for whatever consumes the list.
column_names = ["Gender", "Race"]
column_names += [
    "BMK_S01_Strategic",
    "BMK_S02_Quick",
    "BMK_S03_Decisive",
    "BMK_S04_Change",
    "BMK_S05_Leading",
    "BMK_S06_Confront",
    "BMK_S07_Participative",
    "BMK_S08_Build",
    "BMK_S09_Compassion",
    "BMK_S10_Putting",
    "BMK_S11_Respect",
    "BMK_D01_Interper",
    "BMK_D02_DiffBuild",
    "BMK_D03_DiffChange",
    "BMK_D04_Failure",
    "BMK_D05_Narrow",
]
# The slice is copied before the list is extended, so it holds exactly the 16
# item names added above.
column_names += [
    f"{item}_{rater}"
    for item in column_names[2:]
    for rater in ("boss", "direct_report", "peer")
]

In [16]:
# Re-check the column count: the Race/Gender recoding assigns to existing
# columns, so the count should be unchanged.
len(real_data.columns)

68

In [17]:
from src.core import SynthLeader

In [18]:
# Create the SynthLeader for the recoded frame under the name "BMK_2018".
# (The recorded output of this cell is "GPU: True".)
synth = SynthLeader(df=real_data, name="BMK_2018")

GPU: True


In [19]:
# synth.metadata

## Real Dataset

In [20]:
# Correlation matrix of the real data, kept in `real_corr` for later cells.
real_corr = synth.generate_corr_matrix(df=real_data)
# Styled rendering of that matrix; only its HTML export is used below.
fig = synth.style_correlation_matrix(real_corr)
# Save the HTML as "<corr_name>.html" inside `results_folder`.
data_processor.save_data(
    fig.to_html(), name=f"{corr_name}.html", folder=results_folder)

results/demo/real_data_correlation.html uploaded


In [21]:
# Persist these objects with IPython's %store magic so that another
# notebook/kernel can restore them with `%store -r`.
%store real_data
%store real_corr
%store column_names
%store synth
%store data_processor
%store results_folder

Stored 'real_data' (DataFrame)
Stored 'real_corr' (DataFrame)
Stored 'column_names' (list)
Stored 'synth' (SynthLeader)
Stored 'data_processor' (DataProcessor)
Stored 'results_folder' (str)


## Output

In [22]:
import pandas as pd
# Race x Gender contingency table of counts, saved as CSV with the Race labels
# kept as the index column.
# NOTE(review): the file name comes from `conf_matrix`
# ("real_data_conf_matrix"), although this is a cross-tabulation rather than a
# confusion matrix — confirm the naming is intended.
xtab = pd.crosstab(real_data['Race'], real_data['Gender'])
data_processor.save_data(xtab.to_csv(index=True),
                         name=f'{conf_matrix}.csv', folder=results_folder)

results/demo/real_data_conf_matrix.csv uploaded


In [23]:
# Save the correlation matrix as "<corr_name>.csv" inside `results_folder`.
# NOTE(review): index=False omits the frame's index from the CSV. If
# `real_corr` is indexed by variable name, the row labels are not written
# (the Race x Gender table is saved with index=True) — confirm that readers of
# this file rely on row order matching the column header.
data_processor.save_data(real_corr.to_csv(index=False),
                         name=f'{corr_name}.csv', folder=results_folder)

results/demo/real_data_correlation.csv uploaded
