In [1]:
import numpy as np
import pandas as pd
# import altair as alt
from sklearn.model_selection import train_test_split

In [2]:
# WES 2013, question 1.
# Reads the coded comment sample (first spreadsheet row skipped), renames the
# key/comment columns to 'Telkey'/'Comment', and tags every row with its
# survey year.  Change the path to match where your data lives.
data_2013 = (
    pd.read_excel(
        "../data/raw/2013/WES2013 1st Qual Sample - Coded.xlsx",
        sheet_name='2013 1st Qual Sample',
        skiprows=1,
    )
    .rename(columns={'_telkey': 'Telkey', 'AQ3345_13': 'Comment'})
    .assign(Year=2013)
)

In [3]:
# WES 2018, question 1.
# Reads the coded comments (first spreadsheet row skipped), renames the
# key/comment columns to 'Telkey'/'Comment', and tags every row with its
# survey year.  Change the path to match where your data lives.
data_2018 = (
    pd.read_excel(
        "../data/raw/2018/WES2018 1st Qual Coded - Final Comments and Codes.xlsx",
        sheet_name='2018 1st Qual',
        skiprows=1,
    )
    .rename(columns={'_telkey': 'Telkey', 'Q3345_13': 'Comment'})
    .assign(Year=2018)
)

In [4]:
# WES 2020, question 1.
# Reads the coded comments (first spreadsheet row skipped), shortens the long
# question-text column header to 'Comment', and tags every row with its
# survey year.
data_2020 = (
    pd.read_excel(
        "../data/raw/2020/WES2020 1st Qual Coded - Final Comments and Codes.xlsx",
        sheet_name='2020 1st Qual',
        skiprows=1,
    )
    .rename(columns={
        'Q3345_13:   What one thing would you like your organization to focus on to improve your work environment?': 'Comment',
    })
    .assign(Year=2020)
)

In [6]:
# Header: load the codebook sheet, keeping only spreadsheet columns A and F.
header_codes = pd.read_excel(
    "../data/raw/2020/WES2020 1st Qual Coded - Final Comments and Codes.xlsx",
    sheet_name='Codebook',
    usecols='A,F',
)

# Correct the CODE of the subthemes of 11-Vision_Mission_Goals and 12-Other:
# they are stored as e.g. 111 but should read 11.1, matching the format used
# by every other subtheme.  Any code above 100 is therefore divided by 10;
# all other codes are left untouched.
header_codes['CODE'] = header_codes['CODE'].map(
    lambda code: code / 10 if code > 100 else code
)

# Note: after this correction, every CODE of the form XX.0 (ending in zero)
# is a theme, and any CODE with a non-zero decimal part is a subtheme.

In [7]:
# Lookup dictionaries for the header, in both directions:
#   header_dict_VAR: variable name -> code
#   header_dict_COD: code -> variable name
# The two columns are paired row by row.
header_dict_VAR = dict(zip(header_codes['VARIABLE NAME'].to_numpy(),
                           header_codes['CODE'].to_numpy()))
header_dict_COD = dict(zip(header_codes['CODE'].to_numpy(),
                           header_codes['VARIABLE NAME'].to_numpy()))
# header_dict_COD # <-- uncomment to inspect what the dictionary looks like; this line can be removed

In [8]:
# Align the 2013 and 2018 column names so the yearly frames can be stacked
# into one database.
data_2013 = data_2013.rename(columns={
    "Tools_Equipment_Physical_Environment": 'TEPE',
    "Vision_Mission_Goals": 'VMG',
    "Other": 'OTH',
    "Other comments": 'OTH_Other_related',
    "Positive comments": "OTH_Positive_comments",
})

data_2018 = data_2018.rename(columns={
    'FWE': 'FEW',
    'CPD_Improve_performance_management': 'CPD_Improve_performance',
    'CB_Improve_benefits': 'CB_Improve_medical',
    'Exec_Strengthen_quality_of_executive_leadership': 'Exec_Strengthen_quality_of_executive_leaders',
    'FWE_Leading_Workplace_Strategies': 'FWE_Improve_and_or_expand_Leading_Workplace_Strategies_LWS',
    'TEPE__Ensure_safety_and_security': 'TEPE__Ensure_safety',
    'TEPE_Better_supplies_equipment': 'TEPE_Provide_better_equipment',
    'TEPE_Better_furniture': 'TEPE_Provide_better_furniture',
    'TEPE_Better_computer_hardware': 'TEPE_Provide_better_hardware',
    'VMG_Assess_plans_priorities': 'VMG_Assess_plans',
    'VMG_Improve_program_implementation': 'VMG_Improve_program',
    'VMG_Public_interest_and_service_delivery': 'VMG_Pay_attention_to_the_public_interest',
    'VMG_Keep_politics_out_of_work': 'VMG_Remove_political_influence',
})

# Note: the theme column is named 'FEW', while its sub-theme columns keep
# the 'FWE_' prefix, so the theme code does not match the start of its
# sub-themes' names.

In [9]:
# Put databases together (2020 first, then 2018, then 2013).
# ignore_index=True gives the stacked frame a fresh 0..n-1 index.  Without it
# each yearly frame keeps its own row labels, so the same label appears once
# per year and label-based selection (.loc) returns several rows.
frames = [data_2020, data_2018, data_2013]
data_all = pd.concat(frames, ignore_index=True)
# Same data, with column names replaced by their codebook codes where a
# mapping exists in header_dict_VAR.
data_all_num = data_all.rename(columns=header_dict_VAR)
# pd.set_option('display.max_colwidth', 60) # <-- to display just the beginning of the comment
data_all.head()

Unnamed: 0,Telkey,Comment,CPD,CB,EWC,Exec,FEW,SP,RE,Sup,...,VMG_Pay_attention_to_the_public_interest,VMG_Review_funding_or_budget,VMG_Remove_political_influence,VMG_other,OTH_Other_related,OTH_Positive_comments,OTH_Survey_feedback,OTH_Covid,Unrelated,Year
0,172538-522988,WAGES! We are all very underpaid. It i...,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.0,0.0,2020
1,172540-015050,"With each ""bad"" press news story the organizat...",0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0.0,0.0,0.0,2020
2,172550-323842,better seating furniture and office layout,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.0,0.0,2020
3,172553-172324,Is to improve the ventilation system for heati...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.0,0.0,2020
4,172553-986176,Stop hiring based on scenarios. Hire based on...,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0.0,0.0,0.0,2020


In [11]:
# Persist the combined database to the interim folder; the row index is not written.
data_all.to_excel(
    '../data/interim/bcstats.xlsx',
    index=False,
)

## Class Distribution

In [2]:
# Training split for question 1: cleaned comments and their labels.
X_train_Q1 = pd.read_excel(io='../data/interim/X_train_Q1_clean.xlsx')
y_train_Q1 = pd.read_excel(io='../data/interim/y_train_Q1.xlsx')

# Validation split for question 1: cleaned comments and their labels.
X_valid_Q1 = pd.read_excel(io='../data/interim/X_valid_Q1_clean.xlsx')
y_valid_Q1 = pd.read_excel(io='../data/interim/y_valid_Q1.xlsx')

In [3]:
# Place the training comments side by side with the first 12 label columns.
df = pd.concat(
    [X_train_Q1, y_train_Q1.iloc[:, :12]],
    axis='columns',
)

In [4]:
# Display the combined frame (comment text followed by the label columns).
df

Unnamed: 0,Comment,CPD,CB,EWC,Exec,FEW,SP,RE,Sup,SW,TEPE,VMG,OTH
0,"to be real about diversity, you need to create...",0,0,1,0,0,0,0,0,0,0,0,0
1,Keep the building warmer and provide warm wate...,0,0,0,0,0,0,0,0,0,1,0,0
2,better communication from the top down,0,0,0,1,0,0,0,0,0,0,0,0
3,It would be beneficial if Management did not m...,0,0,0,0,0,0,1,0,0,0,0,0
4,more education applicable to my job,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10371,office morale,0,0,1,0,0,0,0,0,0,0,0,0
10372,"I work in a mobile work environment, when peop...",0,0,0,0,1,0,0,0,0,0,0,0
10373,In my region I would like to see more support ...,1,0,0,0,0,0,0,0,0,0,0,0
10374,Consider 'green computing.' There are many w...,0,0,0,0,0,0,0,0,0,1,0,0


In [7]:
# Every column after the first one ('Comment'), i.e. the label columns.
df.columns[1:]

Index(['CPD', 'CB', 'EWC', 'Exec', 'FEW', 'SP', 'RE', 'Sup', 'SW', 'TEPE',
       'VMG', 'OTH'],
      dtype='object')

In [14]:
# For every column after the first, record its name and its column total
# (for the 0/1 indicator columns shown above, that is the number of comments
# tagged with that label).
labels = list(df.columns[1:])
records = [np.sum(df[label]) for label in labels]

In [15]:
# Two-column summary table: one row per label with its total.
distribution = pd.DataFrame(dict(labels=labels, records=records))

In [17]:
# Show the labels from most to least frequent.
distribution.sort_values('records', ascending=False)

Unnamed: 0,labels,records
9,TEPE,2390
8,SW,1616
10,VMG,1398
3,Exec,1396
1,CB,1305
0,CPD,1293
5,SP,1040
7,Sup,1025
2,EWC,933
6,RE,843
