## **Script for merging and splitting data into train, validation and test sets**

In [1]:
import numpy as np
import pandas as pd
# import altair as alt
from sklearn.model_selection import train_test_split

### **Question 1**

**Reading datasets** (2013, 2018, 2020 labeled datasets)

In [2]:
# Truncate long text cells (the survey comments) to 60 characters when frames are displayed
pd.options.display.max_colwidth = 60

In [3]:
# WES 2013, question 1
# Read the coded sample (skipping the first spreadsheet row), give the ID and
# comment columns their shared names, and tag every row with the survey year.
data_2013 = (
    pd.read_excel(
        "data/2013/WES2013 1st Qual Sample - Coded.xlsx",  ## change your path for data
        sheet_name='2013 1st Qual Sample',
        skiprows=1,
    )
    .rename(columns={'_telkey': 'Telkey', 'AQ3345_13': 'Comment'})
    .assign(Year=2013)
)

In [4]:
# WES 2018, question 1
# Read the coded comments (skipping the first spreadsheet row), give the ID and
# comment columns their shared names, and tag every row with the survey year.
data_2018 = (
    pd.read_excel(
        "data/2018/WES2018 1st Qual Coded - Final Comments and Codes.xlsx",  ## change your path for data
        sheet_name='2018 1st Qual',
        skiprows=1,
    )
    .rename(columns={'_telkey': 'Telkey', 'Q3345_13': 'Comment'})
    .assign(Year=2018)
)

In [5]:
# WES 2020, question 1
# In this sheet the comment column header is the full question text; only that
# column is renamed here.
# NOTE(review): presumably the ID column is already called 'Telkey' in this sheet — confirm.
data_2020 = (
    pd.read_excel(
        "data/2020/WES2020 1st Qual Coded - Final Comments and Codes.xlsx",
        sheet_name='2020 1st Qual',
        skiprows=1,
    )
    .rename(columns={
        'Q3345_13:   What one thing would you like your organization to focus on to improve your work environment?': 'Comment',
    })
    .assign(Year=2020)
)

**Compiling datasets**

In [6]:
# header
# Load the codebook (spreadsheet columns A and F only) from the 2020 workbook.
header_codes = pd.read_excel(
    "data/2020/WES2020 1st Qual Coded - Final Comments and Codes.xlsx",
    sheet_name='Codebook',
    usecols='A,F',
)

# The subthemes of 11-Vision_Mission_Goals and 12-Other are coded as 111
# instead of 11.1 in the codebook. Any CODE above 100 is divided by 10 so
# that these subthemes follow the same format as all the other subthemes.
header_codes['CODE'] = header_codes['CODE'].map(
    lambda code: code / 10 if code > 100 else code
)

# Comment: all the CODEs in XX.0 format (ending in zero) are themes,
# and any CODE with a non-zero decimal part is a subtheme.

In [6]:
# header_codes

In [7]:
# Lookup dictionaries for the header, in both directions:
#   header_dict_VAR: variable name -> numeric code
#   header_dict_COD: numeric code  -> variable name
_variable_names = header_codes['VARIABLE NAME'].values
_codes = header_codes['CODE'].values
header_dict_VAR = dict(zip(_variable_names, _codes))
header_dict_COD = dict(zip(_codes, _variable_names))
# header_dict_COD # <-- Uncomment to print the dictionary and see what it looks like; this line can be erased

##### Comment: Note that all the CODEs in XX.0 format (ending in zero) are themes, and any CODE with a non-zero decimal part is a subtheme.

In [8]:
# Harmonise the 2013 and 2018 column names so the yearly frames can be stacked.

# 2013: long theme names -> short theme codes, plus the two "Other" subthemes.
data_2013 = data_2013.rename(columns={
    "Tools_Equipment_Physical_Environment": 'TEPE',
    "Vision_Mission_Goals": 'VMG',
    "Other": 'OTH',
    "Other comments": 'OTH_Other_related',
    "Positive comments": "OTH_Positive_comments",
})

# 2018: theme 'FWE' becomes 'FEW', and several subtheme names are replaced.
# NOTE(review): the new names are presumably the ones used in the 2020 sheet — confirm.
data_2018 = data_2018.rename(columns={
    'FWE': 'FEW',
    'CPD_Improve_performance_management': 'CPD_Improve_performance',
    'CB_Improve_benefits': 'CB_Improve_medical',
    'Exec_Strengthen_quality_of_executive_leadership': 'Exec_Strengthen_quality_of_executive_leaders',
    'FWE_Leading_Workplace_Strategies': 'FWE_Improve_and_or_expand_Leading_Workplace_Strategies_LWS',
    'TEPE__Ensure_safety_and_security': 'TEPE__Ensure_safety',
    'TEPE_Better_supplies_equipment': 'TEPE_Provide_better_equipment',
    'TEPE_Better_furniture': 'TEPE_Provide_better_furniture',
    'TEPE_Better_computer_hardware': 'TEPE_Provide_better_hardware',
    'VMG_Assess_plans_priorities': 'VMG_Assess_plans',
    'VMG_Improve_program_implementation': 'VMG_Improve_program',
    'VMG_Public_interest_and_service_delivery': 'VMG_Pay_attention_to_the_public_interest',
    'VMG_Keep_politics_out_of_work': 'VMG_Remove_political_influence',
})

# Comment: the code for the theme ('FEW') is different from the
# prefix used by its sub-themes ('FWE').

In [9]:
# Stack the three survey years into one frame (2020 first, then 2018, then 2013).
# Each year keeps its own row labels, so index values repeat across years.
frames = [data_2020, data_2018, data_2013]
data_all = pd.concat(objs=frames)
# Same data, with the column names replaced by their numeric codes
data_all_num = data_all.rename(header_dict_VAR, axis='columns')
# pd.set_option('display.max_colwidth', 60) # <-- to display just the beginning of the comment
data_all.head()

Unnamed: 0,Telkey,Comment,CPD,CB,EWC,Exec,FEW,SP,RE,Sup,...,VMG_Pay_attention_to_the_public_interest,VMG_Review_funding_or_budget,VMG_Remove_political_influence,VMG_other,OTH_Other_related,OTH_Positive_comments,OTH_Survey_feedback,OTH_Covid,Unrelated,Year
0,172538-522988,WAGES! We are all very underpaid. It is very dif...,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.0,0.0,2020
1,172540-015050,"With each ""bad"" press news story the organization knee j...",0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0.0,0.0,0.0,2020
2,172550-323842,better seating furniture and office layout,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.0,0.0,2020
3,172553-172324,Is to improve the ventilation system for heating and coo...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.0,0.0,2020
4,172553-986176,Stop hiring based on scenarios. Hire based on knowledge...,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0.0,0.0,0.0,2020


**Basic Cleaning**

In [10]:
## dropping rows without a comment
# A bare dropna() removes every row that has a NaN in *any* column. After
# pd.concat, a column that is missing from one year's sheet is NaN for all of
# that year's rows, so the bare call can silently discard whole survey years.
# Only a missing comment makes a row unusable, so restrict the check to it.
data_all = data_all.dropna(subset=['Comment'])

# Fill the remaining gaps in the label columns with 0.
# NOTE(review): assumes a label that is absent for a given row/year means
# "not coded" (0) — confirm against the codebook.
label_columns = data_all.columns.difference(['Telkey', 'Comment', 'Year'])
data_all = data_all.fillna({column: 0 for column in label_columns})

In [11]:
# Sanity check: number of rows that still have a missing comment
data_all['Comment'].isna().sum()

0

**Splitting data into train, test and validation portions**

In [12]:
# Features: the raw comment text. Targets: every column except the ID, the text and the year.
X = data_all.loc[:, 'Comment']
y = data_all.drop(columns=['Telkey', 'Comment', 'Year'])

In [13]:
# Hold out 20% of the rows as the test set (fixed seed for reproducibility)
X_trainvalid, X_test, y_trainvalid, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [14]:
# Take 20% of the train+validation portion (16% of all rows) as the validation set
X_train, X_valid, y_train, y_valid = train_test_split(
    X_trainvalid,
    y_trainvalid,
    test_size=0.20,
    random_state=42,
)

In [15]:
## Copy of y_train whose column names are replaced by their numeric codes
## (columns without an entry in header_dict_VAR keep their name)
y_train_num = y_train.rename(header_dict_VAR, axis='columns')

**Saving y_train files**

In [16]:
# Write the training labels to disk without the row index
y_train.to_csv(
    path_or_buf='data/y_train.csv',
    index=False,
)

In [17]:
# y_train_num.to_csv('data/y_train_num.csv', index=False)

### **Question 2**

**Reading dataset** (2018 labeled dataset)

In [18]:
# WES 2018, question 2
# Read the coded comments and give the comment column its shared name.
data_2018_2 = pd.read_excel(
    'data/2018/WES2018 2nd Qual Coded - Final Comments and Codes.xlsx',  ## change path for data
    sheet_name='2018 2nd Qual Coded (All)',
).rename(columns={'Q4981_11': 'Comment'})
# data_2018_2['Year'] = 2018
# data_2018_2.head()

**Basic Cleaning**

In [19]:
## dropping last row that contains "totals"
## Caution: re-running this cell removes one more row each time.
data_2018_2 = data_2018_2.drop(index=data_2018_2.index[-1:])

In [20]:
## dropping NaN rows: any row with a missing value in any column is removed
data_2018_2 = data_2018_2.dropna()

**Splitting into train and test**

In [21]:
# Features: the raw comment text. Targets: every column except the ID, the text and the code count.
X_2 = data_2018_2.loc[:, 'Comment']
y_2 = data_2018_2.drop(columns=['Telkey', 'Comment', '# of codes'])

In [22]:
# Hold out 20% of the question-2 rows as the test set (fixed seed for reproducibility)
X_trainvalid_2, X_test_2, y_trainvalid_2, y_test_2 = train_test_split(
    X_2,
    y_2,
    test_size=0.20,
    random_state=42,
)

In [23]:
# Take 20% of the train+validation portion (16% of all rows) as the validation set
X_train_2, X_valid_2, y_train_2, y_valid_2 = train_test_split(
    X_trainvalid_2,
    y_trainvalid_2,
    test_size=0.20,
    random_state=42,
)

**Saving y_train file**

In [24]:
## Saving y_train for question 2 (row index is not written)
y_train_2.to_csv(
    path_or_buf='data/y_train_q2.csv',
    index=False,
)