# Variable construction 

In this notebook I construct the variables for the final dataset. Notes on the choices made along the way are included. 

#### Data Processing

First reading in pre-processed data and loading packages

In [None]:
# libs
import spacy
import pandas as pd
import numpy as np
from funcs import ExtractNameYear, ExtractFileName, ExtractAllText, ComputeGreenInd, TransformReturns, TransformIndices, MakeReturnsInd, MakeReturns, AbnormalReturns
from datetime import datetime

#### Loading the NLP models

NLP model and greenwashing indicator (but first checking if I can use my GPU)

In [None]:
import torch

# Report whether a CUDA GPU is usable. The device queries below raise when no
# CUDA device is present, so they only run once availability is confirmed.
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    print(torch.cuda.current_device())
    print(torch.cuda.get_device_name(0))



Classifying paragraphs into climate related/not related

In [None]:
# importing the model stuff
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

In [None]:
# ClimateBERT detector used to label paragraphs as climate related / not related.
model_name_class = "climatebert/distilroberta-base-climate-detector"

model_class = AutoModelForSequenceClassification.from_pretrained(model_name_class)
tokenizer_class = AutoTokenizer.from_pretrained(model_name_class, max_len=512)

# Run on the first GPU when CUDA is available; otherwise fall back to the CPU
# (device=-1) instead of failing on a machine without a GPU.
pipe_class = pipeline(
    "text-classification",
    model=model_class,
    tokenizer=tokenizer_class,
    device=0 if torch.cuda.is_available() else -1,
)

*CLIMATEBERT* for checking the specificity of climate-related paragraphs **TESTS**

In [None]:
# ClimateBERT specificity model, applied to the climate-related paragraphs.
model_name_spec = "climatebert/distilroberta-base-climate-specificity"

model_spec = AutoModelForSequenceClassification.from_pretrained(model_name_spec)
tokenizer_spec = AutoTokenizer.from_pretrained(model_name_spec, max_len=512)

# Run on the first GPU when CUDA is available; otherwise fall back to the CPU
# (device=-1) instead of failing on a machine without a GPU.
pipe_spec = pipeline(
    "text-classification",
    model=model_spec,
    tokenizer=tokenizer_spec,
    device=0 if torch.cuda.is_available() else -1,
)

#### Data

In [None]:
# Company-level data: the 'companies' sheet of the matched LSEG workbook.
companies_workbook = '../data/LSEG data/matched_final.xlsx'
data_greenwashing = pd.read_excel(companies_workbook, sheet_name='companies')


In [None]:
# monthly returns
# data_returns_m = pd.read_excel('../data/LSEG data/matched_final.xlsx', sheet_name='indicators_1')
# data_indices_m = pd.read_excel('../data/LSEG data/matched_final.xlsx', sheet_name='INDICES_monthly', skiprows=3)

In [None]:
# weekly returns
# data_returns_w = pd.read_excel('../data/LSEG data/matched_final.xlsx', sheet_name='indicators_2', skiprows=3)
# data_indices_w = pd.read_excel('../data/LSEG data/matched_final.xlsx', sheet_name='INDICES_weekly', skiprows=3)

In [None]:
# Daily stock and index data; both sheets are read from the same workbook
# with the first three rows skipped.
daily_workbook = '../data/LSEG data/matched_final.xlsx'
data_returns_d, data_indices_d = (
    pd.read_excel(daily_workbook, sheet_name=sheet, skiprows=3)
    for sheet in ('indicators_3', 'INDICES_daily')
)

In [None]:
# Sales data: sheet 'indicators_4.1', first three rows skipped.
sales_read_options = {"sheet_name": 'indicators_4.1', "skiprows": 3}
data_sales = pd.read_excel('../data/LSEG data/matched_final.xlsx', **sales_read_options)

In [None]:
# processing returns and merging to greenwashing
# TransformReturns comes from the local `funcs` module; it receives the raw
# daily returns sheet together with the company table.
# NOTE(review): this cell overwrites `data_returns_d`, so it is not re-runnable
# without first re-reading the Excel sheet.


# data_returns_m = TransformReturns(data_returns_m, data_greenwashing, old=True)
# data_returns_w = TransformReturns(data_returns_w, data_greenwashing)
data_returns_d = TransformReturns(data_returns_d, data_greenwashing)



In [None]:
# Process the daily index sheet with TransformIndices (local `funcs` module).
# The monthly / weekly variants are kept commented out for reference.
# NOTE(review): this cell overwrites `data_indices_d`, so it is not re-runnable
# without first re-reading the Excel sheet.
# data_indices_m = TransformIndices(data_indices_m)
# data_indices_w = TransformIndices(data_indices_w, weekly=True)
data_indices_d = TransformIndices(data_indices_d)


#### Transforming the sales data

In [None]:
from itertools import product

# --- 1. Keep only the quarterly interim-sales series -------------------------
# Tag every row with the quarter mentioned in its "Name" label (a later match
# overrides an earlier one). `na=False` makes a missing "Name" count as
# "no match" explicitly instead of relying on how NaN is coerced to a boolean.
data_sales["VARIABLE"] = pd.NA
for quarter_no in range(1, 5):
    mentions_quarter = data_sales["Name"].str.contains(f"- QUARTER {quarter_no}", na=False)
    data_sales.loc[mentions_quarter, "VARIABLE"] = f"Q{quarter_no}"

# Strip the series description from the label, then drop untagged rows.
data_sales["NAME"] = data_sales["Name"].str.replace(r"- INTERIM SALES - QUARTER \d{1}", "", regex=True)
data_sales.dropna(subset=["VARIABLE"], inplace=True)
# "Code" is split at the first "("; only the part before it (TYPE) is kept.
data_sales[["TYPE", "VAR_CODE"]] = data_sales["Code"].str.split("(", n=1, expand=True)
data_sales.drop(columns=['Name', 'Code', 'VAR_CODE'], inplace=True)

# --- 2. Wide -> long: one row per (series, column header) --------------------
data_sales = pd.melt(data_sales, id_vars=['NAME', "TYPE", "VARIABLE"]).rename(
    columns={'variable': 'DATE', 'value': 'SALES'}
)

data_sales["SALES"] = data_sales["SALES"].astype(float)
# The column header is split at the first space into QUARTER and YEAR and then
# re-assembled as "<YEAR> <QUARTER>".
data_sales[["QUARTER", "YEAR"]] = data_sales["DATE"].str.split(" ", n=1, expand=True)
data_sales["DATE"] = data_sales["YEAR"] + " " + data_sales["QUARTER"]

data_sales = data_sales.loc[:, ['NAME', 'TYPE', 'DATE', 'YEAR', 'QUARTER', 'SALES']].sort_values(
    ["NAME", 'YEAR', "QUARTER"]
)

# --- 3. Balanced panel -------------------------------------------------------
# making sure that all firms have data starting from the same date:
# build every (NAME, DATE) combination and left-join the observed sales onto it.
full_index = pd.DataFrame(
    list(product(data_sales["NAME"].unique(), data_sales["DATE"].unique())),
    columns=["NAME", "DATE"],
)

data_sales = full_index.merge(
    data_sales.dropna(subset=["SALES"]), how="left", on=['NAME', 'DATE']
)

# Keep a firm only if it has a sales figure for every date and all of them are
# strictly positive (required for the log transform later on). `bool(...)`
# guarantees that groupby.filter receives a plain scalar boolean; the previous
# `~series.any()` only worked as long as `.any()` returned a NumPy boolean.
data_sales = data_sales.groupby("NAME").filter(
    lambda firm: bool(firm["SALES"].notna().all() and (firm["SALES"] > 0).all())
)



**Calculating the log differences for sales** \
\
Here I will surely have some missing data, so I construct the log change in sales (`LOG_SALES_DIFF`) in a way that avoids losing observations. **NOTE** I will need to adjust for seasonality in sales! \
\
I WILL DEFINITELY NEED TO TALK ABOUT THE EXCLUSION OF FIRMS FROM MY SAMPLE - AFTER ALL, A LOT OF FIRMS DROP OUT!!!

In [None]:
# Log of sales, and its row-to-row change within each firm (the first row of
# every firm gets NaN); the result is written to disk.
log_sales = np.log(data_sales["SALES"])
data_sales["LOG_SALES"] = log_sales
data_sales["LOG_SALES_DIFF"] = log_sales.groupby(data_sales["NAME"]).diff()
data_sales.to_csv("../data/output/data_sales.csv", index=False)

#### Calculating abnormal returns

Here I calculate abnormal returns and cumulative abnormal returns. In order to do that, I benchmark the realized returns of each of the companies against their expected return. The market model is estimated in order to assess the expected return. The market for each security is chosen with respect to the primary market of operations of each company (S&P for the USA/Canada, MSCI EUROPE for Europe, ... **Work-in-progress, need to get indices for more geographical regions**). Finally, I chose to use the price return on equity only (instead of total return including re-invested dividends - they don't really matter over such a short period anyway)

In [None]:
# Share of companies per country of domicile, to see which markets dominate
# the sample.
domicile_shares = data_greenwashing['CTRY_OF_DOM_NAME'].value_counts(normalize=True)
print(domicile_shares)


# Stock returns (daily only; monthly / weekly variants kept for reference).
# data_returns_m = MakeReturns(data_returns_m)
# data_returns_w = MakeReturns(data_returns_w)
data_returns_d = MakeReturns(data_returns_d)


# Index returns, same frequencies as above.
# data_indices_m = MakeReturnsInd(data_indices_m)
# data_indices_w = MakeReturnsInd(data_indices_w)
data_indices_d = MakeReturnsInd(data_indices_d)


Checking for duplicated data in the companies. Some companies appear more than once, mostly because a subsidiary shares the same name, the company has multiple pages, etc. 
Some of the duplicates are also the wrong company. There are few enough of these cases (16 companies), though, that they can simply be dropped. 

In [None]:
# Inspect how often each (NAME, DATE) pair occurs:
# print(data_returns_m[['NAME', 'DATE']].value_counts())
# print(data_returns_w[['NAME', 'DATE']].value_counts())
# print(data_returns_d[['NAME', 'DATE']].value_counts())

# Keep the first row of every (NAME, DATE) pair and drop the rest, in place.
# data_returns_m.drop_duplicates(subset=['NAME', 'DATE'], inplace=True)
# data_returns_w.drop_duplicates(subset=['NAME', 'DATE'], inplace=True)
observation_keys = ['NAME', 'DATE']
data_returns_d.drop_duplicates(subset=observation_keys, inplace=True)


In [None]:
# Benchmark index per country of domicile. The mapping is written as
# benchmark -> countries and then flattened into the country -> benchmark
# lookup; key order matches the grouping below.
countries_by_benchmark = [
    ("S&P 500 COMPOSITE", [
        "UNITED STATES", "CANADA", "BERMUDA", "CAYMAN ISLANDS",
    ]),
    ("MSCI EM LATIN AMERICA U$", [
        "MEXICO", "PUERTO RICO", "COSTA RICA", "BARBADOS", "PANAMA",
        "COLOMBIA", "BRAZIL", "CHILE", "PERU", "URUGUAY", "ARGENTINA",
    ]),
    ("MSCI EUROPE U$", [
        "UNITED KINGDOM", "IRELAND", "SWITZERLAND", "NETHERLANDS", "GREECE",
        "GERMANY", "BELGIUM", "DENMARK", "MONACO", "LUXEMBOURG", "FRANCE",
        "SWEDEN", "ISLE OF MAN", "SPAIN", "FINLAND", "ROMANIA", "ITALY",
        "AUSTRIA", "JERSEY", "GUERNSEY", "TURKEY",
    ]),
    ("MSCI PACIFIC U$", [
        "HONG KONG", "SINGAPORE", "JAPAN", "AUSTRALIA", "NEW ZEALAND",
        "PAPUA NEW GUINEA",
    ]),
    ("MSCI AC ASIA U$", [
        "CHINA", "INDIA", "SOUTH KOREA", "TAIWAN", "MONGOLIA", "INDONESIA",
        "PHILIPPINES",
    ]),
    ("MSCI WORLD U$", [
        "ISRAEL", "KAZAKHSTAN", "UNITED ARAB EMIRATES", "SOUTH AFRICA",
    ]),
]

market_dict = {
    country: benchmark
    for benchmark, countries in countries_by_benchmark
    for country in countries
}
# missing values
market_dict[np.nan] = 'NA'

# Benchmark index per stock exchange, written as benchmark -> exchanges and
# flattened into the exchange -> benchmark lookup; key order matches the
# grouping below.
exchanges_by_benchmark = [
    # US
    ("S&P 500 COMPOSITE", ["Nasdaq", "Toronto SE", "NYSE"]),
    # Europe
    ("MSCI EUROPE U$", [
        "London SE", "Euronext Amsterdam", "NASDAQ Stockholm",
        "Boerse Frankfurt", "Euronext Paris", "Six Swiss Exchange",
        "Euronext Brussels", "Borsa Italiana", "Oslo Bors",
        "Wiener Boerse AG", "Athens SE", "NASDAQ Helsinki",
        "NASDAQ Copenhagen", "Boerse Hamburg", "BME Exchange", "Ljubljana SE",
    ]),
    # Pacific
    ("MSCI PACIFIC U$", [
        "Tokyo SE", "Hong Kong Exchange", "Singapore Exchange",
        "Australian SE", "New Zealand Exchange",
    ]),
    # Asia
    ("MSCI AC ASIA U$", ["Korea Exchange", "Taiwan SE", "National SE"]),
    # latin america
    ("MSCI EM LATIN AMERICA U$", ["Santiago SE", "Bolsa Mexicana"]),
    # Other
    ("MSCI WORLD U$", ["Egyptian Exchange", "Johannesburg SE"]),
]

exchange_mrkt_dict = {
    exchange: benchmark
    for benchmark, exchanges in exchanges_by_benchmark
    for exchange in exchanges
}
# missing values
exchange_mrkt_dict[np.nan] = 'NA'


# Monthly / weekly variants, kept for reference.
# NOTE(review): they do not pass exchange_mrkt_dict, unlike the daily call below.
# data_abnormal_returns_m = AbnormalReturns(data_returns_m, data_indices_m,  market_dict, "2014-01-01", "2018-06-01", "STOCK_LOG_RETURN", "INDEX_LOG_RETURN")
# data_abnormal_returns_w = AbnormalReturns(data_returns_w, data_indices_w,  market_dict, "2016-01-01", "2019-07-01", "STOCK_LOG_RETURN", "INDEX_LOG_RETURN")
data_abnormal_returns_d = AbnormalReturns(
    data_returns_d,
    data_indices_d,
    market_dict,
    exchange_mrkt_dict,
    "2019-01-01",  # presumably the start of the window used by AbnormalReturns - confirm in funcs
    "2019-08-01",  # presumably the end of that window - confirm in funcs
    "STOCK_LOG_RETURN",
    "INDEX_LOG_RETURN",
)


# Persist the daily abnormal returns.
# data_abnormal_returns_m.to_csv("../data/output/data_abnormal_returns_m.csv", index=False)
# data_abnormal_returns_w.to_csv("../data/output/data_abnormal_returns_w.csv", index=False)
data_abnormal_returns_d.to_csv("../data/output/data_abnormal_returns_d.csv", index=False)


### Transforming the greenwashing dataset
Making one observation per sustainability report

In [None]:
import ast

# /////////////////////////////////////
#       expanding
# /////////////////////////////////////

# REPORT_LISTS is parsed from its textual form into a Python object
# (presumably a list of report names per company - verify against the sheet).
# ast.literal_eval replaces the previous eval(): it accepts Python literals
# only, so a cell in the workbook can no longer execute arbitrary code.
data_greenwashing['REPORT_LISTS'] = data_greenwashing['REPORT_LISTS'].apply(ast.literal_eval)

# Expand each list entry into its own row
data_greenwashing = data_greenwashing.explode('REPORT_LISTS', ignore_index=True)

# YEAR is the first run of digits found in the report entry.
# NOTE(review): an entry whose name contains digits before the year would
# yield the wrong value - confirm the naming scheme of the report entries.
data_greenwashing['YEAR'] = data_greenwashing['REPORT_LISTS'].str.extract(r'(\d+)')
data_greenwashing['YEAR'] = data_greenwashing['YEAR'].astype(float)

data_greenwashing.to_csv("../data/output/company_characteristics.csv", index=False)

In [None]:
# Rows whose report entry belongs to 2017, and the scraped names of those
# companies.
is_2017_report = data_greenwashing["YEAR"].eq(2017)
greenwashing_2017 = data_greenwashing[is_2017_report]
names_2017 = greenwashing_2017["NAME_SCRAPED"]

#### Greenwashing Indicator - **2017**

Expanding the dataset to have one observation per sustainability report and calculating the number of reports per year.

In [None]:
# Number of 2017 report rows per country of domicile.
country_counts_2017 = greenwashing_2017.value_counts("CTRY_OF_DOM_NAME")
print(country_counts_2017)

Converting the 2017 sustainability reports into lists of files

In [None]:
nlp = spacy.load('en_core_web_sm')

# NOTE(review): machine-specific absolute path; the notebook only runs as-is
# on a machine where this folder exists.
input_name = "C:/Users/Jakub/OneDrive - Tilburg University/thesis data/responsibility reports"

# First pass: look up the 2017 report files by company name.
filenames = ExtractFileName(series_names=names_2017, input_dir=input_name, year_str="2017")

# Normalise the names (lower case, underscores, no "?" or "|") and keep those
# that the first pass did not return as a key. `regex=False` removes "?" and
# "|" as literal characters regardless of the pandas version's default.
names_2017_updated = (
    names_2017.str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace("?", "", regex=False)
    .str.replace("|", "", regex=False)
)
names_2017_updated = names_2017_updated.loc[~names_2017_updated.isin(list(filenames.keys()))]

# Second pass for the remaining names, this time with the spaCy model.
filenames_updated = ExtractNameYear(series_names=names_2017_updated, input_dir=input_name, nlp_model=nlp, year_str="2017")

filenames.update(filenames_updated)

Dumping all of the text from PDF files into their respective TXT files for later processing

In [None]:
# Extract the text of the 2017 report files collected in `filenames`;
# "../data/text_processing" is passed as the target location.
ExtractAllText(filenames, "2017", "../data/text_processing")

Here I need to load all of the files in a loop, classify their paragraphs as climate-related or not, keep the climate-related ones, classify those as specific or non-specific, and then compute the proportion of non-specific paragraphs.

In [None]:
# Compute the 2017 greenwashing indicator from the extracted text files, using
# the climate-detector and specificity pipelines.
greenwashing_ind = ComputeGreenInd(names_2017, "../data/text_processing/2017", pipe_class, pipe_spec)

# NOTE(review): written with the index included, unlike the other to_csv calls
# in this notebook, which pass index=False - confirm this is intended.
greenwashing_ind.to_csv("../data/text_processing/greenwashing_ind_2017.csv")

#### Greenwashing Indicator - **2018**

In [None]:
# Rows whose report entry belongs to 2018, and the scraped names of those
# companies.
is_2018_report = data_greenwashing["YEAR"].eq(2018)
greenwashing_2018 = data_greenwashing[is_2018_report]
names_2018 = greenwashing_2018["NAME_SCRAPED"]

Expanding the dataset to have one observation per sustainability report and calculating the number of reports per year.

In [None]:
# Number of 2018 report rows per country of domicile.
country_counts_2018 = greenwashing_2018.value_counts("CTRY_OF_DOM_NAME")
print(country_counts_2018)

Converting the 2018 sustainability reports into lists of files

In [None]:
nlp = spacy.load('en_core_web_sm')

# NOTE(review): machine-specific absolute path; the notebook only runs as-is
# on a machine where this folder exists.
input_name = "C:/Users/Jakub/OneDrive - Tilburg University/thesis data/responsibility reports"

# First pass: look up the 2018 report files by company name.
filenames = ExtractFileName(series_names=names_2018, input_dir=input_name, year_str="2018")

# Normalise the names (lower case, underscores, no "?" or "|") and keep those
# that the first pass did not return as a key. `regex=False` removes "?" and
# "|" as literal characters regardless of the pandas version's default.
names_2018_updated = (
    names_2018.str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace("?", "", regex=False)
    .str.replace("|", "", regex=False)
)
names_2018_updated = names_2018_updated.loc[~names_2018_updated.isin(list(filenames.keys()))]

# Second pass for the remaining names, this time with the spaCy model.
filenames_updated = ExtractNameYear(series_names=names_2018_updated, input_dir=input_name, nlp_model=nlp, year_str="2018")

filenames.update(filenames_updated)

Dumping all of the text from PDF files into their respective TXT files for later processing

In [None]:
# Extract the text of the 2018 report files collected in `filenames`;
# "../data/text_processing" is passed as the target location.
ExtractAllText(filenames, "2018", "../data/text_processing")

Computing the greenwashing indicator and writing it to a file

In [None]:
# Compute the 2018 greenwashing indicator from the extracted text files, using
# the climate-detector and specificity pipelines.
# NOTE(review): this rebinds `greenwashing_ind`, replacing the 2017 result in
# memory; the 2017 values remain only in their CSV file.
greenwashing_ind = ComputeGreenInd(names_2018, "../data/text_processing/2018", pipe_class, pipe_spec)

# NOTE(review): written with the index included, unlike the other to_csv calls
# in this notebook, which pass index=False - confirm this is intended.
greenwashing_ind.to_csv("../data/text_processing/greenwashing_ind_2018.csv")