In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from yellowbrick.cluster import SilhouetteVisualizer
from yellowbrick.datasets import load_nfl
from yellowbrick.cluster import KElbowVisualizer
from itertools import permutations

In [None]:
# Read in the data
df_firm_data = pd.read_csv('../Daten/FDS/Firm_Data.csv', index_col=0)
df_id_data = pd.read_csv('../Daten/FDS/ID_Data.csv', index_col=0)
df_mc_data = pd.read_csv('../Daten/FDS/MC_Data.csv', index_col=0)
df_return_data = pd.read_csv('../Daten/FDS/Return_Data.csv', index_col=0)

In [None]:
def show_data(df_firm_data, df_id_data, df_mc_data, df_return_data):
    #show heads if true
    print("firm data")
    display(df_firm_data.head())
    print("id data")
    display(df_id_data.head())
    print("mc data (market cap)")
    display(df_mc_data.head())
    print("return data")
    display(df_return_data.head())


In [None]:
#make the dataframes a little bit more readable
def make_readable(df_firm_data, df_id_data, df_mc_data, df_return_data):
    """- sets date as index where suitaeble"""
    df_mc_data.index = pd.to_datetime(df_mc_data["Date"])
    df_return_data.index = pd.to_datetime(df_return_data["Date"])
    #drop date column
    df_mc_data.drop("Date", axis=1, inplace=True)
    df_return_data.drop("Date", axis=1, inplace=True)
    #sort index
    df_mc_data.sort_index(inplace=True)
    df_return_data.sort_index(inplace=True)
    return df_firm_data, df_id_data, df_mc_data, df_return_data
    
df_firm_data, df_id_data, df_mc_data, df_return_data = make_readable(df_firm_data, df_id_data, df_mc_data, df_return_data)

In [None]:
show_data(df_firm_data, df_id_data, df_mc_data, df_return_data)

In [16]:
def make_df_visualize_without_clustering(df_firm_data, df_id_data, df_mc_data, df_return_data, visualize=True,
                                         show_time=False, color_what = "Sector"):
    """
    - makes a dataframe for visualization without clustering
    - if visualize is true, it will show the data
    - it visualizes calculates the return, risk, and esg score
    - the colorization can by set, by default it is set to the industry
    - the company name of a datapoint is shown when hovering over it
    - the first visualization is a 3D scatterplot with the avg return, avg risk, and avg esg score, and the 
    colorization is set to the Sector by default, can be changed with the color_what parameter
    - it is also possible to visualize the data in 3d with the axes ESG, Return, and time if show_time is set to true
    - if plot hyperplane is True it will also plot the hyperplane with the "best" companies
    """
    #base data
    df_company = df_id_data.copy()
    #calculate the average return in a dictionary (use arethmetic mean)
    dict_return_mean = {}
    dict_risk_mean = {}
    dict_esg_mean = {}

    for column in df_return_data.columns:
        if column != "Date":
            dict_return_mean[column] = df_return_data[column].mean()
            dict_risk_mean[column] = df_return_data[column].std()
            dict_esg_mean[column] = df_firm_data[df_firm_data.index == column]["ESG Score"].mean()

    #add the data to the dataframe
    df_return_esg_time = df_firm_data.copy()
    #remove all columns except the date, esg score, and return
    df_return_esg_time.drop(df_return_esg_time.columns.difference(["Date", "ESG Score", "Return"]), 1, inplace=True)    
    #remove rows with nan values
    df_return_esg_time.dropna(inplace=True)
    #save indes as column
    df_return_esg_time["Company"] = df_return_esg_time.index
    
    #get value baes on index and date from df_return_data
    def get_value(index, date):

        try:
            return df_return_data.loc[date][index]
        except:
            return np.nan
    return df_return_esg_time

    #remove extreme outliers
    df_company = df_company[df_company["avg_return"] < 3*df_company["avg_return"].median()]
    
    #create a new dataframe with the return and esg score on a daily basis
    df_return_esg = df_return_data.copy()
 

    
    #visualize the data
    if visualize:
        #make the 3d scatterplot
        fig = px.scatter_3d(df_company, x="avg_return", y="avg_risk", z="avg_esg", color=color_what,
                            hover_name="Name")
        #make smaller points
        fig.update_traces(marker=dict(size=4))
        #add hyperplane, dont use go since it is not working

        
        fig.show()
        #make the 3d scatterplot with time
        if show_time:
            fig = px.scatter_3d(df_company, x="avg_return", y="avg_risk", z="avg_esg", color=color_what, 
                                hover_name="Company Name", animation_frame="Date")
            fig.show()
    return df_company
    
    

In [17]:
make_df_visualize_without_clustering(df_firm_data, df_id_data, df_mc_data, df_return_data,color_what = "Country", visualize=True, show_time=True)

Unnamed: 0_level_0,Date,ESG Score,Company
RIC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BVIC.L,31/12/2009,51.267135,BVIC.L
BVIC.L,31/12/2010,50.550242,BVIC.L
BVIC.L,31/12/2011,46.732870,BVIC.L
BVIC.L,31/12/2012,57.941343,BVIC.L
BVIC.L,31/12/2013,49.513243,BVIC.L
...,...,...,...
PUMG.DE,31/12/2021,88.195666,PUMG.DE
AKSEN.IS,31/12/2018,43.092015,AKSEN.IS
AKSEN.IS,31/12/2019,36.847893,AKSEN.IS
AKSEN.IS,31/12/2020,41.214852,AKSEN.IS
