## Settings

In [8]:
reviews_path = 'C:\\Data\\UCL\\@MSc Project - Data and sources\\reviews.csv'

bond_dataset_path = 'C:\\Data\\UCL\\@MSc Project - Data and sources\\List of bonds'

bond_datasets = ['Bond_dataset_new', 'Bond_EURO_dataset', 'Bond_FTSE_dataset']

In [12]:
import numpy as np
import pandas as pd

from os.path import join
from datetime import datetime

## Load the data

**Reviews**

In [11]:
reviews = pd.read_csv(reviews_path)

min_date = datetime.strptime('2018-7-1', '%Y-%m-%d')
max_date = datetime.strptime('2020-6-30', '%Y-%m-%d')
reviews['Date'] = pd.to_datetime(reviews.Date)

reviews = reviews[(reviews.Date >= min_date) & (reviews.Date <= max_date)]

**Bond dataset**

In [22]:
bonds_dataset = {
    dataset: pd.read_csv(join(bond_dataset_path, dataset + '.csv')) for dataset in bond_datasets
}

bonds = sum(
    [list(dataset.Company.unique()) for dataset in bonds_dataset.values()], []
)

## Filter the data

In [23]:
reviews = reviews[reviews.Company.isin(bonds)]

### ==Functions==

In [25]:
# helper quantile/quartile functions

def q1(x):
    return x.quantile(.25)

def q3(x):
    return x.quantile(.75)

def q10(x):
    return x.quantile(.1)

def q90(x):
    return x.quantile(.9)

## 1. Market indices

In [42]:
print(
    reviews
    .groupby('ListedOn')
    .Rating
    .agg(['count', 'mean', 'std', q1, 'median', q3])
)

                count      mean       std   q1  median   q3
ListedOn                                                   
EURO STOXX 50    3235  4.126121  1.020892  4.0     4.0  5.0
FTSE 100         5341  3.735068  1.142155  3.0     4.0  5.0
S&P 500        274426  3.595323  1.236797  3.0     4.0  5.0


## 2. Market sectors

In [43]:
print(
    reviews
    .groupby('Sector')
    .Rating
    .agg(['count', 'mean', 'std', q1, 'median', q3])
)

                        count      mean       std   q1  median   q3
Sector                                                             
Basic Materials          2498  3.595677  1.196315  3.0     4.0  5.0
Communication Services  18781  3.572919  1.276506  3.0     4.0  5.0
Consumer Cyclical       71644  3.660837  1.208333  3.0     4.0  5.0
Consumer Defensive      29423  3.349013  1.262811  3.0     3.0  4.0
Energy                   3085  3.607455  1.203059  3.0     4.0  5.0
Financial Services      43584  3.587899  1.215689  3.0     4.0  5.0
Healthcare              25564  3.443358  1.299782  3.0     4.0  5.0
Industrials             23060  3.547962  1.283081  3.0     4.0  5.0
Real Estate              1779  3.664980  1.423791  3.0     4.0  5.0
Technology              61954  3.764858  1.167811  3.0     4.0  5.0
Utilities                1630  3.639877  1.305325  3.0     4.0  5.0


## 3. Employees

In [40]:
def update_EmployeeRelationship(x):
    if x not in ['Current Employee', 'Former Employee']:
        return 'Not specified'
    else:
        return x
    
def update_Contract(x):
    if x not in ['full-time', 'part-time']:
        return 'Not specified'
    else:
        return x

reviews['EmployeeRelationship'] = reviews['EmployeeRelationship'].apply(lambda x: update_EmployeeRelationship(x))
reviews['Contract'] = reviews['Contract'].apply(lambda x: update_Contract(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [41]:
print(
    reviews
    .groupby('Contract')
    .Rating
    .agg(['count', 'mean', 'std', q1, 'median', q3])
)

                count      mean       std   q1  median   q3
Contract                                                   
Not specified   13346  3.500375  1.294844  3.0     4.0  5.0
full-time      226117  3.617512  1.241071  3.0     4.0  5.0
part-time       43539  3.565769  1.175984  3.0     4.0  4.0


In [38]:
print(
    reviews
    .groupby('EmployeeRelationship')
    .Rating
    .agg(['count', 'mean', 'std', q1, 'median', q3])
)

                       count      mean       std   q1  median   q3
EmployeeRelationship                                              
Current Employee      157649  3.792723  1.164325  3.0     4.0  5.0
Former Employee       112008  3.350796  1.274997  3.0     4.0  4.0
Not specified          13345  3.500337  1.294885  3.0     4.0  5.0


In [32]:
print(
    reviews
    .Rating
    .agg(['count', 'mean', 'std', q1, 'median', q3])
)

count     283002.000000
mean           3.604028
std            1.234241
q1             3.000000
median         4.000000
q3             5.000000
Name: Rating, dtype: float64


In [35]:
reviews.Contract.unique()

array(['full-time', 'part-time', nan], dtype=object)