## Settings

In [1]:
# Location of the reviews CSV that the loading cell passes to pd.read_csv.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# will not run on another machine without editing it.
reviews_path = 'C:\\Data\\UCL\\@MSc Project - Data and sources\\reviews.csv'

In [2]:
import numpy as np
import pandas as pd

from os.path import join
from datetime import datetime

## Load the data

In [3]:
# Read the raw reviews and restrict them to the study period.
reviews = pd.read_csv(reviews_path)

# Inclusive bounds of the analysis window (both parsed as midnight timestamps).
min_date = datetime.strptime('2018-7-1', '%Y-%m-%d')
max_date = datetime.strptime('2020-6-30', '%Y-%m-%d')
reviews['Date'] = pd.to_datetime(reviews.Date)

# NOTE(review): max_date is midnight on 2020-06-30, so rows carrying a later
# time of day on that date would be excluded — confirm 'Date' holds dates only.
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignments in later cells do not trigger SettingWithCopyWarning.
in_window = (reviews.Date >= min_date) & (reviews.Date <= max_date)
reviews = reviews.loc[in_window].copy()

In [13]:
# Join the 'Pros' and 'Cons' columns into one text column, separated by a space.
# NOTE(review): if either column is missing (NaN) for a row, the concatenation
# presumably yields NaN for that row — confirm whether such rows should instead
# keep the non-missing half.
reviews['Review'] = reviews['Pros'] + ' ' + reviews['Cons']

def length(review):
    """Return the number of whitespace-separated tokens in ``review``.

    Parameters
    ----------
    review : object
        Review text. Values that have no ``split`` method (for example
        ``None`` or a float ``NaN``) are treated as an empty review.

    Returns
    -------
    int
        ``len(review.split())``, or 0 when ``review`` has no ``split`` method.
    """
    try:
        return len(review.split())
    except AttributeError:
        # Only the "not a string" case counts as an empty review. Anything else
        # (e.g. KeyboardInterrupt) propagates instead of being silently swallowed.
        return 0

# Token count of every combined review, computed element-wise with length().
reviews['ReviewLength'] = reviews['Review'].apply(length)

## ==FUNCTIONS==

In [4]:
# helper quantile/quartile functions

def q1(x):
    """Return the first quartile (0.25 quantile) of ``x`` via ``x.quantile``.

    The function name is used by ``agg`` as the label of the resulting column.
    """
    return x.quantile(q=0.25)

def q3(x):
    """Return the third quartile (0.75 quantile) of ``x`` via ``x.quantile``.

    The function name is used by ``agg`` as the label of the resulting column.
    """
    return x.quantile(q=0.75)

def q10(x):
    """Return the 10th percentile (0.1 quantile) of ``x`` via ``x.quantile``.

    The function name is used by ``agg`` as the label of the resulting column.
    """
    return x.quantile(q=0.1)

def q90(x):
    """Return the 90th percentile (0.9 quantile) of ``x`` via ``x.quantile``.

    The function name is used by ``agg`` as the label of the resulting column.
    """
    return x.quantile(q=0.9)

## 1. Review length by market sector

In [15]:
# Summary statistics of ReviewLength for each value of 'Sector'.
length_by_sector = reviews.groupby('Sector')['ReviewLength'].agg(
    ['mean', 'std', q10, q1, 'median', q3, q90]
)
print(length_by_sector)

                             mean        std   q10  q1  median  q3    q90
Sector                                                                   
Basic Materials         34.600736  50.252710  11.0  13      19  35   70.0
Communication Services  35.133511  55.931461  11.0  13      18  34   72.0
Consumer Cyclical       32.322703  48.931872  11.0  13      18  32   64.0
Consumer Defensive      30.932061  46.808121  11.0  13      17  30   60.0
Energy                  30.277014  46.995151  11.0  13      17  29   57.6
Financial Services      31.429954  47.688007  11.0  13      17  30   61.0
Healthcare              34.833119  50.321192  11.0  13      18  35   73.0
Industrials             35.875773  56.412888  11.0  13      19  35   74.0
None                    12.111111   1.763834  10.8  11      12  13   13.6
Real Estate             46.837552  66.063087  11.0  14      23  52  104.0
Technology              31.986482  50.599223  11.0  13      17  31   63.0
Utilities               38.205475  59.

## 2. Review length by stock market index

In [17]:
# Summary statistics of ReviewLength for each value of 'ListedOn'.
length_by_listing = reviews.groupby('ListedOn')['ReviewLength'].agg(
    ['mean', 'std', q10, q1, 'median', q3, q90]
)
print(length_by_listing)

                    mean        std  q10  q1  median  q3  q90
ListedOn                                                     
EURO STOXX 50  29.658994  40.917837   11  13      17  30   58
FTSE 100       29.700024  42.879776   11  12      17  29   58
S&P 500        33.367618  51.738888   11  13      18  32   67


## 3. Review length by contract type

In [29]:
def update_Contract(x):
    """Collapse a contract label to 'full-time', 'part-time' or 'Not specified'.

    The two recognised labels are returned unchanged; every other value is
    mapped to 'Not specified'. Matching is exact and case-sensitive.
    """
    return x if x in ('full-time', 'part-time') else 'Not specified'

# Recode 'Contract' element-wise so unrecognised labels become 'Not specified'.
reviews['Contract'] = reviews['Contract'].apply(update_Contract)

In [34]:
# Summary statistics of ReviewLength for each value of 'Contract'.
length_by_contract = reviews.groupby('Contract')['ReviewLength'].agg(
    ['mean', 'std', q10, q1, 'median', q3, q90]
)
print(length_by_contract)

                    mean        std  q10  q1  median  q3  q90
Contract                                                     
Not specified  29.477666  45.990413   11  12      16  28   58
full-time      33.791182  52.139903   11  13      18  33   68
part-time      28.762779  41.832570   11  12      17  28   54


## 4. Review length by employee relationship

In [35]:
def update_EmployeeRelationship(x):
    """Collapse a relationship label to one of three categories.

    'Current Employee' and 'Former Employee' are returned unchanged; every
    other value is mapped to 'Not specified'. Matching is exact and
    case-sensitive.
    """
    return x if x in ('Current Employee', 'Former Employee') else 'Not specified'
    
# Recode 'EmployeeRelationship' element-wise so unrecognised labels become 'Not specified'.
reviews['EmployeeRelationship'] = reviews['EmployeeRelationship'].apply(update_EmployeeRelationship)

In [36]:
# Summary statistics of ReviewLength for each value of 'EmployeeRelationship'.
length_by_relationship = reviews.groupby('EmployeeRelationship')['ReviewLength'].agg(
    ['mean', 'std', q10, q1, 'median', q3, q90]
)
print(length_by_relationship)

                           mean        std  q10  q1  median  q3  q90
EmployeeRelationship                                                
Current Employee      31.910563  46.631240   11  13      18  31   64
Former Employee       34.577048  55.976625   11  13      18  33   69
Not specified         29.478771  45.992825   11  12      16  28   58


In [37]:
# Summary statistics of ReviewLength over all reviews (no grouping).
overall_length_stats = reviews['ReviewLength'].agg(
    ['mean', 'std', q10, q1, 'median', q3, q90]
)
print(overall_length_stats)

mean      32.846623
std       50.505398
q10       11.000000
q1        13.000000
median    18.000000
q3        32.000000
q90       66.000000
Name: ReviewLength, dtype: float64
