## 1. Setup and Import libraries

In [1]:
# django path
# The location of the Django project differs per machine, so it can be
# overridden through the MYSITE_PATH environment variable; when the variable
# is not set, the original local path is used unchanged.
import os

mysite_path = os.environ.get(
    "MYSITE_PATH",
    "C:\\Data\\UCL\\@MSc Project\\DB\\mysite\\",
)

In [2]:
# standard packages
import os
import sys
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

import django
from set_django_db import set_django_db
from asgiref.sync import sync_to_async

from IPython.core.display import HTML

%matplotlib inline

In [3]:
# Hand the project path to set_django_db (presumably this configures Django
# for that project - confirm in set_django_db), then import the ORM models
# used throughout the notebook.
set_django_db(mysite_path)
from tables_daniel.models import Company, Review

# Required specifically when running in a Jupyter notebook.
# NOTE(review): presumably this lifts Django's guard against synchronous ORM
# calls while an event loop is running - confirm against the Django docs.
os.environ.update({"DJANGO_ALLOW_ASYNC_UNSAFE": "true"})

In [4]:
# center plots
# Inject a stylesheet targeting the `.output_png` class. The CSS property is
# spelled `vertical-align` (hyphen); `vertical_align` is not a CSS property,
# so browsers silently dropped that declaration.
HTML("""
<style>
.output_png {
    display: table-cell;
    text-align: center;
    vertical-align: middle;
}
</style>
""")

## 2. Load, merge and filter the datasets

<hr>

**Content**

    2.1 Load companies
    2.2 Load reviews
    2.3 Some useful merges/adds
    2.4 Filter the data from the monitored period between 2018-07-01 and 2020-06-30
    2.5 Filter only the reviews for the companies with at least 10 reviews

### 2.1 Companies

In [5]:
# Pull the selected company columns from the database into a DataFrame.
company_fields = ('id', 'Company', 'Sector', 'ListedOn')
companies = pd.DataFrame(list(Company.objects.values(*company_fields)))

# Primary keys of all loaded companies, used to query their reviews.
companies_id = list(companies['id'])

### 2.2 Reviews

In [6]:
# One queryset of review rows (as dicts) per company id.
reviews = [
    Review
    .objects
    .values(
        'id', 'Company_id', 'ReviewTitle', 'Rating',
        'JobTitle', 'EmployeeRelationship',
        'Contract', 'Pros', 'Cons'
    )
    .all()
    .filter(Company_id=company_id)
    for company_id in companies_id
]

# Flatten the per-company results in a single pass. The previous
# sum(list_of_lists, []) re-copied the accumulated list on every addition,
# which is quadratic in the number of reviews.
reviews_df = pd.DataFrame(
    [review for company_reviews in reviews for review in company_reviews]
).drop_duplicates()

In [20]:
# One-off manual write to the database: set JobTitle to 'Former Employee'
# for the review in the first row of reviews_df. The cell's output is the
# value returned by update().
row = dict(reviews_df.iloc[0])
first_review_id = row['id']

(
    Review
    .objects
    .filter(id=first_review_id)
    .update(JobTitle='Former Employee')
)

1

In [None]:
# For every review in reviews_df: when the stored 'JobTitle' holds an
# employment status ('Former Employee' / 'Current Employee'), swap the values
# of 'JobTitle' and 'EmployeeRelationship' in the database.
# NOTE: this cell writes to the database, one UPDATE per affected review.
employment_statuses = ['Former Employee', 'Current Employee']

for review_id in reviews_df['id']:
    # read the current database values, not the (possibly stale) DataFrame copy
    review = (
        Review
        .objects
        .values('id', 'JobTitle', 'EmployeeRelationship')
        .get(id=review_id)
    )

    # the original condition `review 'JobTitle' in [...]` was a syntax error
    if review['JobTitle'] in employment_statuses:
        new_jobTitle = review['EmployeeRelationship']
        new_relationship = review['JobTitle']

        (Review
         .objects
         .filter(id=review_id)
         .update(
             JobTitle=new_jobTitle,
             EmployeeRelationship=new_relationship
         )
        )

{'id': 24966, 'JobTitle': 'Former Employee', 'EmployeeRelationship': 'Anonymous Employee'}
{'id': 24967, 'JobTitle': 'Former Employee', 'EmployeeRelationship': 'Software Engineer'}
{'id': 24968, 'JobTitle': 'Current Employee', 'EmployeeRelationship': 'Manager/In'}
{'id': 24969, 'JobTitle': 'Former Employee', 'EmployeeRelationship': 'Sales Assistant'}
{'id': 24970, 'JobTitle': 'Former Employee', 'EmployeeRelationship': 'Questions Solving'}
{'id': 24971, 'JobTitle': 'Former Employee', 'EmployeeRelationship': 'Retail Sales Associate'}
{'id': 24972, 'JobTitle': 'Former Employee', 'EmployeeRelationship': 'Team Lead'}
{'id': 24973, 'JobTitle': 'Current Employee', 'EmployeeRelationship': 'Supervisor'}
{'id': 24974, 'JobTitle': 'Current Employee', 'EmployeeRelationship': 'Business Solutions Manager'}
{'id': 24975, 'JobTitle': 'Former Employee', 'EmployeeRelationship': 'Senior Manager'}


In [None]:
# Load the selected review columns in a single query; `reviews` is a flat
# list with one dict per review.
reviews = list(
    Review
    .objects
    .values('Company_id', 'ReviewTitle', 'Rating', 'Pros', 'Cons', 'Year', 'Month', 'Day',
            'JobTitle', 'Contract', 'EmployeeRelationship')
    .all()
)

# Build the frame directly from the list of dicts. The earlier
# sum([list(r) for r in reviews], []) was copied from the per-company version
# of this cell; applied to dicts it collects their keys, so the DataFrame
# ended up holding column-name strings instead of review rows.
reviews_df = pd.DataFrame(reviews).drop_duplicates()

### 2.3 Some useful merges/adds

In [36]:
# add sector and company name
# Only 'id' needs renaming to match the join key; 'Company' and 'Sector'
# keep their names. merge() defaults to an inner join on 'Company_id'.
reviews_df = reviews_df.merge(
    companies[['id', 'Company', 'Sector']].rename(columns={'id': 'Company_id'}),
    on='Company_id'
)

# add date column used for filtering
# Vectorised string concatenation builds the same 'Y-M-D' strings as a
# row-wise apply, without calling a Python function for every row.
reviews_df['Date'] = (
    reviews_df['Year'].astype(str)
    + '-' + reviews_df['Month'].astype(str)
    + '-' + reviews_df['Day'].astype(str)
)

reviews_df

Unnamed: 0,Company_id,ReviewTitle,Rating,Pros,Cons,Year,Month,Day,JobTitle,Contract,EmployeeRelationship,Company,Sector,Date
0,1,Part Time,4.0,"Friendly colleagues, willing to give you disco...",There are some politics going on (like any oth...,2020,7,1,Former Employee,part-time,Anonymous Employee,Adidas,Consumer Cyclical,2020-7-1
1,1,Terrible experience,1.0,Some nice welcoming colleagues and benefits (M...,"Really poor culture and collaboration spirit, ...",2020,7,1,Former Employee,full-time,Software Engineer,Adidas,Consumer Cyclical,2020-7-1
2,1,Top,5.0,"Work-Life Balance, nice offices, top sport","Global company, long decision making",2020,6,30,Current Employee,full-time,Manager/In,Adidas,Consumer Cyclical,2020-6-30
3,1,Sales Assistant,4.0,Staff helpfulPay was goof Staff are respectuful,Might get a little hectic,2020,6,29,Former Employee,part-time,Sales Assistant,Adidas,Consumer Cyclical,2020-6-29
4,1,My experience is too maintain the task and group,2.0,My biggest dream is to work with Adidas family,Difficult to maintain but once know easy to ha...,2020,6,28,Former Employee,part-time,Questions Solving,Adidas,Consumer Cyclical,2020-6-28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
402517,649,Review,4.0,"Work-life balance, great if you have a young f...",Lack of structured career progression,2018,8,27,Former Employee,full-time,Anonymous Employee,WPP plc,Communication Services,2018-8-27
402518,649,Engaging Projects,4.0,"Flexibility, Responsibility, Authority to get ...",Working across agencies is fun but rarely stru...,2018,8,18,Former Employee,full-time,Senior Project Manager,WPP plc,Communication Services,2018-8-18
402519,649,Healthcare and 401k are a joke,1.0,none - 401k match is extremely low,"401k has a tiny match, why bother?Healthcare i...",2018,8,16,Current Employee,full-time,Anonymous Employee,WPP plc,Communication Services,2018-8-16
402520,649,Where hacks go to die.,1.0,Lack of talent is considered a plus.,"Impersonal, low paying, account folk rule the ...",2018,7,17,Former Employee,full-time,Anonymous Employee,WPP plc,Communication Services,2018-7-17


In [37]:
def string_to_date(date_str):
    """Parse a 'YYYY-M-D' string into a datetime.

    Parameters
    ----------
    date_str : str
        Date text in '%Y-%m-%d' format (zero padding is optional).

    Returns
    -------
    datetime
        The parsed date, or the sentinel datetime(1800, 1, 1) when the value
        cannot be parsed (malformed text or a non-string such as None).
    """
    try:
        return datetime.strptime(date_str, '%Y-%m-%d')
    # Catch only parsing failures; a bare `except:` would also swallow
    # KeyboardInterrupt and SystemExit.
    except (ValueError, TypeError):
        return datetime.strptime('1800-1-1', '%Y-%m-%d')
    
def string_to_YM(date_str):
    """Parse a 'YYYY-M' string into a datetime on the first day of that month.

    Parameters
    ----------
    date_str : str
        Year-month text in '%Y-%m' format (zero padding is optional).

    Returns
    -------
    datetime
        The parsed month (day is 1), or the sentinel datetime(1800, 1, 1)
        when the value cannot be parsed (malformed text or a non-string).
    """
    try:
        return datetime.strptime(date_str, '%Y-%m')
    # Catch only parsing failures; a bare `except:` would also swallow
    # KeyboardInterrupt and SystemExit.
    except (ValueError, TypeError):
        return datetime.strptime('1800-1-1', '%Y-%m-%d')

# Convert the 'Y-M-D' strings in 'Date' into datetime objects.
reviews_df['Date'] = reviews_df['Date'].apply(string_to_date)

# Month-level timestamp assembled from the 'Year' and 'Month' columns.
reviews_df['Year-Month'] = reviews_df.apply(
    lambda row: string_to_YM(str(row['Year']) + '-' + str(row['Month'])),
    axis=1,
)

### 2.4 Filter the data from the monitored period between 2018-07-01 and 2020-06-30

In [38]:
# keep only the reviews dated within the monitored period (both bounds inclusive)
min_date = datetime(2018, 7, 1)
max_date = datetime(2020, 6, 30)

in_period = (reviews_df['Date'] >= min_date) & (reviews_df['Date'] <= max_date)
reviews_df = pd.DataFrame(reviews_df[in_period])

reviews_df

Unnamed: 0,Company_id,ReviewTitle,Rating,Pros,Cons,Year,Month,Day,JobTitle,Contract,EmployeeRelationship,Company,Sector,Date,Year-Month
2,1,Top,5.0,"Work-Life Balance, nice offices, top sport","Global company, long decision making",2020,6,30,Current Employee,full-time,Manager/In,Adidas,Consumer Cyclical,2020-06-30,2020-06-01
3,1,Sales Assistant,4.0,Staff helpfulPay was goof Staff are respectuful,Might get a little hectic,2020,6,29,Former Employee,part-time,Sales Assistant,Adidas,Consumer Cyclical,2020-06-29,2020-06-01
4,1,My experience is too maintain the task and group,2.0,My biggest dream is to work with Adidas family,Difficult to maintain but once know easy to ha...,2020,6,28,Former Employee,part-time,Questions Solving,Adidas,Consumer Cyclical,2020-06-28,2020-06-01
5,1,great job,4.0,cool team fun work environment great company,dealing with some difficult customers,2020,6,28,Former Employee,part-time,Retail Sales Associate,Adidas,Consumer Cyclical,2020-06-28,2020-06-01
6,1,Sales Floor Team lead,3.0,employee discount 40-60% most items excluding ...,"low paid job, almost much minimum wage to star...",2020,6,27,Former Employee,full-time,Team Lead,Adidas,Consumer Cyclical,2020-06-27,2020-06-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
402516,649,Don't even consider going there,1.0,- There are some talented and nice people,- Lives up to all the bad stereotypes of agenc...,2018,8,28,Former Employee,full-time,Account Director,WPP plc,Communication Services,2018-08-28,2018-08-01
402517,649,Review,4.0,"Work-life balance, great if you have a young f...",Lack of structured career progression,2018,8,27,Former Employee,full-time,Anonymous Employee,WPP plc,Communication Services,2018-08-27,2018-08-01
402518,649,Engaging Projects,4.0,"Flexibility, Responsibility, Authority to get ...",Working across agencies is fun but rarely stru...,2018,8,18,Former Employee,full-time,Senior Project Manager,WPP plc,Communication Services,2018-08-18,2018-08-01
402519,649,Healthcare and 401k are a joke,1.0,none - 401k match is extremely low,"401k has a tiny match, why bother?Healthcare i...",2018,8,16,Current Employee,full-time,Anonymous Employee,WPP plc,Communication Services,2018-08-16,2018-08-01


### 2.5 Filter only the reviews for companies with at least 10 reviews

In [39]:
# count reviews
# Number of reviews with a non-null Rating per company.
reviews_count = (
    reviews_df
    .groupby('Company')
    .Rating
    .count()
)

# filter companies
# The section title asks for companies with *at least* 10 reviews, so the
# threshold is inclusive; the previous `> 10` dropped companies with exactly
# 10 reviews.
min_reviews = 10
companies_filtered = list(reviews_count[reviews_count >= min_reviews].index)

In [40]:
# Keep only the reviews whose company is in companies_filtered.
keep_company = reviews_df['Company'].isin(companies_filtered)
reviews_df = reviews_df[keep_company]

# Report how many reviews remain.
n_reviews = reviews_df.shape[0]
print(f"There are {n_reviews:.0f} reviews in total.")

There are 391869 reviews in total.


## 3. Summary statistics

<hr>

**Content**
    
    3.1 Distribution of reviews over former/current and part/full-time employees

### 3.1 Distribution of reviews over former/current and part/full-time employees

In [49]:
# preview the first five rows of the filtered reviews
reviews_df.head(n=5)

Unnamed: 0,Company_id,ReviewTitle,Rating,Pros,Cons,Year,Month,Day,JobTitle,Contract,EmployeeRelationship,Company,Sector,Date,Year-Month
2,1,Top,5.0,"Work-Life Balance, nice offices, top sport","Global company, long decision making",2020,6,30,Current Employee,full-time,Manager/In,Adidas,Consumer Cyclical,2020-06-30,2020-06-01
3,1,Sales Assistant,4.0,Staff helpfulPay was goof Staff are respectuful,Might get a little hectic,2020,6,29,Former Employee,part-time,Sales Assistant,Adidas,Consumer Cyclical,2020-06-29,2020-06-01
4,1,My experience is too maintain the task and group,2.0,My biggest dream is to work with Adidas family,Difficult to maintain but once know easy to ha...,2020,6,28,Former Employee,part-time,Questions Solving,Adidas,Consumer Cyclical,2020-06-28,2020-06-01
5,1,great job,4.0,cool team fun work environment great company,dealing with some difficult customers,2020,6,28,Former Employee,part-time,Retail Sales Associate,Adidas,Consumer Cyclical,2020-06-28,2020-06-01
6,1,Sales Floor Team lead,3.0,employee discount 40-60% most items excluding ...,"low paid job, almost much minimum wage to star...",2020,6,27,Former Employee,full-time,Team Lead,Adidas,Consumer Cyclical,2020-06-27,2020-06-01


In [34]:
# number of reviews (non-null Company values) per distinct JobTitle
job_title_counts = reviews_df.groupby('JobTitle')['Company'].count()
print(job_title_counts)

JobTitle
                                                        8
"Delegado comercial, área internacional"                1
"Marketing"                                             1
"Sales, Marketing and Business Intelligence Analyst"    1
(Classified)                                            1
                                                       ..
Yield Engineer                                          4
Yield Enhancement Engineer                              1
YouTube                                                 1
genious                                                 2
microarchitect                                          1
Name: Company, Length: 5640, dtype: int64
