In [1]:
import numpy as np
import pandas as pd
import json
from collections import defaultdict

### Load the data (cleaned)

In [2]:
df = pd.read_json("cleaned_company_data.json")
df.head()

Unnamed: 0,name,sector,website,year_of_founding,employees,gender_breakdown,average_age,social_links,text_blocks,id,...,Description _entities,About Loono_entities,The story of Livesport_entities,What is good to know_entities,Who are they looking for_entities,The beating IT heart of Heidelberg Materials_entities,A great match for the team_entities,A place you never get bored of_entities,About us_entities,What are they looking for ?_entities
0,Filigran,"[Cyber Security, SaaS, Cloud Services, Software]",https://www.filigran.io,2022.0,74.0,"{'women': '37', 'men': '63'}",36.0,{'facebook': 'https://www.facebook.com/Filigra...,{'Presentation': 'Filigran provides cyber thre...,136,...,,,,,,,,,,
1,Atomic Digital Design,[Digital],https://atomicdigital.design,2012.0,70.0,"{'women': '35', 'men': '65'}",31.0,{'instagram': 'https://instagram.com/atomicdig...,"{'Presentation': 'Founded by Antoine Vu, winne...",57,...,,,,,,,,,,
2,Oncrawl,"[Digital Marketing, Data Marketing]",https://www.oncrawl.com,2013.0,34.0,"{'women': '40', 'men': '60'}",31.0,{'linkedin': 'https://www.linkedin.com/company...,{'Presentation': 'Oncrawl provides data for te...,553,...,,,,,,,,,,
3,Inato,"[Health, Pharmaceutical, Biotech, Software]",https://inato.com,2016.0,70.0,"{'women': '56', 'men': '44'}",35.0,{'linkedin': 'https://www.linkedin.com/company...,{'Presentation': 'Inato is a tech-for-good com...,345,...,,,,,,,,,,
4,Sinequa,"[Artificial Intelligence, Machine Learning, Sa...",https://www.sinequa.com,2002.0,,"{'women': '30', 'men': '70'}",36.0,{'linkedin': 'https://www.linkedin.com/company...,{'Presentation': 'Headquartered in Paris with ...,701,...,,,,,,,,,,


In [17]:
df.columns

Index(['name', 'sector', 'website', 'year_of_founding', 'employees',
       'gender_breakdown', 'average_age', 'social_links', 'text_blocks', 'id',
       'Language', 'Presentation_entities',
       'What they are looking for_entities', 'Good to know_entities',
       'Description_entities', 'What they're looking for_entities',
       '4flow is a global firm_entities', 'Become a 4flower!_entities',
       'What you should know_entities', 'Présentation_entities',
       'Company Introduction_entities', 'Who Are They Looking for_entities',
       'Good to Know_entities', 'Le nom B-HIVE _entities',
       'Yves Rocher_entities', 'Petit Bateau_entities',
       'People Connected inspiring meaningful experiences._entities',
       'Unleash your potential in an empowering culture_entities',
       'About_entities', 'Who Are They Looking For_entities',
       'Good To Know_entities', 'Looking for_entities',
       'About company_entities', 'We are looking for..._entities',
       ' _entities'

### Check NaNs

In [23]:
# we don't have any full-NaN rows
df[df.isna().sum(axis=1) == len(df)]

Unnamed: 0,name,sector,website,year_of_founding,employees,gender_breakdown,average_age,social_links,text_blocks,id,...,Description _entities,About Loono_entities,The story of Livesport_entities,What is good to know_entities,Who are they looking for_entities,The beating IT heart of Heidelberg Materials_entities,A great match for the team_entities,A place you never get bored of_entities,About us_entities,What are they looking for ?_entities


In [20]:
print("Length of the data frame: ", len(df))
print("Number of NaNs in colums: ")
print(df.isna().sum().values)

Length of the data frame:  675
Number of NaNs in colums: 
[  0   0   6  12  14   0  70   0   0   0   8  42  45  41 670 673 674 674
 674 672 673 673 672 674 674 674 674 674 671 672 673 674 674 674 673 672
 674 674 674 674 674 674 673 674 674 674 674 674 674 674 674 674 674 674
 674 674 674 674 674 674 674 674 674 674]


We have a lot of colums with only 1 (or several) not NaN values, they are quite specific as represent answers to special qeustions/statements. We can drop them for the futute analysis and better visualisation.

In [39]:
# names of these columns
df.columns[df.isna().sum().values > 669]

Index(['Description_entities', 'What they're looking for_entities',
       '4flow is a global firm_entities', 'Become a 4flower!_entities',
       'What you should know_entities', 'Présentation_entities',
       'Company Introduction_entities', 'Who Are They Looking for_entities',
       'Good to Know_entities', 'Le nom B-HIVE _entities',
       'Yves Rocher_entities', 'Petit Bateau_entities',
       'People Connected inspiring meaningful experiences._entities',
       'Unleash your potential in an empowering culture_entities',
       'About_entities', 'Who Are They Looking For_entities',
       'Good To Know_entities', 'Looking for_entities',
       'About company_entities', 'We are looking for..._entities',
       ' _entities', 'About the company_entities',
       'Who we are looking for_entities', 'It's good to know_entities',
       'Are you the right fit? _entities', 'Life At Philips_entities',
       'O Danone_entities', 'Koho hledají_entities',
       'Who they're looking for_en

In [None]:
# names of regular columns
df.columns[~(df.isna().sum().values > 669)]

Index(['name', 'sector', 'website', 'year_of_founding', 'employees',
       'gender_breakdown', 'average_age', 'social_links', 'text_blocks', 'id',
       'Language', 'Presentation_entities',
       'What they are looking for_entities', 'Good to know_entities'],
      dtype='object')

Let's keep them

In [24]:
df = df[df.columns[df.isna().sum().values < 670]]

In [32]:
df.head(3)

Unnamed: 0,name,sector,website,year_of_founding,employees,gender_breakdown,average_age,social_links,text_blocks,id,Language,Presentation_entities,What they are looking for_entities,Good to know_entities
0,Filigran,"[Cyber Security, SaaS, Cloud Services, Software]",https://www.filigran.io,2022.0,74.0,"{'women': '37', 'men': '63'}",36.0,{'facebook': 'https://www.facebook.com/Filigra...,{'Presentation': 'Filigran provides cyber thre...,136,en,"{'organizations': ['Filigran'], 'locations': [...",{},"{'locations': ['Strong', 'France'], 'organizat..."
1,Atomic Digital Design,[Digital],https://atomicdigital.design,2012.0,70.0,"{'women': '35', 'men': '65'}",31.0,{'instagram': 'https://instagram.com/atomicdig...,"{'Presentation': 'Founded by Antoine Vu, winne...",57,en,"{'people': ['Gabriel Picard', 'Atomic Digital ...",{},"{'people': ['Adidas', 'Cartier', 'Lancôme', 'A..."
2,Oncrawl,"[Digital Marketing, Data Marketing]",https://www.oncrawl.com,2013.0,34.0,"{'women': '40', 'men': '60'}",31.0,{'linkedin': 'https://www.linkedin.com/company...,{'Presentation': 'Oncrawl provides data for te...,553,en,"{'organizations': ['Oncrawl', 'Forbes', 'Lastm...","{'organizations': ['Oncrawl', 'SEO']}",{}


In [33]:
df.isna().sum()

name                                   0
sector                                 0
website                                6
year_of_founding                      12
employees                             14
gender_breakdown                       0
average_age                           70
social_links                           0
text_blocks                            0
id                                     0
Language                               8
Presentation_entities                 42
What they are looking for_entities    45
Good to know_entities                 41
dtype: int64

### Now we can compute some statistics 

In [25]:
# Number of unique companies
print("Number of unique companies: ", df["name"].nunique())

Number of unique companies:  675


#### Average founding year

In [None]:
print("Average founding year: ", np.floor(df['year_of_founding'].mean()))

Average founding year:  2006.0


#### Average age

In [27]:
print("Average employees age: ", np.floor(df['average_age'].mean()))

Average employees age:  32.0


#### Average number of employees

In [28]:
print("Average number of employees: ", np.floor(df['employees'].mean()))

Average number of employees:  2463.0


### Let's explore some Gender Biases

In [29]:
tmp = {"men": 0, "women": 0}
total = 0

# count the sum of ratios
for item in df['gender_breakdown']:
    if item['men'] and item['women']:
        tmp['men'] += int(item['men'])
        tmp['women'] += int(item['women'])
        total += 1

print("Average male percentage:", tmp['men'] / total)
print("Average female percentage:", tmp['women'] / total)

print("No data in ", len(df) - total, "cases")

Average male percentage: 58.06054279749478
Average female percentage: 41.93945720250522
No data in  196 cases


### Now, Let's take a look at sectors

In [30]:
# count occurences for each sector
tmp = defaultdict(int)
for s in df['sector']:
    for it in s:
        tmp[it] += 1

# print top-10 most popular sectors
print("Top-10 most popular sectors (asc.): ")
print(pd.DataFrame(sorted(tmp.items(), key=lambda item: item[1], reverse=True)[:10], columns=["sector", "count"]))

Top-10 most popular sectors (asc.): 
                    sector  count
0                     SaaS    198
1           Cloud Services    198
2                 Software    160
3                  Digital     98
4  Artificial Intelligence     96
5         Machine Learning     96
6              Mobile Apps     73
7                       IT     68
8                  FinTech     62
9                InsurTech     62


#### And also the main language of a company

In [31]:
langs = defaultdict(int)

for lang in df['Language']:
    if lang:
        langs[lang] += 1
print(*langs.items())

('en', 627) ('cs', 4) ('vi', 1) ('fr', 34) (nan, 8) ('de', 1)
