In [None]:
import numpy as np
import pandas as pd
from collections import defaultdict

Load the data

In [2]:
df = pd.read_json("data/cleaned_company_data.json")
df.head()

Unnamed: 0,name,sector,website,year_of_founding,employees,gender_breakdown,average_age,social_links,text_blocks,id,Language
0,Basikon,"[FinTech, InsurTech, Finance, Software]",http://www.basikon.com,2019.0,36.0,"{'women': None, 'men': None}",,{'linkedin': 'https://www.linkedin.com/company...,"{'Presentation': 'Founded in 2019, Basikon's m...",38,en
1,Kameleoon,"[Big Data, SaaS, Cloud Services]",http://www.kameleoon.com/fr,2012.0,135.0,"{'women': '50', 'men': '50'}",29.0,{'facebook': 'https://www.facebook.com/kameleo...,{'Presentation': 'Kameleoon empowers brands to...,583,en
2,APIDAY,"[Artificial Intelligence, Machine Learning, Sa...",https://apiday.com,2021.0,39.0,"{'women': '49', 'men': '51'}",30.0,{'linkedin': 'https://www.linkedin.com/company...,{'Presentation': 'Apiday is a next-generation ...,76,en
3,Ever Dye,"[Fashion, SocialTech, GreenTech, Specialised E...",https://everdye.fr,2021.0,16.0,"{'women': '55', 'men': '45'}",32.0,{'linkedin': 'https://www.linkedin.com/company...,{'Presentation': 'Ever Dye is on a mission to ...,150,en
4,Vulog,"[Collaborative Economy, Mobility, SaaS, Cloud ...",https://www.vulog.com,2006.0,100.0,"{'women': None, 'men': None}",35.0,{'instagram': 'https://instagram.com/vulog_tec...,{'Presentation': 'Vulog is the world's leading...,838,en


In [3]:
df.columns

Index(['name', 'sector', 'website', 'year_of_founding', 'employees',
       'gender_breakdown', 'average_age', 'social_links', 'text_blocks', 'id',
       'Language'],
      dtype='object')

In [4]:
df.isna().sum().values

array([ 0,  0,  6, 12, 14,  0, 70,  0,  0,  0, 19])

In [5]:
df.columns[df.isna().sum().values > 669]

Index([], dtype='object')

In [6]:
df.columns[~(df.isna().sum().values > 669)]

Index(['name', 'sector', 'website', 'year_of_founding', 'employees',
       'gender_breakdown', 'average_age', 'social_links', 'text_blocks', 'id',
       'Language'],
      dtype='object')

Full NaN rows:

In [10]:
df[df.isna().sum(axis=1) == df.shape[1]]

Unnamed: 0,name,sector,website,year_of_founding,employees,gender_breakdown,average_age,social_links,text_blocks,id,Language


In [11]:
len(df.columns[df.isna().sum().values != len(df)])

11

In [12]:
df["name"].nunique()

675

Average founding year

In [13]:
np.floor(df['year_of_founding'].mean())

np.float64(2006.0)

Average age

In [14]:
np.floor(df['average_age'].mean())

np.float64(32.0)

Average employees

In [15]:
np.floor(df['employees'].mean())

np.float64(2463.0)

Gender bias:

In [17]:
tmp = {"men": 0, "women": 0}
total = 0

for item in df['gender_breakdown']:
    if item['men'] and item['women']:
        tmp['men'] += int(item['men'])
        tmp['women'] += int(item['women'])
        total += 1

print("Average male percentage:", tmp['men'] / total)
print("Average female percentage:", tmp['women'] / total)

print("No data in", len(df) - total, "cases")

Average male percentage: 58.06054279749478
Average female percentage: 41.93945720250522
No data in 196 cases


Sectors

In [19]:
tmp = defaultdict(int)
for s in df['sector']:
    for it in s:
        tmp[it] += 1

print(pd.DataFrame(sorted(tmp.items(), key=lambda item: item[1], reverse=True)[:10], columns=["sector", "count"]))

                    sector  count
0                     SaaS    198
1           Cloud Services    198
2                 Software    160
3                  Digital     98
4  Artificial Intelligence     96
5         Machine Learning     96
6              Mobile Apps     73
7                       IT     68
8                  FinTech     62
9                InsurTech     62


Language

In [20]:
langs = defaultdict(int)

for lang in df['Language']:
    if lang:
        langs[lang] += 1
print(*langs.items())

('en', 618) ('fr', 33) ('cs', 3) (nan, 19) ('vi', 1) ('de', 1)


Remove most nan columns

In [21]:
df_small = df[df.columns[~(df.isna().sum().values > 669)]]

In [22]:
df_small.head()

Unnamed: 0,name,sector,website,year_of_founding,employees,gender_breakdown,average_age,social_links,text_blocks,id,Language
0,Basikon,"[FinTech, InsurTech, Finance, Software]",http://www.basikon.com,2019.0,36.0,"{'women': None, 'men': None}",,{'linkedin': 'https://www.linkedin.com/company...,"{'Presentation': 'Founded in 2019, Basikon's m...",38,en
1,Kameleoon,"[Big Data, SaaS, Cloud Services]",http://www.kameleoon.com/fr,2012.0,135.0,"{'women': '50', 'men': '50'}",29.0,{'facebook': 'https://www.facebook.com/kameleo...,{'Presentation': 'Kameleoon empowers brands to...,583,en
2,APIDAY,"[Artificial Intelligence, Machine Learning, Sa...",https://apiday.com,2021.0,39.0,"{'women': '49', 'men': '51'}",30.0,{'linkedin': 'https://www.linkedin.com/company...,{'Presentation': 'Apiday is a next-generation ...,76,en
3,Ever Dye,"[Fashion, SocialTech, GreenTech, Specialised E...",https://everdye.fr,2021.0,16.0,"{'women': '55', 'men': '45'}",32.0,{'linkedin': 'https://www.linkedin.com/company...,{'Presentation': 'Ever Dye is on a mission to ...,150,en
4,Vulog,"[Collaborative Economy, Mobility, SaaS, Cloud ...",https://www.vulog.com,2006.0,100.0,"{'women': None, 'men': None}",35.0,{'instagram': 'https://instagram.com/vulog_tec...,{'Presentation': 'Vulog is the world's leading...,838,en


In [23]:
df_small.isna().sum()

name                 0
sector               0
website              6
year_of_founding    12
employees           14
gender_breakdown     0
average_age         70
social_links         0
text_blocks          0
id                   0
Language            19
dtype: int64