In [2]:
import numpy as np
import pandas as pd
import json
from collections import defaultdict

### Load jobs data

In [3]:
input_file = "job_data.json"

df = pd.read_json(input_file)
df.head()

Unnamed: 0,company_name,job_title,location,posted_date,contract_type,remote_status,job_link
0,Bump,Customer Accountant / Comptable Clients,Paris,2024-11-22T17:19:14Z,Permanent contract,A few days at home,/en/companies/bump-charge/jobs/customer-accoun...
1,Bump,Senior Legal Counsel,Paris,2024-11-22T14:48:44Z,Permanent contract,A few days at home,/en/companies/bump-charge/jobs/senior-legal-co...
2,Bump,Operation Manager,Paris,2024-11-19T14:52:17Z,Internship,,/en/companies/bump-charge/jobs/operation-manag...
3,Bump,Responsable d'Affaires eMobility,Lyon,2024-11-18T09:22:23Z,Permanent contract,A few days at home,/en/companies/bump-charge/jobs/responsable-d-a...
4,Bump,Technical Support Engineer,Paris,2024-11-07T15:04:22Z,Permanent contract,A few days at home,/en/companies/bump-charge/jobs/ingenieur-techn...


### Count some NaNs

In [4]:
print(df.isna().sum())

company_name     5
job_title        0
location         0
posted_date      0
contract_type    0
remote_status    0
job_link         0
dtype: int64


we have only 5 NaNs in company_name column in our dataset

In [11]:
df[df['company_name'].isna()]

Unnamed: 0,company_name,job_title,location,posted_date,contract_type,remote_status,job_link
1436,,Suivi de maintenance et performance en Géothermie,Clamart,2024-11-25T16:38:42Z,Internship,,/en/companies/celsius-energy/jobs/suivi-de-mai...
1437,,Stage- Office manager,Clamart,2024-11-12T17:04:59Z,Internship,,/en/companies/celsius-energy/jobs/stage-office...
1438,,Stage- Développement d'outils numériques pour ...,Clamart,2024-11-12T16:55:17Z,Internship,,/en/companies/celsius-energy/jobs/stage-develo...
1439,,Technico-Commercial Géoénergie,Clamart,2024-10-24T09:59:39Z,Permanent contract,Fully-remote,/en/companies/celsius-energy/jobs/technico-com...
1440,,Stage - Digital - Montpellier - Développement ...,Clamart,2024-10-24T09:52:43Z,Internship,,/en/companies/celsius-energy/jobs/stage-digita...


And basically, we can see from the link that the company name is Celsius-Energy actually

### Now, some stats

In [15]:
print("Total data rows: ", len(df))
print("\n")

print("Unique data in each column: ")
print(df.nunique())

Total data rows:  3893


Unique data in each column: 
company_name      420
job_title        3512
location          405
posted_date      3699
contract_type       8
remote_status       3
job_link         3893
dtype: int64


#### Let's compute remote options:

In [40]:
remote = defaultdict(int)
for s in df['remote_status']:
    remote[s] += 1
print(remote.items())

dict_items([('A few days at home', 1452), ('', 2194), ('Fully-remote', 247)])


#### And the contract type:

In [36]:
c_type = defaultdict(int)
for s in df['contract_type']:
    c_type[s] += 1

print("Sorted contract types:")
for k, v in sorted(c_type.items(), key=lambda item: -item[1]):
    print(k, v)

Sorted contract types:
Permanent contract 2746
Internship 792
Temporary 115
Other 86
Work study 80
Freelance 48
Part time 16
International Corporate Volunteer Program 10


### And the top-10 locations:

In [38]:
location = defaultdict(int)
for s in df['location']:
    location[s] += 1

print("Most popular locations:")
for k, v in sorted(location.items(), key=lambda item: -item[1])[:10]:
    print(k, v)

Most popular locations:
Paris 1649
Lyon 110
London 100
Puteaux 83
Toulouse 75
Boulogne-Billancourt 65
Barcelona 58
Bordeaux 50
Brussel 49
New York 49
