In [1]:
from datetime import timedelta

import numpy as np
import pandas as pd
import requests

from job_apps.api_requests import fetch_database_jsons
from job_apps.constants import API_HEADERS, URL_DATABASE
from job_apps.data_preprocessing import build_dataframe

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
search_response = requests.post(url=URL_DATABASE, headers=API_HEADERS)
search_response_dict = search_response.json()

In [4]:
results = fetch_database_jsons(url=URL_DATABASE, headers=API_HEADERS)

In [124]:
df_base = build_dataframe(results)
df_base['date_applied'] = pd.to_datetime(df_base['date_applied'])
df_base = df_base.sort_values(by='date_applied', ascending=True)
df_base.reset_index(inplace=True, drop='index')
df_base['month_applied'] = df_base['date_applied'].dt.to_period('M')
df_base['interview'] = ~df_base['stage'].isin(['Application Sent', 'Rejection'])
df_base['interview_with_cover'] = df_base['interview'] & df_base['cover_letter'].isin(['Letter', 'Note'])

In [125]:
df_grouped = df_base.groupby('month_applied').agg(
    n_apps=('job_title', 'count'),
    n_cover_letters=('cover_letter', lambda x: x.isin(['Letter', 'Note']).sum()),
    n_interviews=('interview', 'sum'),
    n_interviews_with_cover=('interview_with_cover', 'sum'),
)

In [126]:
df_grouped

Unnamed: 0_level_0,n_apps,n_cover_letters,n_interviews,n_interviews_with_cover
month_applied,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-01,85,2,1,0
2024-02,47,6,1,0
2024-03,2,2,0,0
2024-04,37,4,1,1
2024-05,33,10,1,1
2024-06,18,17,1,1
2024-07,63,54,2,2
2024-09,20,17,0,0


In [127]:
print(
    "Average number of applications with cover letter per interview:",
    df_grouped.loc[df_grouped['n_interviews_with_cover'] > 0, 'n_apps'].sum()
    / df_grouped['n_interviews_with_cover'].sum()
)
print(
    "Average number of applications per interview:",
    round(df_grouped['n_apps'].sum() / df_grouped['n_interviews'].sum(), 2)
)

Average number of applications with cover letter per interview: 30.2
Average number of applications per interview: 43.57


In [128]:
df_interview = df_base.loc[df_base['interview']].sort_values(by='date_applied', ascending=True)
application_dt = df_interview.loc[:, 'date_applied'].diff()
application_dt = application_dt.rename('dt_applied')
df_interview = pd.concat([application_dt, df_interview], axis=1)
df_interview['n_apps_before_previous_interview'] = df_interview.index.diff().fillna(df_interview.index[0]).astype(int).to_list()

In [129]:
df_interview

Unnamed: 0,dt_applied,job_title,company,date_applied,origin,stage,job_description,cover_letter,month_applied,interview,interview_with_cover,n_apps_before_previous_interview
15,NaT,Data science / Power system engineer internship,Rte International,2024-01-11,Indeed,First Interview,https://fr.indeed.com/jobs?q=data+scientist&l=...,,2024-01,True,False,15
116,27 days,Data Scientist,Aquila,2024-02-07,APEC,Second Interview,,,2024-02,True,False,101
152,82 days,Docteur Data Scientist,Bimbamjob,2024-04-29,Welcome To The Jungle,First Interview,https://www.welcometothejungle.com/en/companie...,Letter,2024-04,True,True,36
201,30 days,Data Scientist,Hello Watt,2024-05-29,Welcome To The Jungle,Final interview,https://www.welcometothejungle.com/en/companie...,Letter,2024-05,True,True,49
204,5 days,Data Scientist,Dataleon,2024-06-03,Welcome To The Jungle,First Interview,https://www.welcometothejungle.com/en/companie...,Letter,2024-06,True,True,3
261,44 days,Data scientist,Wecasa,2024-07-17,Welcome To The Jungle,First Interview,https://www.welcometothejungle.com/en/companie...,Letter,2024-07,True,True,57
270,5 days,Jeune Docteur - Data Scientist R&D,Yanport,2024-07-22,Welcome To The Jungle,Offer,https://www.welcometothejungle.com/en/companie...,Letter,2024-07,True,True,9


In [130]:
chomage_days = 174
start_date = pd.to_datetime("2025-06-01").date()

last_chomage_day = start_date + timedelta(chomage_days)
print("Chomage last day:", last_chomage_day)

end_date = last_chomage_day

days = pd.date_range(start=start_date, end=end_date)
weekdays = days[~days.weekday.isin([5, 6])]
print(f"Weekdays from {start_date} to {last_chomage_day}:", len(weekdays))
print("Number of apps during this period:", len(weekdays) * 5)

Chomage last day: 2025-11-22
Weekdays from 2025-06-01 to 2025-11-22: 125
Number of apps during this period: 625


In [167]:
df_base = df_base.drop(columns=['n_apps_before_previous_interview'], errors='ignore')
df_base = df_base.join(df_interview[['n_apps_before_previous_interview']])
df_base['n_apps_before_previous_interview'] = df_base['n_apps_before_previous_interview'].map(lambda x: int(x) if not pd.isna(x) else 0)

In [171]:
df_base[['job_title', 'company', 'n_apps_before_previous_interview']]

Unnamed: 0,job_title,company,n_apps_before_previous_interview
0,Data Scientist,Polyconseil,0
1,Ingénieur-e de recherche data science,Saint-Gobain,0
2,"Research Scientist Intern, AI Core Machine Lea...",Meta,0
3,Data Analyst,Societe Generale,0
4,Ingénieur Machine Learning - Imagerie Médicale,BlueDocker,0
...,...,...,...
300,Modeller – Climate,Iceberg Data Lab,0
301,Data Scientist - Timesseries Machine Learning ...,Schneider Electric,0
302,Data Scientist,Gameloft,0
303,Post docteur Datascience,Finovox,0
