In [13]:
import pandas as pd
import json
from llmsinferer import LlmsInferer

# Load data

In [14]:
# Load the pre-processed training CVs: one row per CV with its `id` and a
# JSON-encoded `analysis` string (parsed later by `process_cvs_df`).
cvs_train = pd.read_parquet("data/cvs_train_processed.parquet")
cvs_train

Unnamed: 0,id,analysis
0,CVANON0002,"{""professionality"": 4, ""visual"": 3, ""grammar"":..."
1,CVANON0247,"{""professionality"": 7, ""visual"": 6, ""grammar"":..."
2,CVANON0105,"{""professionality"": 7, ""visual"": 5, ""grammar"":..."
3,CVANON0009,"{""professionality"": 7, ""visual"": 6, ""grammar"":..."
4,CVANON0033,"{\n ""professionality"": 7,\n ""visual"": 6,\n ..."
...,...,...
322,CVANON0206,"{""professionality"": 7, ""visual"": 6, ""grammar"":..."
323,CVANON0185,"{""professionality"": 7, ""visual"": 6, ""grammar"":..."
324,CVANON0119,"{""professionality"": 6, ""visual"": 5, ""grammar"":..."
325,CVANON0059,"{""professionality"": 7, ""visual"": 6, ""grammar"":..."


In [16]:
# Load the pre-processed CVs to predict on; same `id` / JSON `analysis` layout as the training frame.
# NOTE(review): some ids shown below (e.g. CVANON0002, CVANON0009) also appear in the
# training frame with different analyses — confirm this overlap is intended.
cvs_predict = pd.read_parquet("data/cvs_predict_processed.parquet")
cvs_predict

Unnamed: 0,id,analysis
0,CVANON0002,"{""professionality"": 3, ""visual"": 4, ""grammar"":..."
1,CVANON0009,"{""professionality"": 3, ""visual"": 4, ""grammar"":..."
2,CVANON0033,"{""professionality"": 6, ""visual"": 5, ""grammar"":..."
3,CVANON0010,"{""professionality"": 6, ""visual"": 5, ""grammar"":..."
4,CVANON0015,"{\n ""professionality"": 3,\n ""visual"": 4,\n ..."
...,...,...
76,CVANON0018,"{\n ""professionality"": 8,\n ""visual"": 6,\n ..."
77,CVANON0019,"{""professionality"": 6, ""visual"": 5, ""grammar"":..."
78,CVANON0070,"{\n ""professionality"": 6,\n ""visual"": 5,\n ..."
79,CVANON0017,"{""professionality"": 7, ""visual"": 5, ""grammar"":..."


In [17]:
# Load the target variable: keep only the CV id and the points earned in the
# first 9 months, under short snake_case column names.
scores = (
    pd.read_excel("data/scores.xlsx", sheet_name="Data")
    .rename(columns={"ID": "id", "Total Points (up to) first 9 months": "points_9m"})
    .loc[:, ["id", "points_9m"]]
)
scores

Unnamed: 0,id,points_9m
0,CVANON0001,
1,CVANON0002,3617.79800
2,CVANON0003,
3,CVANON0004,749.26500
4,CVANON0005,1373.16000
...,...,...
304,CVANON0333,1876.70008
305,CVANON0334,1092.65000
306,CVANON0335,1784.50000
307,CVANON0336,


# Define functions

In [18]:
def process_cvs_df(df: pd.DataFrame, mapping: dict, formulas: dict) -> pd.DataFrame:
	"""Turn JSON-encoded CV analyses into one numeric feature row per CV.

	The input frame is not modified, so the function can be called repeatedly
	on the same frame.

	Args:
		df: Frame with an "id" column and an "analysis" column holding JSON
			strings. Each parsed analysis must contain the keys "education",
			"experience" and "languages" (lists of dicts); every other
			top-level key becomes a column and must be convertible to float.
		mapping: Nested lookup tables, accessed as
			mapping["education"]["level"], mapping["education"]["related_multiplier"],
			mapping["experience"]["responsibility_level"],
			mapping["experience"]["related_multiplier"] and
			mapping["languages"]["level"]. Values missing from a table score 0.
		formulas: One expression string per field ("education", "experience",
			"languages"), evaluated with DataFrame.eval against the per-item
			score columns built by the matching scoring helper.

	Returns:
		A new frame with the same index as `df`, the "analysis" column removed,
		the three list fields replaced by their summed item scores, and every
		column except "id" cast to float.
	"""

	def column_or_missing(table: pd.DataFrame, name: str) -> pd.Series:
		"""Return table[name], or an all-NaN column when no item carried that key."""
		if name in table.columns:
			return table[name]
		return pd.Series(float("nan"), index=table.index, dtype="object")

	def mapped_scores(table: pd.DataFrame, name: str, lookup: dict) -> pd.Series:
		"""Look up every value of column `name` in `lookup`; unknown or missing values score 0."""
		return column_or_missing(table, name).apply(lambda value: lookup.get(value, 0))

	def exploded_table(series: pd.Series) -> pd.DataFrame:
		"""Flatten a column of lists of dicts into one row per list item.

		Rows keep the index label of the CV they came from, so scores can be
		summed back per CV with groupby(level=0). An empty list yields a single
		all-NaN row, which ends up scoring 0.
		"""
		exploded = series.explode()
		table = pd.json_normalize(exploded.tolist())
		table.index = exploded.index
		return table

	def score_education(exploded_table: pd.DataFrame, mapping: dict, formula: str) -> pd.Series:
		"""Score each education item and sum the scores per CV."""
		exploded_table["level_score"] = mapped_scores(exploded_table, "level", mapping["education"]["level"])
		exploded_table["related_score"] = mapped_scores(exploded_table, "related", mapping["education"]["related_multiplier"])
		exploded_table["score"] = exploded_table.eval(formula)
		return exploded_table.groupby(level=0)["score"].sum()

	def score_experience(exploded_table: pd.DataFrame, mapping: dict, formula: str) -> pd.Series:
		"""Score each experience item and sum the scores per CV."""
		exploded_table["responsibility_level_score"] = mapped_scores(
			exploded_table, "responsibility_level", mapping["experience"]["responsibility_level"]
		)
		# Non-numeric or missing durations count as 0; the score is duration / 12.
		duration = pd.to_numeric(column_or_missing(exploded_table, "duration"), errors="coerce").fillna(0)
		exploded_table["duration"] = duration
		exploded_table["duration_score"] = duration / 12
		exploded_table["related_score"] = mapped_scores(exploded_table, "related", mapping["experience"]["related_multiplier"])
		exploded_table["score"] = exploded_table.eval(formula)
		return exploded_table.groupby(level=0)["score"].sum()

	def score_languages(exploded_table: pd.DataFrame, mapping: dict, formula: str) -> pd.Series:
		"""Score each language item and sum the scores per CV."""
		exploded_table["languages_score"] = mapped_scores(exploded_table, "level", mapping["languages"]["level"])
		exploded_table["score"] = exploded_table.eval(formula)
		return exploded_table.groupby(level=0)["score"].sum()

	score_functions = {
		"education": score_education,
		"experience": score_experience,
		"languages": score_languages,
	}

	# Parse the "analysis" JSON strings into a separate Series so the caller's
	# frame is left untouched.
	parsed_analysis = df["analysis"].apply(json.loads)

	# Expand the top-level keys of each analysis into columns. The result is
	# re-labelled with df's index so the column-wise concat below aligns row by
	# row even when df does not carry a default RangeIndex.
	extracted_columns = pd.json_normalize(parsed_analysis.tolist())
	extracted_columns.index = df.index

	# Join the extracted columns with the remaining original columns.
	result = pd.concat([df.drop(columns=["analysis"]), extracted_columns], axis=1)

	# Replace each list-valued field by its per-CV score.
	for field, scorer in score_functions.items():
		result[field] = scorer(exploded_table(result[field]), mapping, formulas[field])

	# Convert every feature column to float to unify the data types.
	feature_columns = result.columns.drop("id")
	return result.astype({column: "float" for column in feature_columns})

# Score maps and formulas

In [19]:
# Lookup tables used by `process_cvs_df` to turn categorical CV attributes into numbers.
# The scoring helpers read these with `.get(value, 0)`, so any value that is not
# listed here (including a missing one) scores 0.
mapping = {
	# Education: score per degree level, multiplied by the "related" factor.
	"education": {
		"level": {
			"Primary": 1,
			"Secondary": 2,
			"Preparatory": 3,
			"Vocational": 3,
			"Diploma": 3,
			"Associate": 4,
			"Bachelor": 5,
			"Postgraduate Diploma": 6,
			"Master": 7,
			"Doctorate": 8,
			"Non-regulated Course": 1,
			"Other": 0
		},
		# Keys are Python booleans; any other "related" value gets multiplier 0.
		"related_multiplier": {
			True: 2,
			False: 1,
		},
	},
	# Experience: score per responsibility level, combined with duration and the "related" factor.
	"experience": {
		"responsibility_level": {
			"Entry": 1,
			"Intermediate": 2,
			"Senior": 3,
			"Manager": 4,
			"Director": 5,
			"Executive": 6,
			"Other": 0,
		},
		# Keys are Python booleans; any other "related" value gets multiplier 0.
		"related_multiplier": {
			True: 2,
			False: 1,
		},
	},
	# Languages: score per proficiency level.
	"languages": {
		"level": {
			"Basic": 1,
			"Intermediate": 2,
			"Advanced": 3,
			"Fluent": 4,
			"Native": 5,
			"Other": 0,
		}
	}
}

In [20]:
# Per-field scoring expressions. `process_cvs_df` evaluates each string with
# `DataFrame.eval` on the per-item table of that field, so the names used here must
# match the `*_score` columns built by the corresponding scoring helper.
formulas = {
    "education": "level_score * related_score",
    "experience": "responsibility_level_score * duration_score * related_score",
    "languages": "languages_score",
}

# Processing

In [21]:
# Parse and score the training CVs.
# NOTE(review): this rebinds `cvs_train` to the processed frame, which no longer has an
# "analysis" column, so re-running this cell without reloading the data raises a KeyError.
cvs_train = process_cvs_df(df=cvs_train, mapping=mapping, formulas=formulas)
cvs_train

Unnamed: 0,id,professionality,visual,grammar,ai_likelihood,education,experience,languages
0,CVANON0002,4.0,3.0,5.0,3.0,4.0,9.833333,8.0
1,CVANON0247,7.0,6.0,8.0,4.0,12.0,3.083333,10.0
2,CVANON0105,7.0,5.0,4.0,5.0,23.0,87.666667,2.0
3,CVANON0009,7.0,6.0,6.0,5.0,15.0,4.166667,11.0
4,CVANON0033,7.0,6.0,8.0,4.0,9.0,5.166667,5.0
...,...,...,...,...,...,...,...,...
322,CVANON0206,7.0,6.0,8.0,3.0,13.0,25.583333,17.0
323,CVANON0185,7.0,6.0,6.0,5.0,10.0,11.666667,3.0
324,CVANON0119,6.0,5.0,8.0,4.0,22.0,74.500000,16.0
325,CVANON0059,7.0,6.0,8.0,5.0,20.0,11.000000,11.0


In [22]:
# Attach the 9-month points to each scored CV; the inner join keeps only ids present in both frames.
# NOTE(review): `scores` contains ids whose `points_9m` is NaN — a CV matched to such an id
# would keep a NaN target here; confirm this is handled before modelling.
df_train = cvs_train.merge(scores, on="id", how="inner")
df_train

Unnamed: 0,id,professionality,visual,grammar,ai_likelihood,education,experience,languages,points_9m
0,CVANON0002,4.0,3.0,5.0,3.0,4.0,9.833333,8.0,3617.798
1,CVANON0247,7.0,6.0,8.0,4.0,12.0,3.083333,10.0,1271.650
2,CVANON0009,7.0,6.0,6.0,5.0,15.0,4.166667,11.0,2569.874
3,CVANON0238,7.0,6.0,5.0,4.0,24.0,10.500000,0.0,685.200
4,CVANON0269,7.0,6.0,8.0,3.0,12.0,8.000000,13.0,652.800
...,...,...,...,...,...,...,...,...,...
284,CVANON0318,5.0,4.0,3.0,6.0,6.0,13.916667,8.0,164.000
285,CVANON0210,8.0,7.0,6.0,5.0,7.0,284.000000,18.0,2292.700
286,CVANON0206,7.0,6.0,8.0,3.0,13.0,25.583333,17.0,82.500
287,CVANON0185,7.0,6.0,6.0,5.0,10.0,11.666667,3.0,130.500


In [23]:
# Score the CVs to predict on with the same mapping and formulas used for the training set.
df_predict = process_cvs_df(df=cvs_predict, mapping=mapping, formulas=formulas)
df_predict

Unnamed: 0,id,professionality,visual,grammar,ai_likelihood,education,experience,languages
0,CVANON0002,3.0,4.0,5.0,1.0,5.0,0.000000,10.0
1,CVANON0009,3.0,4.0,6.0,2.0,10.0,0.000000,10.0
2,CVANON0033,6.0,5.0,8.0,3.0,8.0,5.000000,10.0
3,CVANON0010,6.0,5.0,6.0,5.0,10.0,0.333333,6.0
4,CVANON0015,3.0,4.0,3.0,6.0,3.0,0.000000,8.0
...,...,...,...,...,...,...,...,...
76,CVANON0018,8.0,6.0,7.0,4.0,25.0,31.416667,13.0
77,CVANON0019,6.0,5.0,4.0,3.0,13.0,22.083333,9.0
78,CVANON0070,6.0,5.0,5.0,4.0,5.0,5.333333,8.0
79,CVANON0017,7.0,5.0,4.0,3.0,11.0,23.583333,12.0


# Export results

In [24]:
# Persist the processed training frame (features + `points_9m`) and the processed
# prediction frame (features only) as parquet files.
df_train.to_parquet("data/df_train.parquet")
df_predict.to_parquet("data/df_predict.parquet")