In [74]:
import os
import pandas as pd
import numpy as np

In [None]:
from sklearn import preprocessing
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.preprocessing import LabelEncoder

## Load data

In [135]:
# Read every CSV found in the "files" directory and bind it to a module-level
# variable named after the file (extension stripped).
for file_name in os.listdir("files"):
    if not file_name.endswith('.csv'):
        continue  # ignore anything that is not a CSV

    df_name = os.path.splitext(file_name)[0]
    file_path = os.path.join("files", file_name)
    globals()[df_name] = pd.read_csv(file_path)


In [136]:
# DataFrames with missing data: learn_dataset_job, learn_dataset_retired_jobs, test_dataset_job, test_dataset_retired_jobs

## Simplification of categories

In [137]:
# sports: join the sport code table and expose its "Categorie" column
# under the name "Sports_Category"
learn_dataset_sport = learn_dataset_sport.merge(
    code_Sports, left_on="Sports", right_on="Code"
).assign(Sports_Category=lambda merged: merged["Categorie"])

# keep only the key and the category for the later merge
learn_sports = learn_dataset_sport.loc[:, ["PRIMARY_KEY", "Sports_Category"]]

# departments into regions
def merge_and_extract_region(df, merge_column, region_column_name, departments_df=None):
    """Replace a department column of ``df`` with the matching region column.

    ``df[merge_column]`` is matched against the ``DEP`` column of the
    departments lookup table; the lookup's ``REG`` value is stored under
    ``region_column_name`` and the department column plus the lookup's own
    columns are removed.

    The join is a *left* join: rows of ``df`` whose department is missing or
    not present in the lookup are kept (with a missing region) instead of
    being silently dropped together with all their other columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing ``merge_column``. It is not modified.
    merge_column : str
        Name of the department column in ``df``.
    region_column_name : str
        Name of the region column to create.
    departments_df : pandas.DataFrame, optional
        Lookup table with columns ``DEP``, ``REG`` and ``Nom du département``.
        Defaults to the module-level ``departments`` frame.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``region_column_name`` added and ``merge_column``,
        ``DEP``, ``REG`` and ``Nom du département`` removed.
    """
    if departments_df is None:
        departments_df = departments  # module-level lookup loaded from the CSV files

    merged = pd.merge(df, departments_df, left_on=merge_column, right_on="DEP", how="left")
    merged[region_column_name] = merged["REG"]
    return merged.drop(columns=["Nom du département", "REG", "DEP", merge_column])


# Job table: JOB_DEP -> REG_JOB.
learn_dataset_job = merge_and_extract_region(learn_dataset_job, "JOB_DEP", "REG_JOB")

# Retired-jobs table: two department columns, converted one after the other.
for dep_column, region_column in (("JOB_DEP", "REG_JOB"), ("FORMER_DEP", "REG_FORMER")):
    learn_dataset_retired_jobs = merge_and_extract_region(
        learn_dataset_retired_jobs,
        merge_column=dep_column,
        region_column_name=region_column,
    )

In [138]:
#Economic sector into fewer categories (and numeric instead of object/string)
def sector_mapping(nace_code):
    """Map a two-character economic-sector code to a broad sector label.

    Ranges are checked with lexicographic string comparison, so e.g. every
    code from "BZ" up to and including "EZ" falls into the manufacturing
    group.

    Parameters
    ----------
    nace_code : str
        Sector code such as "AZ", "CA" or "QB".

    Returns
    -------
    str
        The label of the aggregated sector, or "Unknown Sector" when the code
        is outside every range or is not a string (e.g. NaN / None).
    """
    # Missing values arrive as float NaN or None; comparing them with strings
    # would raise TypeError, so route them to the fallback label instead.
    if not isinstance(nace_code, str):
        return "Unknown Sector"

    if nace_code == "AZ":
        return "Agriculture, forestry and fishing"
    elif "BZ" <= nace_code <= "EZ":
        return "Manufacturing, mining and quarrying and other industrial activities"
    elif nace_code == "FZ":
        return "Construction"
    elif "GZ" <= nace_code <= "IZ":
        return "Wholesale and retail trade, transportation and storage, accommodation and food service activities"
    elif "JA" <= nace_code <= "JC":
        return "Information and communication"
    elif nace_code == "KZ":
        return "Financial and insurance activities"
    elif nace_code == "LZ":
        return "Real estate activities"
    elif "MA" <= nace_code <= "NZ":
        return "Professional, scientific, technical, administrative and support service activities"
    elif "OZ" <= nace_code <= "QB":
        return "Public administration and defence, education, human health and social work activities"
    elif "RZ" <= nace_code <= "UZ":
        return "Other services activities"
    else:
        return "Unknown Sector"

# Label every sector code with its aggregated sector name ...
code_Economic_sector["Nomenclature"] = code_Economic_sector["Code"].apply(sector_mapping)
# ... and number those names 1, 2, 3, ... in order of first appearance.
code_Economic_sector["Economic_sector_num"] = code_Economic_sector["Nomenclature"].factorize()[0] + 1

## Merging

In [139]:
# Tables joined onto the base learn_dataset, in this order.
learn_dfs = [
    learn_dataset_emp_contract,
    learn_dataset_job,
    learn_dataset_retired_former,
    learn_dataset_retired_jobs,
    learn_dataset_retired_pension,
    learn_sports,
]

# Outer joins on PRIMARY_KEY keep every key present on either side of each merge.
learn_data = learn_dataset
for df in learn_dfs:
    learn_data = learn_data.merge(df, how="outer", on="PRIMARY_KEY")

In [140]:
def combine_columns(col_x, col_y):
    """Return ``col_x`` with its missing entries filled from ``col_y``.

    When ``col_y`` is None there is nothing to fill from, so ``col_x`` is
    returned unchanged.
    """
    if col_y is None:
        return col_x
    return col_x.fillna(col_y)

# Merging tables that share column names leaves `<name>_x` / `<name>_y` pairs.
# Collapse each pair back into `<name>`, taking the `_x` value first and
# falling back to `_y` where `_x` is missing.
for column in [name for name in learn_data.columns if name.endswith('_x')]:
    base_column = column[:-2]  # strip the `_x` suffix
    y_column = base_column + '_y'
    if y_column not in learn_data.columns:
        continue  # no partner column, leave this one alone
    learn_data[base_column] = combine_columns(learn_data[column], learn_data[y_column])
    learn_data = learn_data.drop(columns=[column, y_column])


## Formatting

In [141]:
# Attach the sector lookup columns (left join on Economic_sector == Code).
learn_data = pd.merge(learn_data, code_Economic_sector, left_on="Economic_sector", right_on="Code", how="left")

# Replace the detailed work_description with the single digit that follows "csp_"
# in the N1 column of the work-description map (stored as a nullable integer).
learn_data = pd.merge(learn_data, code_work_description_map, left_on="work_description", right_on="N3", how="left")
learn_data.drop(["work_description", "N3", "N2"], axis=1, inplace=True)
learn_data["N1"] = learn_data["N1"].str.extract(r'csp_(\d)')[0].astype("Int64")
learn_data.rename(columns={"N1": "work_description"}, inplace=True)

# Fill missing emp_contract / Pay values from the former_emp_contract / RETIREMENT_PAY columns.
learn_data["emp_contract"] = combine_columns(learn_data["emp_contract"], learn_data["former_emp_contract"])
learn_data["Pay"] = combine_columns(learn_data["Pay"], learn_data["RETIREMENT_PAY"])
# Flag rows whose JOB_42 starts with 'csp_7' (presumably the retiree category — confirm against the code table).
learn_data['is_retired'] = learn_data['JOB_42'].str.startswith('csp_7').astype(int)
# Flag rows whose act code starts with 'TACT2_', except 'TACT2_1'.
learn_data['is_unemployed'] = (learn_data['act'].str.startswith('TACT2_') & (learn_data['act'] != 'TACT2_1')).astype(int)
# Alternative definition kept for reference:
#learn_data['is_unemployed'] = (learn_data['act'] == 'TACT1_2').astype(int)

# For the 'csp_7' rows, overwrite JOB_42 with FORMER_JOB_42 (must run after is_retired is computed).
learn_data.loc[learn_data['JOB_42'].str.startswith('csp_7', na=False), 'JOB_42'] = learn_data['FORMER_JOB_42']

# Drop the helper/source columns that have been folded into other columns above.
learn_data = learn_data.drop(columns=["act", "former_emp_contract", "RETIREMENT_PAY", "FORMER_JOB_42", "Economic_sector", "Code", "Libellé", "Nomenclature"])
# Alternative: keep "Nomenclature" and drop "Economic_sector_num" instead.

# Reduce the remaining coded string columns to the number embedded in each code.
# JOB_42 uses plain int, so it must contain no missing values at this point.
learn_data["JOB_42"] = learn_data["JOB_42"].str.extract(r'csp_(\d+)_')[0].astype(int)
learn_data["Employer_category"] = learn_data["Employer_category"].str.extract(r'ct_(\d)')[0].astype("Int64")
learn_data["employee_count"] = learn_data["employee_count"].str.extract(r'tr_(\d)')[0].astype("Int64")

In [142]:
# types
learn_data["sex"] = pd.factorize(learn_data["sex"])[0]
learn_data["studying"] = learn_data["studying"].astype("int64")

# Columns parsed as numbers (unparseable entries become missing) and stored
# as nullable integers.
# Alternative for Sports_Category:
#   learn_data["Sports_Category"] = learn_data["Sports_Category"].fillna(0).astype("int64")
for nullable_int_column in (
    "Sports_Category",
    "REG_JOB",
    "REG_FORMER",
    "retirement_age",
    "WORKING_HOURS",
    "Economic_sector_num",
):
    learn_data[nullable_int_column] = pd.to_numeric(
        learn_data[nullable_int_column], errors='coerce'
    ).astype("Int64")

In [143]:
def household_num(value):
    """Convert a household-type code such as 'M|2|--' or 'M|4|3' to an integer.

    The code is split on '|'. When the second field is '1', '2' or '3' that
    number is returned. When it is '4', the third field selects the result:
    'M|4|1' .. 'M|4|4' map to 4 .. 7.

    Parameters
    ----------
    value : str
        Household-type code.

    Returns
    -------
    int or None
        The numeric household type, or None for anything that cannot be
        interpreted (missing value, non-string, too few fields, unknown
        second field, non-numeric third field).
    """
    # Missing values (None / float NaN) and other non-strings have no .split().
    if not isinstance(value, str):
        return None

    parts = value.split('|')
    if len(parts) < 2:
        return None

    if parts[1] in {'1', '2', '3'}:  # M|1|-- to M|3|--
        return int(parts[1])

    if parts[1] == '4' and len(parts) > 2:  # M|4|1 to M|4|4
        try:
            return 4 + (int(parts[2]) - 1)
        except ValueError:
            return None  # third field is not a number, e.g. 'M|4|--'

    return None

# Numeric household type in the lookup table (new column) ...
code_HOUSEHOLD_TYPE['HOUSEHOLD_TYPE_num'] = code_HOUSEHOLD_TYPE['Code'].map(household_num)
# ... and in the learning data (column replaced by its numeric form).
learn_data['HOUSEHOLD_TYPE'] = learn_data['HOUSEHOLD_TYPE'].map(household_num)


TODO: one-hot encode WORK_CONDITION, TYPE_OF_CONTRACT and labor_force_status.

TODO: work out how to simplify HIGHEST_CREDENTIAL into fewer categories.

Consider making Pay categorical as well, e.g. with bins at tax-bracket boundaries.

TODO: derive the department from INSEE_CODE for rows where the department is missing.

For every categorical column that is now numeric but still has missing values, we can shift the codes by +1 and fill the missing entries with 0.

Note: TACT2_3 does not appear in the dataset, i.e. there are no under-14-year-olds.

In [144]:
# TODO: WORKING_HOURS (maybe), retirement_age and (retirement) Pay should arguably be int64.
learn_data.dtypes

PRIMARY_KEY              int64
sex                      int64
JOB_42                   int64
HIGHEST_CREDENTIAL      object
studying                 int64
INSEE_CODE              object
age_2020                 int64
HOUSEHOLD_TYPE           int64
target                 float64
emp_contract            object
Pay                    float64
retirement_age           Int64
REG_FORMER               Int64
Sports_Category          Int64
Employer_category        Int64
employee_count           Int64
TYPE_OF_CONTRACT        object
WORK_CONDITION          object
labor_force_status      object
WORKING_HOURS            Int64
REG_JOB                  Int64
Economic_sector_num      Int64
work_description         Int64
is_retired               int64
is_unemployed            int64
dtype: object

# Handling Missing Data

In [None]:
def missing_values_table(df):
    """Summarise the missing values of ``df`` per column.

    Prints how many columns contain at least one missing value and returns
    a table restricted to those columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with the columns 'Missing Values' (count)
        and '% of Total Values' (share of rows, rounded to one decimal),
        sorted by percentage in descending order. Empty when nothing is
        missing, including for a frame with zero rows.
    """
    missing_counts = df.isnull().sum()
    n_rows = len(df)

    # A zero-row frame has no missing values; avoid 0/0 -> NaN percentages,
    # which would otherwise make every column look incomplete.
    if n_rows:
        missing_percent = 100 * missing_counts / n_rows
    else:
        missing_percent = missing_counts.astype(float)

    table = pd.DataFrame({
        'Missing Values': missing_counts,
        '% of Total Values': missing_percent,
    })

    # Keep only columns that really have gaps, worst first.
    table = (
        table[table['Missing Values'] > 0]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    print("There are " + str(table.shape[0]) + " columns that have missing values.")
    return table

# Plain-text report of the columns of learn_data that contain missing values.
print(missing_values_table(learn_data)) 

In [None]:
# For JOB_42 categories csp_1 and csp_2, emp_contract is present but all other job-related fields are missing;
# for csp_8, all job-related fields are missing.


### FOR employee_count

In [None]:
# NOTE: possibly no longer necessary — employee_count is already numeric at this point.
# Fit a label encoder on employee_count and display its class -> code mapping.
le = LabelEncoder()
le.fit(learn_data["employee_count"])
dict(zip(le.classes_, le.transform(le.classes_)))

In [None]:
# Encode employee_count with the fitted label encoder.
learn_data['employee_count_encoded'] = le.transform(learn_data['employee_count'])
# Code 7 is treated as "missing" and turned back into NaN so the imputer fills it.
# NOTE(review): the value 7 presumably dates from when employee_count was a string
# column whose last class was NaN — confirm it still matches le.classes_.
learn_data['employee_count_encoded'] = learn_data['employee_count_encoded'].map(lambda x: np.nan if x==7 else x)

# Feature matrix for the imputer. employee_count_encoded must stay at position 1,
# because the imputed values are read back by position.
# RETIREMENT_PAY was merged into Pay and dropped during formatting, and the former
# region column is named REG_FORMER, so only columns that still exist are selected.
df_train = learn_data.loc[:, [
    "PRIMARY_KEY", "employee_count_encoded", "studying", "WORKING_HOURS", "age_2020",
    "Pay", "retirement_age", "Sports_Category", "REG_JOB", "REG_FORMER",
]]
df_train.head()

In [None]:
# KNN imputation of df_train with 5 uniformly weighted neighbours.
# Alternative kept for reference: IterativeImputer(random_state=100), fitted and
# applied to df_train in the same way.
knn_imputer = KNNImputer(n_neighbors=5, weights="uniform")
df_imputed = knn_imputer.fit_transform(df_train)

In [None]:
# Column 1 of the imputed matrix is employee_count_encoded (second column of df_train);
# round it to the nearest whole code and write it back into learn_data.
learn_data.loc[:,"employee_count_encoded"] = df_imputed[:,1].round()
# Translate the integer codes back into the encoder's original classes.
count_imputed = list(le.inverse_transform(learn_data['employee_count_encoded'].round().astype('int')))
# NOTE(review): despite its name, the column now holds the decoded class values, not the codes.
learn_data["employee_count_encoded"] = count_imputed

In [None]:
# Display the resulting frame.
learn_data