In [None]:
import os
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, ShuffleSplit, KFold, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.pipeline import Pipeline

In [None]:
# Load every CSV found in "files/" into a module-level DataFrame named after
# the file stem (files/foo.csv -> global variable `foo`).
for file_name in os.listdir("files"):
    if not file_name.endswith('.csv'):
        continue
    globals()[os.path.splitext(file_name)[0]] = pd.read_csv(os.path.join("files", file_name))

# Sanity check

In [None]:
# Is the retiree classification consistent across the three retiree tables?
# Collect the distinct PRIMARY_KEY values of each table so they can be compared as sets.

ids_1 = set(learn_dataset_retired_pension["PRIMARY_KEY"])  # "Dataset 1": pension table
ids_2 = set(learn_dataset_retired_jobs["PRIMARY_KEY"])  # "Dataset 2": retired-jobs table
ids_3 = set(learn_dataset_retired_former["PRIMARY_KEY"])  # "Dataset 3": retired-former table

# Compare two ID collections and report whether they are equal
def check_sanity(ids_a, ids_b, dataset_a, dataset_b):
    """Print whether two ID collections are equal.

    Parameters
    ----------
    ids_a, ids_b : objects compared with ``==`` (sets of IDs at the call sites).
    dataset_a, dataset_b : labels interpolated into the printed message.

    Returns
    -------
    None. The verdict is only printed.
    """
    verdict = (
        f"Sanity check passed: {dataset_a} and {dataset_b} contain the same individuals."
        if ids_a == ids_b
        else f"Sanity check failed: There are differences in the individuals from {dataset_a} and {dataset_b}"
    )
    print(verdict)

# Pairwise equality report for the three ID sets.
for (left_ids, left_label), (right_ids, right_label) in (
    ((ids_1, "Dataset 1"), (ids_2, "Dataset 2")),
    ((ids_1, "Dataset 1"), (ids_3, "Dataset 3")),
    ((ids_2, "Dataset 2"), (ids_3, "Dataset 3")),
):
    check_sanity(left_ids, right_ids, left_label, right_label)

# Size of the symmetric differences against dataset 3.
not_common_ids_1 = ids_1 ^ ids_3
print(f"Number of people that are not common to dataset 1 and 3: {len(not_common_ids_1)}")

not_common_ids_2 = ids_2 ^ ids_3
print(f"Number of people that are not common to dataset 2 and 3: {len(not_common_ids_2)}")

# IDs present only in dataset 3.
print("These individuals are in dataset3 but not in dataset1:", ids_3.difference(ids_1))
print("These individuals are in dataset3 but not in dataset2:", ids_3.difference(ids_2))

# Author's conclusion: this is fine, dataset 3 simply contains more individuals.


In [None]:
# sports: look up each sport code in code_Sports and keep only the key and its category
learn_dataset_sport = learn_dataset_sport.merge(code_Sports, left_on="Sports", right_on="Code")
learn_dataset_sport["Sports_Category"] = learn_dataset_sport["Categorie"]
learn_sports = learn_dataset_sport.loc[:, ["PRIMARY_KEY", "Sports_Category"]]

# departments into regions
def merge_and_extract_region(df, merge_column, region_column_name):
    """Replace a department column with the region it belongs to.

    ``df`` is inner-joined with the module-level ``departments`` table
    (``df[merge_column]`` against ``departments["DEP"]``), so rows whose
    department has no match are dropped. The matched ``REG`` value is copied
    into ``region_column_name``; the department-table columns and
    ``merge_column`` itself are then removed.

    Returns a new DataFrame; the ``df`` passed in is not modified.
    """
    joined = df.merge(departments, left_on=merge_column, right_on="DEP")
    joined[region_column_name] = joined["REG"]
    return joined.drop(columns=["Nom du département", "REG", "DEP", merge_column])


# Current jobs: JOB_DEP -> REG_JOB
learn_dataset_job = merge_and_extract_region(learn_dataset_job, "JOB_DEP", "REG_JOB")

# Retirees' jobs: JOB_DEP -> REG_JOB, then FORMER_DEP -> REG_FORMER.
# NOTE(review): FORMER_DEP is looked up on learn_dataset_retired_jobs (not
# learn_dataset_retired_former) — confirm that is the table holding this column.
for dep_column, reg_column in (("JOB_DEP", "REG_JOB"), ("FORMER_DEP", "REG_FORMER")):
    learn_dataset_retired_jobs = merge_and_extract_region(
        learn_dataset_retired_jobs,
        merge_column=dep_column,
        region_column_name=reg_column,
    )

In [None]:
#Economic sector into fewer categories (and numeric instead of object/string)
def sector_mapping(nace_code):
    """Map a two-letter sector code to a coarse sector label.

    Codes are compared as strings, so ranges such as ``"BZ" <= code <= "EZ"``
    are lexicographic.

    Parameters
    ----------
    nace_code : str
        Sector code such as ``"AZ"``, ``"CA"`` or ``"QB"``.

    Returns
    -------
    str
        The aggregated sector label, or ``"Unknown Sector"`` when the code
        matches no range or is not a string (e.g. NaN/None for a missing code).
    """
    # A missing code arrives as NaN/None; comparing it with a string below
    # would raise TypeError, so send it to the fallback label instead.
    if not isinstance(nace_code, str):
        return "Unknown Sector"

    if nace_code == "AZ":
        # Label previously ended with a stray ")".
        return "Agriculture, forestry and fishing"
    elif "BZ" <= nace_code <= "EZ":
        return "Manufacturing, mining and quarrying and other industrial activities"
    elif nace_code == "FZ":
        return "Construction"
    elif "GZ" <= nace_code <= "IZ":
        return "Wholesale and retail trade, transportation and storage, accommodation and food service activities"
    elif "JA" <= nace_code <= "JC":
        return "Information and communication"
    elif nace_code == "KZ":
        return "Financial and insurance activities"
    elif nace_code == "LZ":
        return "Real estate activities"
    elif "MA" <= nace_code <= "NZ":
        return "Professional, scientific, technical, administrative and support service activities"
    elif "OZ" <= nace_code <= "QB":
        return "Public administration and defence, education, human health and social work activities"
    elif "RZ" <= nace_code <= "UZ":
        return "Other services activities"
    else:
        return "Unknown Sector"

# Economic sector: coarse label first, then a 1-based integer id per distinct label
# (ids follow order of first appearance).
code_Economic_sector["Nomenclature"] = code_Economic_sector["Code"].map(sector_mapping)
code_Economic_sector["Economic_sector_num"] = (
    pd.factorize(code_Economic_sector["Nomenclature"])[0] + 1
)

# Same 1-based integer encoding for the credential and activity code tables.
for code_table, num_column in (
    (code_HIGHEST_CREDENTIAL, "HIGHEST_CREDENTIAL_num"),
    (code_act, "act_num"),
):
    code_table[num_column] = pd.factorize(code_table["Code"])[0] + 1

# emp_contract is deliberately not factorized here; the plan is to one-hot encode it.
# Previous approach, kept for reference:
#code_emp_contract["emp_contract_num"] = pd.factorize(code_emp_contract["Code"])[0] + 1
#learn_dataset_emp_contract = pd.merge(learn_dataset_emp_contract, code_emp_contract, left_on="emp_contract",  right_on="Code")
#learn_dataset_emp_contract.drop(["Code", "Libellé"], axis=1, inplace=True)

In [None]:
# Start from the main table with `target` moved to the first column, then
# left-join the code tables and the city-level tables.
target_first = ['target'] + [name for name in learn_dataset.columns if name != 'target']

learn_data = (
    learn_dataset[target_first]
    # activity code -> act_num (the raw `act` column is kept)
    .merge(code_act, left_on="act", right_on="Code", how="left")
    .drop(columns=["Code", "Libellé"])
    # credential code -> HIGHEST_CREDENTIAL_num (the raw column is dropped)
    .merge(code_HIGHEST_CREDENTIAL, left_on="HIGHEST_CREDENTIAL", right_on="Code", how="left")
    .drop(columns=["Code", "Libellé", "HIGHEST_CREDENTIAL"])
    # city-level tables keyed by INSEE_CODE
    .merge(city_pop, on="INSEE_CODE", how="left")
    .merge(city_loc, on="INSEE_CODE", how="left")
)

In [None]:
# Per-topic tables, outer-joined onto learn_data one after another on PRIMARY_KEY.
learn_dfs = [
    learn_dataset_emp_contract,
    learn_dataset_job,
    learn_dataset_retired_former,
    learn_dataset_retired_jobs,
    learn_dataset_retired_pension,
    learn_sports,
]

for df in learn_dfs:
    learn_data = learn_data.merge(df, on="PRIMARY_KEY", how="outer")

In [None]:
def combine_columns(col_x, col_y):
    """Return ``col_x`` with its missing entries filled from ``col_y``.

    When ``col_y`` is None, ``col_x`` is returned unchanged (same object).
    """
    if col_y is None:
        return col_x
    return col_x.fillna(col_y)

# The merges left duplicated columns as `<name>_x` / `<name>_y` pairs.
# Collapse each pair into a single `<name>` column (values from `_x`, gaps
# filled from `_y`) and drop the two suffixed columns.
x_suffixed = [name for name in learn_data.columns if name.endswith('_x')]

for x_column in x_suffixed:
    stem = x_column[:-2]  # strip the `_x` suffix
    y_column = stem + '_y'
    if y_column not in learn_data.columns:
        continue  # no `_y` partner: leave the column as it is
    learn_data[stem] = combine_columns(learn_data[x_column], learn_data[y_column])
    learn_data = learn_data.drop(columns=[x_column, y_column])

In [None]:
def household_num(value):
    """Convert a household-type code such as ``"M|4|2"`` into an integer 1..7.

    The code is split on ``'|'``:

    * second part ``'1'``, ``'2'`` or ``'3'`` -> that number;
    * second part ``'4'`` -> ``4 + (third part - 1)``, so ``M|4|1``..``M|4|4``
      map to 4..7.

    Parameters
    ----------
    value : str, or an already-encoded number, or a missing value.

    Returns
    -------
    int or None
        The numeric code. None for missing values (None/NaN), malformed
        strings, or anything else unexpected. A number that is already a whole
        value in 1..7 is returned as-is, so applying the function twice to the
        same column does not fail or wipe the data.
    """
    if not isinstance(value, str):
        # None, NaN and pd.NA cannot be converted by int() and fall through to None.
        try:
            encoded = int(value)
        except (TypeError, ValueError, OverflowError):
            return None
        return encoded if encoded == value and 1 <= encoded <= 7 else None

    parts = value.split('|')  # e.g. "M|4|2" -> ["M", "4", "2"]
    if len(parts) < 2:
        return None
    if parts[1] in {'1', '2', '3'}:  # For M|1|-- to M|3|--
        return int(parts[1])
    if parts[1] == '4' and len(parts) > 2:  # For M|4|1 to M|4|4
        try:
            return 4 + (int(parts[2]) - 1)  # 4 + (1-1), 4 + (2-1), etc.
        except ValueError:
            return None  # third part is not a number, e.g. "M|4|--"
    return None  # Handle unexpected cases gracefully

# Add the numeric household code to the lookup table as a new column.
code_HOUSEHOLD_TYPE['HOUSEHOLD_TYPE_num'] = code_HOUSEHOLD_TYPE['Code'].apply(household_num)
# Overwrite HOUSEHOLD_TYPE in learn_data with the value household_num returns for each row.
# NOTE(review): the column is replaced in place, so this cell is presumably
# meant to run once per kernel session — confirm.
learn_data['HOUSEHOLD_TYPE'] = learn_data['HOUSEHOLD_TYPE'].apply(household_num)

In [None]:
# Keep the raw job codes before they are replaced by their numeric part.
for raw_job_column in ("JOB_42", "FORMER_JOB_42"):
    learn_data[raw_job_column + "_og"] = learn_data[raw_job_column]

# Numeric part of the current job code.
# NOTE(review): this is a plain `int` cast, unlike the nullable "Int64" used for
# the columns below, so it fails if any JOB_42 value is missing — confirm that
# JOB_42 is always populated.
learn_data["JOB_42"] = learn_data["JOB_42"].str.extract(r'csp_(\d+)_')[0].astype(int)

# Numeric part of the remaining coded columns, as nullable integers.
for coded_column, code_pattern in (
    ("FORMER_JOB_42", r'csp_(\d+)_'),
    ("employee_count", r'tr_(\d)'),
    ("Employer_category", r'ct_(\d)'),
):
    learn_data[coded_column] = (
        learn_data[coded_column].str.extract(code_pattern)[0].astype("Int64")
    )

# Attach the columns of the economic-sector code table.
learn_data = learn_data.merge(
    code_Economic_sector, left_on="Economic_sector", right_on="Code", how="left"
)

# Replace the detailed work description (matched on N3) by the digit of its N1 level.
learn_data = (
    learn_data
    .merge(code_work_description_map, left_on="work_description", right_on="N3", how="left")
    .drop(columns=["work_description", "N3", "N2"])
)
learn_data["N1"] = learn_data["N1"].str.extract(r'csp_(\d)')[0].astype("Int64")
learn_data = learn_data.rename(columns={"N1": "work_description"})

# Fill gaps in the current-job columns from their retiree counterparts.
for target_column, fallback_column in (
    ("emp_contract", "former_emp_contract"),
    ("Pay", "RETIREMENT_PAY"),
):
    learn_data[target_column] = combine_columns(
        learn_data[target_column], learn_data[fallback_column]
    )

# Candidate indicator features, currently disabled:
#learn_data['is_retired'] = (learn_data['JOB_42'] == 7).astype(int)
#learn_data['is_unemployed'] = (learn_data['act'].str.startswith('TACT2_') & (learn_data['act'] != 'TACT2_1')).astype(int)
#learn_data['is_unemployed'] = (learn_data['act'] == 'TACT1_2').astype(int)

In [None]:
def categorize_retirement_age():
    """Bucket ``learn_data['retirement_age']`` into ``'retirement_age_cat'``.

    Works on the module-level ``learn_data`` frame:

    * ``retirement_age`` is coerced to numeric (unparseable values become NaN);
    * ``retirement_age_cat`` is written as an object column holding one of
      '<57', '57-59', '60', '61-62', '63-64', '65+', or NaN where the age is
      missing or below 0. Intervals are closed on the left.
    """
    global learn_data  # the module-level frame is updated in place

    edges = [0, 57, 60, 61, 63, 65, float('inf')]
    names = ['<57', '57-59', '60', '61-62', '63-64', '65+']

    ages = pd.to_numeric(learn_data['retirement_age'], errors='coerce')
    learn_data['retirement_age'] = ages

    # Plain object dtype rather than categorical, so missing values stay plain NaN.
    buckets = pd.cut(
        ages,
        bins=edges,
        labels=names,
        right=False,
        include_lowest=True,
    ).astype(object)

    # Age exactly 60 is forced to the '60' bucket (the left-closed bins already put it there).
    buckets[ages == 60] = '60'

    learn_data['retirement_age_cat'] = buckets

# Add the retirement_age_cat column to the global learn_data frame
categorize_retirement_age()

In [None]:
# types
# sex -> integer codes in order of first appearance; studying -> plain int64
learn_data["sex"] = pd.factorize(learn_data["sex"])[0]
learn_data["studying"] = learn_data["studying"].astype("int64")

# Missing sports category becomes 0.
# Alternative: pd.to_numeric(learn_data["Sports_Category"], errors='coerce').astype("Int64")
learn_data["Sports_Category"] = learn_data["Sports_Category"].fillna(0).astype("int64")

# Columns that may contain gaps: coerce to numeric, then to nullable integers.
for nullable_int_column in (
    "REG_JOB",
    "REG_FORMER",
    "retirement_age",
    "WORKING_HOURS",
    "Economic_sector_num",
    "Pay",
):
    learn_data[nullable_int_column] = pd.to_numeric(
        learn_data[nullable_int_column], errors='coerce'
    ).astype('Int64')

In [None]:
def replace_na_with_category(column_name):
    """Replace missing values of a ``learn_data`` column with a status label.

    The column is converted to categorical and its missing entries are
    labelled, in this order of priority:

    1. ``'Retired_Missing'``  where ``JOB_42 == 7``;
    2. ``'Employed_Missing'`` where ``act_num == 1``;
    3. ``'Unemployed'``       where ``JOB_42 == 8`` or ``act_num == 2``.

    Missing entries matching none of these rules stay missing.

    Parameters
    ----------
    column_name : str
        Name of the column of the module-level ``learn_data`` frame to update.
        The frame is modified in place; nothing is returned.
    """
    global learn_data  # Ensures we modify the global learn_data directly

    additional_categories = ['Unemployed', 'Retired_Missing', 'Employed_Missing']

    column = learn_data[column_name].astype('category')

    # add_categories() raises ValueError for labels that already exist, which
    # made re-running the cell fail. Only register the labels still missing.
    new_categories = [
        label for label in additional_categories if label not in column.cat.categories
    ]
    if new_categories:
        column = column.cat.add_categories(new_categories)
    learn_data[column_name] = column

    # The "still missing" mask is recomputed on every line, so an earlier rule
    # takes precedence over a later one.
    learn_data.loc[(learn_data[column_name].isna()) & (learn_data['JOB_42'] == 7), column_name] = 'Retired_Missing'
    learn_data.loc[(learn_data[column_name].isna()) & (learn_data['act_num'] == 1), column_name] = 'Employed_Missing'
    learn_data.loc[(learn_data[column_name].isna()) & ((learn_data['JOB_42'] == 8) | (learn_data['act_num'] == 2)), column_name] = 'Unemployed'

# Label the remaining gaps of the job-related columns by employment status.
for job_related_column in (
    "emp_contract",
    "TYPE_OF_CONTRACT",
    "WORK_CONDITION",
    "labor_force_status",
    "Economic_sector_num",
    "REG_JOB",
    "REG_FORMER",
    "work_description",
    "retirement_age_cat",
):
    replace_na_with_category(job_related_column)

# To be done last, after imputing (these two must stay numeric until then):
#replace_na_with_category("Employer_category") #need numbers only
#replace_na_with_category("employee_count")

In [None]:
# NOTE(review): `jidijmqzeswf` is not defined anywhere in the visible notebook, so
# this cell raises NameError when executed. It looks like stray keystrokes or a
# manual "stop here" marker for Run All — confirm and remove it, or replace it
# with an explicit, documented stop.
jidijmqzeswf