In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
from sklearn import preprocessing
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.preprocessing import LabelEncoder

## Load data

In [613]:
for file_name in os.listdir("files"):
    if file_name.endswith('.csv'):
        file_path = os.path.join("files", file_name)

        df_name = os.path.splitext(file_name)[0]
        globals()[df_name] = pd.read_csv(file_path)


## SANITY CHECK

In [9]:
#NO MISTAKE IN RETIREES CLASSIFICATION ?

ids_1 = set(learn_dataset_retired_pension["PRIMARY_KEY"])
ids_2 = set(learn_dataset_retired_jobs["PRIMARY_KEY"])
ids_3 = set(learn_dataset_retired_former["PRIMARY_KEY"])

# Step 2: Check the equality of the IDs between datasets
def check_sanity(ids_a, ids_b, dataset_a, dataset_b):
    if ids_a == ids_b:
        print(f"Sanity check passed: {dataset_a} and {dataset_b} contain the same individuals.")
    else:
        print(f"Sanity check failed: There are differences in the individuals from {dataset_a} and {dataset_b}")

check_sanity(ids_1, ids_2, "Dataset 1", "Dataset 2")
check_sanity(ids_1, ids_3, "Dataset 1", "Dataset 3")
check_sanity(ids_2, ids_3, "Dataset 2", "Dataset 3")

not_common_ids_1 = ids_1.symmetric_difference(ids_3)
print(f"Number of people that are not common to dataset 1 and 3: {len(not_common_ids_1)}")

not_common_ids_2 = ids_2.symmetric_difference(ids_3)
print(f"Number of people that are not common to dataset 2 and 3: {len(not_common_ids_2)}")

print("These individuals are in dataset3 but not in dataset1:", ids_3 - ids_1)
print("These individuals are in dataset3 but not in dataset2:", ids_3 - ids_2)

#We are OK, it just that the the dataset 3 has more data. 


Sanity check passed: Dataset 1 and Dataset 2 contain the same individuals.
Sanity check failed: There are differences in the individuals from Dataset 1 and Dataset 3
Sanity check failed: There are differences in the individuals from Dataset 2 and Dataset 3
Number of people that are not common to dataset 1 and 3: 2083
Number of people that are not common to dataset 2 and 3: 2083
These individuals are in dataset3 but not in dataset1: {16389, 40966, 8202, 32783, 32784, 8207, 65556, 40989, 40996, 8231, 24621, 8238, 16438, 90170, 32835, 24656, 41044, 41047, 41051, 82011, 32862, 41054, 41056, 41058, 41065, 41067, 73841, 73842, 41078, 90232, 8318, 24707, 90245, 138, 90253, 16528, 49318, 82092, 8369, 24754, 90293, 16571, 49340, 82111, 195, 204, 210, 65749, 57559, 41178, 98525, 49386, 65772, 57589, 98550, 24833, 262, 263, 82184, 65803, 49421, 82189, 82192, 8467, 82197, 310, 82237, 8511, 33090, 74051, 16713, 57680, 24918, 57695, 8544, 33122, 74085, 358, 16743, 362, 41323, 57709, 90488, 90489, 57

## Simplification of categories

In [615]:
# sports
learn_dataset_sport = pd.merge(learn_dataset_sport, code_Sports, left_on="Sports", right_on="Code")
learn_dataset_sport["Sports_Category"] = learn_dataset_sport["Categorie"]
learn_sports = learn_dataset_sport[["PRIMARY_KEY", "Sports_Category"]]

# departments into regions
def merge_and_extract_region(df, merge_column, region_column_name):
    df = pd.merge(df, departments, left_on=merge_column, right_on="DEP")
    df[region_column_name] = df["REG"]
    df.drop(["Nom du département", "REG", "DEP", merge_column], axis=1, inplace=True)
    return df


learn_dataset_job = merge_and_extract_region(
    learn_dataset_job, merge_column="JOB_DEP", region_column_name="REG_JOB"
)

learn_dataset_retired_jobs = merge_and_extract_region(
    learn_dataset_retired_jobs, merge_column="JOB_DEP", region_column_name="REG_JOB"
)

learn_dataset_retired_jobs = merge_and_extract_region(
    learn_dataset_retired_jobs, merge_column="FORMER_DEP", region_column_name="REG_FORMER"
)

In [617]:
#Economic sector into fewer categories (and numeric instead of object/string)
def sector_mapping(nace_code):
    if nace_code == "AZ":  
        return "Agriculture, forestry and fishing)"
    elif "BZ" <= nace_code <= "EZ":
        return "Manufacturing, mining and quarrying and other industrial activities"
    elif nace_code == "FZ": 
        return "Construction"
    elif "GZ" <= nace_code <= "IZ":  
        return "Wholesale and retail trade, transportation and storage, accommodation and food service activities"
    elif "JA" <= nace_code <= "JC":
        return "Information and communication"
    elif nace_code == "KZ": 
        return "Financial and insurance activities"
    elif nace_code == "LZ": 
        return "Real estate activities"
    elif "MA" <= nace_code <= "NZ":
        return "Professional, scientific, technical, administrative and support service activities"
    elif "OZ" <= nace_code <= "QB":
        return "Public administration and defence, education, human health and social work activities"
    elif "RZ" <= nace_code <= "UZ":
        return "Other services activities"
    else:
        return "Unknown Sector"

code_Economic_sector["Nomenclature"] = code_Economic_sector["Code"].map(sector_mapping)
code_Economic_sector["Economic_sector_num"] = pd.factorize(code_Economic_sector["Nomenclature"])[0] + 1

#code_emp_contract["emp_contract_num"] = pd.factorize(code_emp_contract["Code"])[0] + 1
code_HIGHEST_CREDENTIAL["HIGHEST_CREDENTIAL_num"] = pd.factorize(code_HIGHEST_CREDENTIAL["Code"])[0] + 1
code_act["act_num"] = pd.factorize(code_act["Code"])[0] + 1

#learn_dataset_emp_contract = pd.merge(learn_dataset_emp_contract, code_emp_contract, left_on="emp_contract",  right_on="Code")
#learn_dataset_emp_contract.drop(["Code", "Libellé"], axis=1, inplace=True)
#replace this with one-hot-encoding

## Merging

In [620]:
learn_data = learn_dataset
learn_data = learn_data[['target'] + [col for col in learn_data.columns if col != 'target']]

learn_data = pd.merge(learn_data, code_act, left_on="act", right_on="Code", how="left")
learn_data.drop(["Code", "Libellé"], axis=1, inplace=True)
learn_data = pd.merge(learn_data, code_HIGHEST_CREDENTIAL, left_on="HIGHEST_CREDENTIAL", right_on="Code", how="left")
learn_data.drop(["Code", "Libellé", "HIGHEST_CREDENTIAL"], axis=1, inplace=True)

learn_data = pd.merge(learn_data, city_pop, on="INSEE_CODE", how="left")
learn_data = pd.merge(learn_data, city_loc, on="INSEE_CODE", how="left")

In [622]:
learn_dfs = [learn_dataset_emp_contract, learn_dataset_job, learn_dataset_retired_former, learn_dataset_retired_jobs, learn_dataset_retired_pension, learn_sports]

for df in learn_dfs:
    learn_data = pd.merge(learn_data, df, on="PRIMARY_KEY", how="outer")

In [624]:
def combine_columns(col_x, col_y):
    return col_x.fillna(col_y) if col_y is not None else col_x

for column in learn_data.columns:
    if column.endswith('_x'):
        base_column = column[:-2]  # Remove `_x` suffix
        y_column = base_column + '_y'
        if y_column in learn_data.columns:
            # Combine the columns
            learn_data[base_column] = combine_columns(learn_data[column], learn_data[y_column])
            # Drop the original `_x` and `_y` columns
            learn_data = learn_data.drop(columns=[column, y_column])


## Formatting

In [627]:
def household_num(value):
    parts = value.split('|')  # Split the value by '|'
    if parts[1] in {'1', '2', '3'}:  # For M|1|-- to M|3|--
        return int(parts[1])
    elif parts[1] == '4':  # For M|4|1 to M|4|4
        return 4 + (int(parts[2]) - 1)  # 4 + (1-1), 4 + (2-1), etc.
    return None  # Handle unexpected cases gracefully

code_HOUSEHOLD_TYPE['HOUSEHOLD_TYPE_num'] = code_HOUSEHOLD_TYPE['Code'].apply(household_num)
learn_data['HOUSEHOLD_TYPE'] = learn_data['HOUSEHOLD_TYPE'].apply(household_num)


In [629]:
learn_data["JOB_42_og"] = learn_data["JOB_42"]
learn_data["JOB_42"] = learn_data["JOB_42"].str.extract(r'csp_(\d+)_')[0].astype(int)
learn_data["FORMER_JOB_42"] = learn_data["FORMER_JOB_42"].str.extract(r'csp_(\d+)_')[0].astype("Int64")
learn_data["employee_count"] = learn_data["employee_count"].str.extract(r'tr_(\d)')[0].astype("Int64")
learn_data["Employer_category"] = learn_data["Employer_category"].str.extract(r'ct_(\d)')[0].astype("Int64")

learn_data = pd.merge(learn_data, code_Economic_sector, left_on="Economic_sector", right_on="Code", how="left")

learn_data = pd.merge(learn_data, code_work_description_map, left_on="work_description", right_on="N3", how="left")
learn_data.drop(["work_description", "N3", "N2"], axis=1, inplace=True)
learn_data["N1"] = learn_data["N1"].str.extract(r'csp_(\d)')[0].astype("Int64")
learn_data.rename(columns={"N1": "work_description"}, inplace=True)

learn_data["emp_contract"] = combine_columns(learn_data["emp_contract"], learn_data["former_emp_contract"])
learn_data["Pay"] = combine_columns(learn_data["Pay"], learn_data["RETIREMENT_PAY"])
#learn_data['is_retired'] = (learn_data['JOB_42'] == 7).astype(int)
#learn_data['is_unemployed'] = (learn_data['act'].str.startswith('TACT2_') & (learn_data['act'] != 'TACT2_1')).astype(int)
#learn_data['is_unemployed'] = (learn_data['act'] == 'TACT1_2').astype(int)

In [630]:
def categorize_retirement_age():
    global learn_data  # Modify the global learn_data DataFrame
    
    # Ensure the retirement_age column is numeric
    learn_data['retirement_age'] = pd.to_numeric(learn_data['retirement_age'], errors='coerce')
    
    # Define the bins and corresponding labels
    bins = [0, 57, 60, 61, 63, 65, float('inf')]  # Specify edges for the ranges
    labels = ['<57', '57-59', '60', '61-62', '63-64', '65+']  # Labels for ranges

    # Initial categorization with pd.cut
    learn_data['retirement_age_cat'] = pd.cut(
        learn_data['retirement_age'], 
        bins=bins, 
        labels=labels, 
        right=False,  # Left-closed intervals
        include_lowest=True
    )


    # Ensure missing values in retirement_age_cat are handled properly
    learn_data['retirement_age_cat'] = learn_data['retirement_age_cat'].astype(object)  # Avoid ambiguity with NA
    
    # Handle exact matches for 60 and 65
    learn_data.loc[learn_data['retirement_age'] == 60, 'retirement_age_cat'] = '60'

# Call the function
categorize_retirement_age()

In [631]:
# types
learn_data["sex"] = pd.factorize(learn_data["sex"])[0]
learn_data["studying"] = learn_data["studying"].astype("int64")
#or learn_data["Sports_Category"] = pd.to_numeric(learn_data["Sports_Category"], errors='coerce').astype("Int64")
learn_data["Sports_Category"] = learn_data["Sports_Category"].fillna(0).astype("int64")
learn_data["REG_JOB"] = pd.to_numeric(learn_data["REG_JOB"], errors='coerce').astype('Int64')
learn_data["REG_FORMER"] = pd.to_numeric(learn_data["REG_FORMER"], errors='coerce').astype('Int64')
learn_data["retirement_age"] = pd.to_numeric(learn_data["retirement_age"], errors='coerce').astype('Int64')
learn_data["WORKING_HOURS"] = pd.to_numeric(learn_data["WORKING_HOURS"], errors='coerce').astype('Int64')
learn_data["Economic_sector_num"] = pd.to_numeric(learn_data["Economic_sector_num"], errors='coerce').astype('Int64')
learn_data["Pay"] = pd.to_numeric(learn_data["Pay"], errors='coerce').astype('Int64')


In [635]:
def replace_na_with_category(column_name):
    global learn_data  # Ensures we modify the global learn_data directly

    # Convert the column to categorical
    learn_data[column_name] = learn_data[column_name].astype('category')
    
    # Define categories to add
    additional_categories = ['Unemployed', 'Retired_Missing', 'Employed_Missing']
    
    # Add the specified categories
    learn_data[column_name] = learn_data[column_name].cat.add_categories(additional_categories)
    
    learn_data.loc[(learn_data[column_name].isna()) & (learn_data['JOB_42'] == 7), column_name] = 'Retired_Missing'
    learn_data.loc[(learn_data[column_name].isna()) & (learn_data['act_num'] == 1), column_name] = 'Employed_Missing'
    learn_data.loc[(learn_data[column_name].isna()) & ((learn_data['JOB_42'] == 8) | (learn_data['act_num'] == 2)), column_name] = 'Unemployed'
    #learn_data[column_name] = learn_data[column_name].fillna("Unemployed")

replace_na_with_category("emp_contract")
replace_na_with_category("TYPE_OF_CONTRACT")
replace_na_with_category("WORK_CONDITION")
replace_na_with_category("labor_force_status")
replace_na_with_category("Economic_sector_num")
replace_na_with_category("REG_JOB")
replace_na_with_category("REG_FORMER")
replace_na_with_category("work_description")
replace_na_with_category("retirement_age_cat")
#do last after imputing
#replace_na_with_category("Employer_category") #need numbers only
#replace_na_with_category("employee_count")

do one-hot-encoding for WORK_CONDITION, TYPE_OF_CONTRACT, labor_force_status
--should we just do one hot encoding for every vategorical var? even sex?

for all now numerical cats but with missing values, can do +1 and make fillna as 0

note: TACT2_3 doesn't exist in dataset - no under 14 year olds

In [638]:
learn_data['Employer_category'] = learn_data.apply(
    lambda row: 10 if pd.isna(row['Employer_category']) and (row['JOB_42'] == 8 or row['act_num'] == 2) #unemployed
    else (0 if pd.isna(row['Employer_category']) else row['Employer_category']), #employed and retired missing
    axis=1
)
learn_data['employee_count'] = learn_data.apply(
    lambda row: 7 if pd.isna(row['employee_count']) and (row['JOB_42'] == 8 or row['act_num'] == 2)
    else (0 if pd.isna(row['employee_count']) else row['employee_count']),
    axis=1
)

In [639]:
#learn_data.loc[learn_data['JOB_42'].str.startswith('csp_7', na=False), 'JOB_42'] = learn_data['FORMER_JOB_42']
# Replace values in 'JOB_42' with 'FORMER_JOB_42' where JOB_42 == 7
learn_data.loc[learn_data['JOB_42'] == 7, 'JOB_42'] = learn_data['FORMER_JOB_42']
learn_data.loc[(learn_data['emp_contract'] == 'Unemployed') & (learn_data['Pay'].isna()), 'Pay'] = 0
learn_data.loc[(learn_data['emp_contract'] == 'Unemployed') & (learn_data['WORKING_HOURS'].isna()), 'WORKING_HOURS'] = 0

learn_data = learn_data.drop(columns=["act", "former_emp_contract", "RETIREMENT_PAY", "retirement_age", "FORMER_JOB_42", "Economic_sector", "Code", "Libellé", "Nomenclature", "X", "Y"])
#or keep nomenclature, remove economic_sector_num
#remove INSEE_CODE?

# Handling Missing Data

In [643]:
def missing_values_table(df):
        mis_val = df.isnull().sum()
        mis_val_percent = 100 * df.isnull().sum() / len(df)  #% of missing values
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)  #create result table
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        print ("There are " + str(mis_val_table_ren_columns.shape[0]) +
              " columns that have missing values.")
        return mis_val_table_ren_columns

print(missing_values_table(learn_data)) 

There are 2 columns that have missing values.
               Missing Values  % of Total Values
WORKING_HOURS            7345               14.7
Pay                      6961               13.9


In [645]:
learn_data.dtypes # pay should be int64 because always .0

target                     float64
PRIMARY_KEY                  int64
sex                          int64
JOB_42                       int64
studying                     int64
INSEE_CODE                  object
age_2020                     int64
HOUSEHOLD_TYPE               int64
act_num                      int64
HIGHEST_CREDENTIAL_num       int64
RESIDENTS                    int64
Lat                        float64
long                       float64
emp_contract              category
Pay                          Int64
REG_FORMER                category
Sports_Category              int64
Employer_category            int64
employee_count               int64
TYPE_OF_CONTRACT          category
WORK_CONDITION            category
labor_force_status        category
WORKING_HOURS                Int64
REG_JOB                   category
JOB_42_og                   object
Economic_sector_num       category
work_description          category
retirement_age_cat        category
dtype: object

## replace with mean

### for working hours

In [649]:
## for working hours, ignore the 0 values to not bias the mean
#factorise to impute then delete the factorised col

In [651]:
#avg_working_hours = learn_data.groupby("JOB_42")["WORKING_HOURS"].mean().round()
avg_working_hours = learn_data.loc[learn_data["WORKING_HOURS"] != 0].groupby("JOB_42_og")["WORKING_HOURS"].mean().round()

avg_working_hours

JOB_42_og
csp_3_1    1239.0
csp_3_3    1728.0
csp_3_4    1656.0
csp_3_5    1258.0
csp_3_7    1649.0
csp_3_8    1673.0
csp_4_2    1477.0
csp_4_3    1549.0
csp_4_4    1859.0
csp_4_5    1653.0
csp_4_6    1557.0
csp_4_7    1648.0
csp_4_8    1616.0
csp_5_2    1477.0
csp_5_3    1633.0
csp_5_4    1428.0
csp_5_5    1310.0
csp_5_6    1221.0
csp_6_2    1521.0
csp_6_3    1337.0
csp_6_4    1537.0
csp_6_5    1482.0
csp_6_7    1341.0
csp_6_8    1206.0
csp_6_9    1301.0
csp_7_4    1539.0
csp_7_5    1424.0
csp_7_7    1312.0
csp_7_8    1280.0
Name: WORKING_HOURS, dtype: Float64

In [653]:
#learn_data["WORKING_HOURS"] = learn_data.apply(
#    lambda row: avg_working_hours[row["JOB_42"]] if pd.isnull(row["WORKING_HOURS"]) else row["WORKING_HOURS"],
#    axis=1
#)

learn_data["WORKING_HOURS"] = learn_data.apply(
    lambda row: avg_working_hours.get(row["JOB_42_og"], np.nan) if pd.isnull(row["WORKING_HOURS"]) else row["WORKING_HOURS"],
    axis=1
)
learn_data

Unnamed: 0,target,PRIMARY_KEY,sex,JOB_42,studying,INSEE_CODE,age_2020,HOUSEHOLD_TYPE,act_num,HIGHEST_CREDENTIAL_num,...,employee_count,TYPE_OF_CONTRACT,WORK_CONDITION,labor_force_status,WORKING_HOURS,REG_JOB,JOB_42_og,Economic_sector_num,work_description,retirement_age_cat
0,9.367020,1,0,5,0,01004,34,3,1,10,...,1,CDI,C,O,1470.0,84,csp_5_4,7,5,Employed_Missing
1,8.648771,5,1,6,0,01004,80,7,3,6,...,1,CDI,C,O,793.0,44,csp_7_8,4,6,60
2,10.792503,7,0,3,0,01004,63,4,1,11,...,0,Employed_Missing,Employed_Missing,Employed_Missing,1239.0,Employed_Missing,csp_3_1,Employed_Missing,Employed_Missing,Employed_Missing
3,8.508222,8,1,3,0,01004,50,4,1,9,...,1,CDI,C,O,1201.0,11,csp_3_7,2,3,Employed_Missing
4,8.966272,10,0,5,0,01004,68,7,2,6,...,7,Unemployed,Unemployed,Unemployed,0.0,Unemployed,csp_5_6,Unemployed,Unemployed,Unemployed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50038,11.154438,100077,0,8,1,95680,29,5,6,6,...,7,Unemployed,Unemployed,Unemployed,0.0,Unemployed,csp_8_5,Unemployed,Unemployed,Unemployed
50039,9.962345,100078,0,4,0,95680,34,4,1,9,...,6,CDI,P,O,665.0,11,csp_4_3,9,4,Employed_Missing
50040,12.226288,100079,0,5,0,95680,29,6,1,8,...,4,CDI,C,O,3000.0,11,csp_5_2,9,5,Employed_Missing
50041,8.965529,100081,1,8,0,95680,57,3,7,6,...,7,Unemployed,Unemployed,Unemployed,0.0,Unemployed,csp_8_5,Unemployed,Unemployed,Unemployed


### for pay

In [656]:
avg_pay = learn_data.loc[learn_data["Pay"] != 0].groupby("JOB_42_og")["Pay"].mean()
avg_pay


JOB_42_og
csp_3_1      28607.9375
csp_3_3     36405.16955
csp_3_4    35958.044723
csp_3_5    29092.817308
csp_3_7    41752.871637
csp_3_8    41338.334173
csp_4_2    20698.882825
csp_4_3    23091.478704
csp_4_4         19375.0
csp_4_5    22239.081448
csp_4_6    25706.552411
csp_4_7    27873.100868
csp_4_8    28345.384804
csp_5_2     17022.07322
csp_5_3    18430.684343
csp_5_4    19050.509905
csp_5_5    14608.040992
csp_5_6    10638.242279
csp_6_2    20887.796974
csp_6_3    16098.589958
csp_6_4    18032.440208
csp_6_5    19320.407285
csp_6_7    15040.540305
csp_6_8    11910.141369
csp_6_9    14116.090909
csp_7_4    29029.940299
csp_7_5     21413.79526
csp_7_7    15494.445429
csp_7_8    17560.450934
Name: Pay, dtype: Float64

In [658]:
learn_data["Pay"] = learn_data.apply(
    lambda row: avg_working_hours.get(row["JOB_42_og"], np.nan) if pd.isnull(row["Pay"]) else row["Pay"],
    axis=1
)

learn_data

Unnamed: 0,target,PRIMARY_KEY,sex,JOB_42,studying,INSEE_CODE,age_2020,HOUSEHOLD_TYPE,act_num,HIGHEST_CREDENTIAL_num,...,employee_count,TYPE_OF_CONTRACT,WORK_CONDITION,labor_force_status,WORKING_HOURS,REG_JOB,JOB_42_og,Economic_sector_num,work_description,retirement_age_cat
0,9.367020,1,0,5,0,01004,34,3,1,10,...,1,CDI,C,O,1470.0,84,csp_5_4,7,5,Employed_Missing
1,8.648771,5,1,6,0,01004,80,7,3,6,...,1,CDI,C,O,793.0,44,csp_7_8,4,6,60
2,10.792503,7,0,3,0,01004,63,4,1,11,...,0,Employed_Missing,Employed_Missing,Employed_Missing,1239.0,Employed_Missing,csp_3_1,Employed_Missing,Employed_Missing,Employed_Missing
3,8.508222,8,1,3,0,01004,50,4,1,9,...,1,CDI,C,O,1201.0,11,csp_3_7,2,3,Employed_Missing
4,8.966272,10,0,5,0,01004,68,7,2,6,...,7,Unemployed,Unemployed,Unemployed,0.0,Unemployed,csp_5_6,Unemployed,Unemployed,Unemployed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50038,11.154438,100077,0,8,1,95680,29,5,6,6,...,7,Unemployed,Unemployed,Unemployed,0.0,Unemployed,csp_8_5,Unemployed,Unemployed,Unemployed
50039,9.962345,100078,0,4,0,95680,34,4,1,9,...,6,CDI,P,O,665.0,11,csp_4_3,9,4,Employed_Missing
50040,12.226288,100079,0,5,0,95680,29,6,1,8,...,4,CDI,C,O,3000.0,11,csp_5_2,9,5,Employed_Missing
50041,8.965529,100081,1,8,0,95680,57,3,7,6,...,7,Unemployed,Unemployed,Unemployed,0.0,Unemployed,csp_8_5,Unemployed,Unemployed,Unemployed


## IMPUTATION

### For employee_category (does this make sense for people that never worked i.e students ??)

In [662]:
#make all relevant (categorical) cols numerical
learn_data['employee_count_num'] = LabelEncoder().fit_transform(learn_data['employee_count'])
learn_data['Employer_category_num'] = LabelEncoder().fit_transform(learn_data['Employer_category'])
learn_data['retirement_age_cat_num'] = LabelEncoder().fit_transform(learn_data['retirement_age_cat'])
learn_data['REG_JOB_num'] = pd.factorize(learn_data['REG_JOB'])[0]
#learn_data['REG_JOB_num'] = LabelEncoder().fit_transform(learn_data['REG_JOB'])
#learn_data['retirement_age_cat_num'] = LabelEncoder().fit_transform(learn_data['retirement_age_cat'])
#learn_data['Employer_category_num']

In [664]:
learn_data['Employer_category_encoded'] = learn_data['Employer_category_num'].map(lambda x: np.nan if x==0 else x)
df_train_2 = learn_data.loc[:,["PRIMARY_KEY","Employer_category_encoded", "studying", "WORKING_HOURS", "age_2020", "Pay", "retirement_age_cat_num",
                             "Sports_Category", "REG_JOB_num"]]
df_train_2.head()

Unnamed: 0,PRIMARY_KEY,Employer_category_encoded,studying,WORKING_HOURS,age_2020,Pay,retirement_age_cat_num,Sports_Category,REG_JOB_num
0,1,9.0,0,1470.0,34,17011.0,6,1,0
1,5,9.0,0,793.0,80,17598.0,1,0,1
2,7,,0,1239.0,63,1239.0,6,0,2
3,8,9.0,0,1201.0,50,57220.0,6,0,3
4,10,10.0,0,0.0,68,0.0,7,0,4


In [666]:
imputer = IterativeImputer(random_state=200)
imputer.fit(df_train_2)
df_imputed_2 = imputer.transform(df_train_2)
#knn_imputer = KNNImputer(n_neighbors=5, weights="uniform")
#knn_imputer.fit(df_train_2)
#df_imputed_2 = knn_imputer.transform(df_train_2)

In [667]:
learn_data.loc[:,"Employer_category_encoded"] = df_imputed_2[:,1].round().astype(int)
learn_data

#why float ?

Unnamed: 0,target,PRIMARY_KEY,sex,JOB_42,studying,INSEE_CODE,age_2020,HOUSEHOLD_TYPE,act_num,HIGHEST_CREDENTIAL_num,...,REG_JOB,JOB_42_og,Economic_sector_num,work_description,retirement_age_cat,employee_count_num,Employer_category_num,retirement_age_cat_num,REG_JOB_num,Employer_category_encoded
0,9.367020,1,0,5,0,01004,34,3,1,10,...,84,csp_5_4,7,5,Employed_Missing,1,9,6,0,9.0
1,8.648771,5,1,6,0,01004,80,7,3,6,...,44,csp_7_8,4,6,60,1,9,1,1,9.0
2,10.792503,7,0,3,0,01004,63,4,1,11,...,Employed_Missing,csp_3_1,Employed_Missing,Employed_Missing,Employed_Missing,0,0,6,2,9.0
3,8.508222,8,1,3,0,01004,50,4,1,9,...,11,csp_3_7,2,3,Employed_Missing,1,9,6,3,9.0
4,8.966272,10,0,5,0,01004,68,7,2,6,...,Unemployed,csp_5_6,Unemployed,Unemployed,Unemployed,7,10,7,4,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50038,11.154438,100077,0,8,1,95680,29,5,6,6,...,Unemployed,csp_8_5,Unemployed,Unemployed,Unemployed,7,10,7,4,10.0
50039,9.962345,100078,0,4,0,95680,34,4,1,9,...,11,csp_4_3,9,4,Employed_Missing,6,8,6,3,8.0
50040,12.226288,100079,0,5,0,95680,29,6,1,8,...,11,csp_5_2,9,5,Employed_Missing,4,8,6,3,8.0
50041,8.965529,100081,1,8,0,95680,57,3,7,6,...,Unemployed,csp_8_5,Unemployed,Unemployed,Unemployed,7,10,7,4,10.0


### FOR employee_count

In [671]:
#no longer necessary? - employee count now numerical
#le = preprocessing.LabelEncoder()
#le.fit(learn_data.loc[:,"employee_count"])
#dict(zip(le.classes_, le.transform(le.classes_)))

In [673]:
#learn_data['employee_count_encoded'] = le.transform(learn_data['employee_count'])

In [675]:
learn_data['employee_count_encoded'] = learn_data['employee_count_num'].map(lambda x: np.nan if x==0 else x)

df_train_1 = learn_data.loc[:,["PRIMARY_KEY", "employee_count_encoded", "studying", "WORKING_HOURS", "age_2020", "Pay", "retirement_age_cat_num",
                             "Sports_Category", "REG_JOB_num", "Lat", "long"]]
df_train_1.head()

Unnamed: 0,PRIMARY_KEY,employee_count_encoded,studying,WORKING_HOURS,age_2020,Pay,retirement_age_cat_num,Sports_Category,REG_JOB_num,Lat,long
0,1,1.0,0,1470.0,34,17011.0,6,1,0,45.960848,5.372926
1,5,1.0,0,793.0,80,17598.0,1,0,1,45.960848,5.372926
2,7,,0,1239.0,63,1239.0,6,0,2,45.960848,5.372926
3,8,1.0,0,1201.0,50,57220.0,6,0,3,45.960848,5.372926
4,10,7.0,0,0.0,68,0.0,7,0,4,45.960848,5.372926


In [677]:
imputer = IterativeImputer(random_state=100)
imputer.fit(df_train_1)
df_imputed_1 = imputer.transform(df_train_1)
#knn_imputer = KNNImputer(n_neighbors=5, weights="uniform")
#knn_imputer.fit(df_train)
#df_imputed = knn_imputer.transform(df_train)

In [678]:
learn_data.loc[:,"employee_count_encoded"] = df_imputed_1[:,1].round().astype(int)
#count_imputed = list(le.inverse_transform(learn_data['employee_count_encoded'].round().astype('int')))
#learn_data["employee_count_encoded"] = count_imputed

## checks

In [682]:
learn_data.dtypes

target                        float64
PRIMARY_KEY                     int64
sex                             int64
JOB_42                          int64
studying                        int64
INSEE_CODE                     object
age_2020                        int64
HOUSEHOLD_TYPE                  int64
act_num                         int64
HIGHEST_CREDENTIAL_num          int64
RESIDENTS                       int64
Lat                           float64
long                          float64
emp_contract                 category
Pay                           float64
REG_FORMER                   category
Sports_Category                 int64
Employer_category               int64
employee_count                  int64
TYPE_OF_CONTRACT             category
WORK_CONDITION               category
labor_force_status           category
WORKING_HOURS                 float64
REG_JOB                      category
JOB_42_og                      object
Economic_sector_num          category
work_descrip

In [684]:
learn_data.drop(["employee_count", "Employer_category", "RESIDENTS", "Lat", "long", "INSEE_CODE", "Employer_category_num", "employee_count_num", "retirement_age_cat_num", "REG_JOB_num", "JOB_42_og"], axis=1, inplace=True)

In [686]:
learn_data

Unnamed: 0,target,PRIMARY_KEY,sex,JOB_42,studying,age_2020,HOUSEHOLD_TYPE,act_num,HIGHEST_CREDENTIAL_num,emp_contract,...,TYPE_OF_CONTRACT,WORK_CONDITION,labor_force_status,WORKING_HOURS,REG_JOB,Economic_sector_num,work_description,retirement_age_cat,Employer_category_encoded,employee_count_encoded
0,9.367020,1,0,5,0,34,3,1,10,EMP1-6,...,CDI,C,O,1470.0,84,7,5,Employed_Missing,9.0,1.0
1,8.648771,5,1,6,0,80,7,3,6,EMP1-6,...,CDI,C,O,793.0,44,4,6,60,9.0,1.0
2,10.792503,7,0,3,0,63,4,1,11,EMP2-2,...,Employed_Missing,Employed_Missing,Employed_Missing,1239.0,Employed_Missing,Employed_Missing,Employed_Missing,Employed_Missing,9.0,4.0
3,8.508222,8,1,3,0,50,4,1,9,EMP1-6,...,CDI,C,O,1201.0,11,2,3,Employed_Missing,9.0,1.0
4,8.966272,10,0,5,0,68,7,2,6,Unemployed,...,Unemployed,Unemployed,Unemployed,0.0,Unemployed,Unemployed,Unemployed,Unemployed,10.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50038,11.154438,100077,0,8,1,29,5,6,6,Unemployed,...,Unemployed,Unemployed,Unemployed,0.0,Unemployed,Unemployed,Unemployed,Unemployed,10.0,7.0
50039,9.962345,100078,0,4,0,34,4,1,9,EMP1-6,...,CDI,P,O,665.0,11,9,4,Employed_Missing,8.0,6.0
50040,12.226288,100079,0,5,0,29,6,1,8,EMP1-6,...,CDI,C,O,3000.0,11,9,5,Employed_Missing,8.0,4.0
50041,8.965529,100081,1,8,0,57,3,7,6,Unemployed,...,Unemployed,Unemployed,Unemployed,0.0,Unemployed,Unemployed,Unemployed,Unemployed,10.0,7.0


In [688]:
print(learn_data['Employer_category_encoded'].value_counts())
#added a value to retired_missing????

Employer_category_encoded
9.0     23958
10.0    12805
8.0      9265
2.0       866
7.0       817
1.0       754
3.0       639
6.0       486
5.0       316
4.0       137
Name: count, dtype: int64


In [694]:
#print(learn_data['Employer_category'].value_counts())


In [696]:
#comparison = learn_data[['Employer_category', 'Employer_category_num']].drop_duplicates()
#print(comparison)

In [702]:
def missing_values_table(df):
        mis_val = df.isnull().sum()
        mis_val_percent = 100 * df.isnull().sum() / len(df)  #% of missing values
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)  #create result table
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        print ("There are " + str(mis_val_table_ren_columns.shape[0]) +
              " columns that have missing values.")
        return mis_val_table_ren_columns

print(missing_values_table(learn_data)) 

There are 2 columns that have missing values.
               Missing Values  % of Total Values
Pay                      3633                7.3
WORKING_HOURS            3633                7.3


# Prediction start

In [705]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, root_mean_squared_error
from sklearn.model_selection import train_test_split, ShuffleSplit, KFold, cross_val_score, GridSearchCV


In [709]:
learn_data["sex"] = learn_data["sex"].astype('category')
learn_data["JOB_42"] = learn_data["JOB_42"].astype('category')
learn_data["Sports_Category"] = learn_data["Sports_Category"].astype('category')
learn_data["studying"] = learn_data["studying"].astype('category')
learn_data["act_num"] = learn_data["act_num"].astype('category')
learn_data["HOUSEHOLD_TYPE"] = learn_data["HOUSEHOLD_TYPE"].astype('category')
learn_data["act_num"] = learn_data["act_num"].astype('category')
learn_data["Employer_category_encoded"] = learn_data["Employer_category_encoded"].astype('category')
learn_data["employee_count_encoded"] = learn_data["employee_count_encoded"].astype('category')


In [711]:
learn_data.dtypes

target                        float64
PRIMARY_KEY                     int64
sex                          category
JOB_42                       category
studying                     category
age_2020                        int64
HOUSEHOLD_TYPE               category
act_num                      category
HIGHEST_CREDENTIAL_num          int64
emp_contract                 category
Pay                           float64
REG_FORMER                   category
Sports_Category              category
TYPE_OF_CONTRACT             category
WORK_CONDITION               category
labor_force_status           category
WORKING_HOURS                 float64
REG_JOB                      category
Economic_sector_num          category
work_description             category
retirement_age_cat           category
Employer_category_encoded    category
employee_count_encoded       category
dtype: object

In [713]:
learn = learn_data
learn = pd.get_dummies(ML)