In [94]:
import os
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import LabelEncoder

In [95]:
# Specify the directory containing the CSV files
files_folder = "files"

# Iterate over all files in the folder
for file_name in os.listdir(files_folder):
    if file_name.endswith('.csv'):  # Check if the file is a CSV
        file_path = os.path.join(files_folder, file_name)
        # Create a variable with the name of the CSV file (minus extension)
        df_name = os.path.splitext(file_name)[0]
        globals()[df_name] = pd.read_csv(file_path)


## CHECKING DATA TYPES

In [96]:
dataframes_list = [var_name for var_name in globals() if isinstance(globals()[var_name], pd.DataFrame)]

for df_name in dataframes_list:
    df = globals()[df_name]
    print(f"DataFrame name: {df_name}")
    print(df.dtypes)
    print("-" * 40) 

DataFrame name: _
PRIMARY_KEY                 int64
Economic_sector            object
FORMER_DEP                 object
JOB_DEP                    object
work_description           object
TYPE_OF_CONTRACT           object
labor_force_status         object
WORK_CONDITION             object
Employer_category          object
WORKING_HOURS             float64
employee_count             object
employee_count_encoded     object
dtype: object
----------------------------------------
DataFrame name: city_adm
Nom de la commune    object
INSEE_CODE           object
municipality_type    object
DEP                  object
dtype: object
----------------------------------------
DataFrame name: city_loc
X             float64
Y             float64
INSEE_CODE     object
Lat           float64
long          float64
dtype: object
----------------------------------------
DataFrame name: city_pop
INSEE_CODE    object
RESIDENTS      int64
dtype: object
----------------------------------------
DataFrame name:

## CHECKING MISSING VALUES

METHOD 1

In [97]:
def missing_values_table(df):
        mis_val = df.isnull().sum()
        mis_val_percent = 100 * df.isnull().sum() / len(df)  #% of missing values
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)  #create result table
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        print ("There are " + str(mis_val_table_ren_columns.shape[0]) +
              " columns that have missing values.")
        return mis_val_table_ren_columns

In [98]:
dataframe = [var_name for var_name in globals() if isinstance(globals()[var_name], pd.DataFrame)]

for df_name in dataframe:
    df = globals()[df_name]
    mis_val = df.isnull().sum()
    if mis_val.sum() > 0: 
        print(f"Missing values in {df_name}:")
        print(missing_values_table(df))  # Pass the actual DataFrame to the function
        print("-" * 40)  #

Missing values in _:
There are 5 columns that have missing values.
                   Missing Values  % of Total Values
Employer_category             756                6.7
employee_count                670                6.0
FORMER_DEP                    358                3.2
JOB_DEP                       343                3.1
WORKING_HOURS                   9                0.1
----------------------------------------
Missing values in learn_dataset_job:
There are 4 columns that have missing values.
                   Missing Values  % of Total Values
Employer_category             622                3.3
employee_count                520                2.7
JOB_DEP                        22                0.1
WORKING_HOURS                   9                0.0
----------------------------------------
Missing values in learn_dataset_retired_jobs:
There are 5 columns that have missing values.
                   Missing Values  % of Total Values
Employer_category             756       

METHOD 2

In [99]:
dataframe = [var_name for var_name in globals() if isinstance(globals()[var_name], pd.DataFrame)]

for df_name in dataframe:
    df = globals()[df_name]  
    missing_counts = df.isnull().sum()
    if missing_counts.sum() > 0:                               #Check for NA values
        print(f"Missing values in {df_name}:")
        print(missing_counts[missing_counts > 0])                #Print only df with NA values
        print("-" * 40) 
#NA values in : learn_dataset_job, learn_dataset_retired_jobs, test_dataset_job, test_dataset_retired_jobs

Missing values in _:
FORMER_DEP           358
JOB_DEP              343
Employer_category    756
WORKING_HOURS          9
employee_count       670
dtype: int64
----------------------------------------
Missing values in learn_dataset_job:
Employer_category    622
employee_count       520
WORKING_HOURS          9
JOB_DEP               22
dtype: int64
----------------------------------------
Missing values in learn_dataset_retired_jobs:
FORMER_DEP           358
JOB_DEP              343
Employer_category    756
WORKING_HOURS          9
employee_count       670
dtype: int64
----------------------------------------
Missing values in test_dataset_job:
Employer_category    636
employee_count       533
WORKING_HOURS          8
JOB_DEP               23
dtype: int64
----------------------------------------
Missing values in test_dataset_retired_jobs:
FORMER_DEP           393
JOB_DEP              386
Employer_category    763
WORKING_HOURS         10
employee_count       686
dtype: int64
-----------

## HANDLING MISSING VALUES

CATEGORICAL VARIABLES

In [100]:
le = preprocessing.LabelEncoder()
le.fit(learn_dataset_retired_jobs.loc[:,"employee_count"])
dict(zip(le.classes_, le.transform(le.classes_)))

{'tr_0': 0,
 'tr_1': 1,
 'tr_2': 2,
 'tr_3': 3,
 'tr_4': 4,
 'tr_5': 5,
 'tr_6': 6,
 nan: 7}

In [101]:
learn_dataset_retired_jobs['employee_count_encoded'] = le.transform(learn_dataset_retired_jobs['employee_count'])
learn_dataset_retired_jobs['employee_count_encoded'] = learn_dataset_retired_jobs['employee_count_encoded'].map(lambda x: np.nan if x==7 else x)
df_train = learn_dataset_retired_jobs.loc[:,["PRIMARY_KEY", "employee_count_encoded","WORKING_HOURS"]]

In [102]:
#The issue isthat KNN only works for numerical columns but no value to do K-NN with just those 3 variables, ig we need to encode the rest of colums
df_train = learn_dataset_retired_jobs.loc[:,["PRIMARY_KEY", "employee_count_encoded","WORKING_HOURS"]]
knn_imputer = KNNImputer(n_neighbors=5, weights="uniform")
knn_imputer.fit(df_train)
#learn_dataset_retired_jobs[f'{column}_encoded'] = knn_imputer.fit_transform(learn_dataset_retired_jobs[[f'{column}_encoded']])

In [103]:
df_imputed = knn_imputer.transform(df_train)

In [104]:
learn_dataset_retired_jobs.loc[:,"employee_count_encoded"] = df_imputed[:,1].round()

In [105]:
count_imputed = list(le.inverse_transform(learn_dataset_retired_jobs['employee_count_encoded'].round().astype('int')))
learn_dataset_retired_jobs['employee_count_encoded'] = count_imputed
learn_dataset_retired_jobs

Unnamed: 0,PRIMARY_KEY,Economic_sector,FORMER_DEP,JOB_DEP,work_description,TYPE_OF_CONTRACT,labor_force_status,WORK_CONDITION,Employer_category,WORKING_HOURS,employee_count,employee_count_encoded
0,5,GZ,67,67,634a,CDI,O,C,ct_9,793.0,tr_1,tr_1
1,19,FZ,93,78,621b,CDI,O,C,ct_9,1750.0,tr_6,tr_6
2,21,CM,77,89,431f,CDI,O,C,ct_9,1872.0,tr_6,tr_6
3,25,MB,64,64,387b,CDI,O,P,ct_9,781.0,tr_6,tr_6
4,38,QB,60,60,525d,CDI,O,C,ct_9,1785.0,tr_4,tr_4
...,...,...,...,...,...,...,...,...,...,...,...,...
11235,100044,IZ,2B,2B,561e,CDI,O,C,,211.0,,tr_3
11236,100048,QA,95,95,542a,CDI,O,C,ct_9,1820.0,tr_6,tr_6
11237,100053,TZ,13,13,563c,CDI,O,N,ct_7,1931.0,tr_1,tr_1
11238,100067,GZ,69,69,643a,CDI,O,C,ct_9,1586.0,tr_2,tr_2


NUMERICAL VARIABLES

In [106]:
# Imputation for rows with only 1 NA in 'WORKING_HOURS'
df_w_na = ['learn_dataset_job', 'learn_dataset_retired_jobs', 'test_dataset_job', 'test_dataset_retired_jobs']

for df_w_na_name in df_w_na:
    df = globals()[df_w_na_name]
    if 'WORKING_HOURS' in df.columns:
        rows_with_one_na_in_working_hours = df[df['WORKING_HOURS'].isnull() & df.isnull().sum(axis=1) == 1]
        if not rows_with_one_na_in_working_hours.empty: 
            mean_imputer = SimpleImputer(strategy='mean')
            # Perform the imputation and assign the result to the corresponding rows and column
            imputed_values = mean_imputer.fit_transform(rows_with_one_na_in_working_hours[['WORKING_HOURS']])
            # Assign the imputed values back to the DataFrame
            df.loc[rows_with_one_na_in_working_hours.index, 'WORKING_HOURS'] = imputed_values.flatten()  # Flatten the 2D array
            print(f"Imputed missing values for 'WORKING_HOURS' in {df_w_na_name}")



ValueError: Must have equal len keys and value when setting with an iterable

In [None]:
#imputation with mean for rows with only 1 NA values (the NA values being for 'WORKING_HOURS')
df_w_na = ['learn_dataset_job', 'learn_dataset_retired_jobs', 'test_dataset_job', 'test_dataset_retired_jobs']

for df_w_na_name in df_w_na:
    df = globals()[df_w_na_name]
    if 'WORKING_HOURS' in df.columns:
        rows_with_one_na_in_working_hours = df[df['WORKING_HOURS'].isnull() & df.isnull().sum(axis=1) == 1]
        if not rows_with_one_na_in_working_hours.empty: 
            mean_imputer = SimpleImputer(strategy='mean')
            df.loc[rows_with_one_na_in_working_hours.index, 'WORKING_HOURS'] = mean_imputer.fit_transform(
                rows_with_one_na_in_working_hours[['WORKING_HOURS']])
    print(f"Imputed missing values for 'WORKING_HOURS' in {df_w_na_name}")