In [2]:
import os
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import LabelEncoder

In [3]:
# Specify the directory containing the CSV files
files_folder = "files"

# Load every CSV in the folder into a module-level DataFrame named after the
# file (minus extension), e.g. "files/learn_dataset_job.csv" becomes the
# variable `learn_dataset_job`. Later cells refer to the frames by these names.
# sorted(): os.listdir returns entries in arbitrary order; sorting makes the
# load order (and anything printed in that order) reproducible.
for file_name in sorted(os.listdir(files_folder)):
    if not file_name.endswith('.csv'):  # Only CSV files are loaded
        continue
    file_path = os.path.join(files_folder, file_name)
    df_name = os.path.splitext(file_name)[0]
    # A stem such as "my-data" cannot be used as a variable name, so the frame
    # would be unreachable from the rest of the notebook.
    if not df_name.isidentifier():
        print(f"Skipping {file_name}: '{df_name}' is not a valid variable name")
        continue
    # Never replace something that is not a DataFrame (e.g. a file called
    # "os.csv" or "pd.csv" would otherwise overwrite the imported module).
    # Re-running the cell may still refresh a previously loaded frame.
    if df_name in globals() and not isinstance(globals()[df_name], pd.DataFrame):
        print(f"Skipping {file_name}: '{df_name}' would overwrite an existing name")
        continue
    globals()[df_name] = pd.read_csv(file_path)


## CHECKING DATA TYPES

In [None]:
# Names of the DataFrames currently defined in the notebook namespace.
# Two kinds of names are skipped so that each frame is reported once:
#   * names starting with "_" (IPython's output cache: _, __, _12, ...);
#   * `df`, the loop alias assigned below -- without this filter, re-running
#     the cell lists the most recently visited frame again under the name "df".
dataframes_list = [
    var_name
    for var_name, obj in list(globals().items())  # snapshot of the namespace
    if isinstance(obj, pd.DataFrame)
    and not var_name.startswith('_')
    and var_name != 'df'
]

# Print the column dtypes of every frame, separated by a dashed rule.
for df_name in dataframes_list:
    df = globals()[df_name]
    print(f"DataFrame name: {df_name}")
    print(df.dtypes)
    print("-" * 40)

## CHECKING MISSING VALUES

### METHOD 1: summary table with the count and percentage of missing values per column

In [None]:
def missing_values_table(df):
    """Summarise the missing values of ``df`` column by column.

    Prints how many columns contain missing values, then returns the details.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that has at least one missing value,
        with the columns ``'Missing Values'`` (count) and
        ``'% of Total Values'`` (percentage of rows, rounded to one decimal),
        sorted by percentage in descending order. Empty when nothing is
        missing, including when ``df`` has no rows.
    """
    mis_val = df.isnull().sum()  # computed once, reused for the percentage
    n_rows = len(df)
    if n_rows:
        mis_val_percent = 100 * mis_val / n_rows
    else:
        # A frame without rows has nothing missing. Dividing would give
        # 0/0 = NaN, which must not be mistaken for "has missing values".
        mis_val_percent = mis_val.astype(float)

    mis_val_table = pd.concat(
        [mis_val.rename('Missing Values'),
         mis_val_percent.rename('% of Total Values')],
        axis=1)

    # Keep only columns that really have missing entries (filter on the count,
    # not on the percentage, so NaN/rounding can never affect the selection).
    mis_val_table_ren_columns = (
        mis_val_table[mis_val_table['Missing Values'] > 0]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )
    print("There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")
    return mis_val_table_ren_columns

In [None]:
# Names of the DataFrames currently defined in the notebook namespace.
# `df` (the loop alias assigned below and in other cells) and names starting
# with "_" (IPython's output cache) are skipped; otherwise the frame visited
# last by an earlier loop is reported a second time under the name "df".
dataframe = [
    var_name
    for var_name, obj in list(globals().items())  # snapshot of the namespace
    if isinstance(obj, pd.DataFrame)
    and not var_name.startswith('_')
    and var_name != 'df'
]

# For every frame that has at least one missing value, print its summary table.
for df_name in dataframe:
    df = globals()[df_name]
    mis_val = df.isnull().sum()
    if mis_val.sum() > 0:
        print(f"Missing values in {df_name}:")
        print(missing_values_table(df))  # Pass the actual DataFrame to the function
        print("-" * 40)

### METHOD 2: raw count of missing values per column

In [None]:
# Names of the DataFrames currently defined in the notebook namespace.
# `df` (the loop alias assigned below and in other cells) and names starting
# with "_" (IPython's output cache) are skipped; otherwise the frame visited
# last by an earlier loop is reported a second time under the name "df".
dataframe = [
    var_name
    for var_name, obj in list(globals().items())  # snapshot of the namespace
    if isinstance(obj, pd.DataFrame)
    and not var_name.startswith('_')
    and var_name != 'df'
]

for df_name in dataframe:
    df = globals()[df_name]
    missing_counts = df.isnull().sum()
    if missing_counts.sum() > 0:                      # Frame has at least one NA value
        print(f"Missing values in {df_name}:")
        print(missing_counts[missing_counts > 0])     # Only the columns that have NA values
        print("-" * 40)
# NA values found in: learn_dataset_job, learn_dataset_retired_jobs, test_dataset_job, test_dataset_retired_jobs

## HANDLING MISSING VALUES

### CATEGORICAL VARIABLES (`employee_count`)

In [None]:
# Fit a label encoder on the employee_count column of the retired-jobs
# training set, then show the label -> integer code mapping it learned.
le = preprocessing.LabelEncoder().fit(learn_dataset_retired_jobs["employee_count"])
{label: code for label, code in zip(le.classes_, le.transform(le.classes_))}

In [None]:
# Encode employee_count with the fitted LabelEncoder, but keep missing entries
# missing (NaN) so that they can be imputed afterwards.
# Missingness is taken from the source column itself instead of a hard-coded
# class code, so the cell stays correct if the number of categories changes.
# NOTE(review): the previous version blanked out code 7, presumably because
# NaN was the 8th (last) class in the printed mapping -- confirm.
employee_count_raw = learn_dataset_retired_jobs['employee_count']
learn_dataset_retired_jobs['employee_count_encoded'] = pd.Series(
    le.transform(employee_count_raw), index=employee_count_raw.index
).where(employee_count_raw.notna())

# Numeric columns handed to the imputer.
df_train = learn_dataset_retired_jobs.loc[:, ["PRIMARY_KEY", "employee_count_encoded", "WORKING_HOURS"]]

In [None]:
# KNNImputer only works on numerical columns, so for now the neighbours are
# found from these three columns alone. That is of limited value; the remaining
# columns probably need to be encoded as well so they can be used as features.
# NOTE(review): PRIMARY_KEY looks like a row identifier; if so it should not
# take part in the distance computation -- confirm before relying on the result.
df_train = learn_dataset_retired_jobs.loc[:, ["PRIMARY_KEY", "employee_count_encoded", "WORKING_HOURS"]]
knn_imputer = KNNImputer(n_neighbors=5, weights="uniform")

# Fitting alone changes nothing in the data: the imputed values have to be
# produced with (fit_)transform and stored. fit_transform returns a plain
# array, so it is wrapped back into a frame aligned with df_train.
df_train_imputed = pd.DataFrame(
    knn_imputer.fit_transform(df_train),
    columns=df_train.columns,
    index=df_train.index,
)

# Only the encoded category is written back here; it is decoded to labels in a
# later cell, which needs this column to be free of NaN.
learn_dataset_retired_jobs["employee_count_encoded"] = df_train_imputed["employee_count_encoded"]

In [None]:
# Repeats the imputer set-up of the previous cell: 5 uniformly weighted
# neighbours, fitted on df_train. This cell only fits; it does not transform
# the data or store any imputed value.
knn_imputer = KNNImputer(n_neighbors=5, weights="uniform").fit(df_train)
knn_imputer

In [None]:
# Another repeat of the same imputer set-up: 5 uniformly weighted neighbours,
# fitted on df_train. This cell only fits; it does not transform the data or
# store any imputed value.
knn_imputer = KNNImputer(n_neighbors=5, weights="uniform").fit(df_train)
knn_imputer

In [None]:
# Round the codes to the nearest integer and map them back to the original
# employee_count labels with the fitted encoder.
rounded_codes = learn_dataset_retired_jobs['employee_count_encoded'].round().astype('int')
count_imputed = list(le.inverse_transform(rounded_codes))

# Note: from here on 'employee_count_encoded' holds the decoded labels,
# not the integer codes.
learn_dataset_retired_jobs["employee_count_encoded"] = count_imputed
learn_dataset_retired_jobs

### NUMERICAL VARIABLES (`WORKING_HOURS`)

In [None]:
# Mean imputation of 'WORKING_HOURS' for rows whose ONLY missing value is 'WORKING_HOURS'
def impute_working_hours_mean(df, column='WORKING_HOURS'):
    """Fill ``column`` with its mean in rows where it is the only missing value.

    The mean is taken over the observed (non-missing) values of ``column`` in
    ``df``, which is the value ``SimpleImputer(strategy='mean')`` would learn
    from that column. Rows that have other missing values besides ``column``
    are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place.
    column : str, default 'WORKING_HOURS'
        Name of the numeric column to impute.

    Returns
    -------
    int
        Number of rows that were filled. 0 when ``column`` is absent, when no
        row qualifies, or when ``column`` has no observed value to average.
    """
    if column not in df.columns:
        return 0

    # The parentheses are essential: `&` binds tighter than `==`, so
    # `a & b == 1` is evaluated as `(a & b) == 1`, which does not mean
    # "exactly one missing value in the row".
    only_na_is_column = df[column].isnull() & (df.isnull().sum(axis=1) == 1)
    n_rows = int(only_na_is_column.sum())
    if n_rows == 0:
        return 0

    # The mean must come from the whole column. The selected rows are, by
    # construction, the ones where the value is missing, so fitting an imputer
    # on them alone leaves nothing to average.
    column_mean = df[column].mean()
    if pd.isna(column_mean):
        return 0

    # A boolean mask (rather than index labels) keeps the assignment exact even
    # if the index contains duplicate labels.
    df.loc[only_na_is_column, column] = column_mean
    return n_rows


df_w_na = ['learn_dataset_job', 'learn_dataset_retired_jobs', 'test_dataset_job', 'test_dataset_retired_jobs']

for df_w_na_name in df_w_na:
    df = globals().get(df_w_na_name)
    if not isinstance(df, pd.DataFrame):
        print(f"Skipping {df_w_na_name}: no DataFrame with that name is loaded")
        continue
    if impute_working_hours_mean(df) > 0:
        print(f"Imputed missing values for 'WORKING_HOURS' in {df_w_na_name}")

In [None]:
# Imputation with the mean for rows with only 1 NA value (that NA being 'WORKING_HOURS')
df_w_na = ['learn_dataset_job', 'learn_dataset_retired_jobs', 'test_dataset_job', 'test_dataset_retired_jobs']

for df_w_na_name in df_w_na:
    df = globals().get(df_w_na_name)
    if not isinstance(df, pd.DataFrame):
        print(f"Skipping {df_w_na_name}: no DataFrame with that name is loaded")
        continue
    if 'WORKING_HOURS' not in df.columns:
        continue

    # The parentheses are essential: `&` binds tighter than `==`, so
    # `a & b == 1` is evaluated as `(a & b) == 1`, which does not mean
    # "exactly one missing value in the row".
    only_working_hours_missing = df['WORKING_HOURS'].isnull() & (df.isnull().sum(axis=1) == 1)

    # Mean of the observed values of the whole column. The selected rows are
    # exactly the ones where WORKING_HOURS is missing, so they cannot provide
    # a mean themselves.
    working_hours_mean = df['WORKING_HOURS'].mean()
    if not only_working_hours_missing.any() or pd.isna(working_hours_mean):
        continue

    df.loc[only_working_hours_missing, 'WORKING_HOURS'] = working_hours_mean
    # Reported only when something was actually imputed.
    print(f"Imputed missing values for 'WORKING_HOURS' in {df_w_na_name}")