In [None]:
import os
import sys
import pandas as pd
from datetime import datetime, timedelta

In [None]:
# Put the parent directory on sys.path so that the `configs` package can be
# imported (presumably the notebook lives one level below the project root —
# TODO confirm).
sys.path.append(os.path.abspath('..'))
# Star import: the DIR_* / FILE_* names used further down are not defined in
# this notebook, so they are presumably provided by this module.
from configs.config import *
# from src.util import Logger, Util

In [None]:
# Reload configs.config and repeat the star import so that edits made to the
# config module are picked up without restarting the kernel.
import importlib
import configs.config
importlib.reload(configs.config)
from configs.config import *

# データ読み込み

In [None]:
# Read every raw input CSV from DIR_INPUT into its own DataFrame.
# DIR_INPUT and the FILE_* constants are not defined in this notebook; they are
# presumably supplied by the `from configs.config import *` star import.
df_train = pd.read_csv(os.path.join(DIR_INPUT, FILE_NAME_TRAIN))
df_test = pd.read_csv(os.path.join(DIR_INPUT, FILE_NAME_TEST))
df_sample_submission = pd.read_csv(os.path.join(DIR_INPUT, FILE_SAMPLE_SUBMISSION))
df_udemy_activity = pd.read_csv(os.path.join(DIR_INPUT, FILE_NAME_UDEMY_ACTIVITY))
df_career = pd.read_csv(os.path.join(DIR_INPUT, FILE_NAME_CAREER))
df_dx = pd.read_csv(os.path.join(DIR_INPUT, FILE_NAME_DX))
df_hr = pd.read_csv(os.path.join(DIR_INPUT, FILE_NAME_HR))
# NOTE(review): "ORVER" looks like a typo for "OVER"; the constant name must
# match whatever the config module defines, so it is left untouched here.
df_overtime_work_by_month = pd.read_csv(os.path.join(DIR_INPUT, FILE_NAME_ORVER_TIME))
df_position_history = pd.read_csv(os.path.join(DIR_INPUT, FILE_NAME_POSITION_HISTORY))

# 前処理

In [None]:
def clean_feature_names(data):
    """Replace every non-word character in the column names with an underscore.

    The frame is modified in place (its ``columns`` attribute is reassigned)
    and the very same object is returned, so the call can be used either for
    its side effect or as an expression.

    Args:
        data: DataFrame whose column labels are strings.

    Returns:
        The same DataFrame object, with sanitized column names.
    """
    sanitized_names = data.columns.str.replace(r'[^\w]', '_', regex=True)
    data.columns = sanitized_names
    return data

In [None]:
# train: sanitize the column names on a copy so the raw frame stays untouched
df_prep_train = clean_feature_names(df_train.copy())

In [None]:
# test: sanitize the column names on a copy so the raw frame stays untouched
df_prep_test = clean_feature_names(df_test.copy())

In [None]:
# udemy activity: sanitize the column names on a copy of the raw frame
df_prep_udemy_activity = clean_feature_names(df_udemy_activity.copy())

# Parse the start / end timestamp columns into datetime in one pass; extra
# keyword arguments given to DataFrame.apply are forwarded to pd.to_datetime.
df_prep_udemy_activity[["開始日", "終了日"]] = df_prep_udemy_activity[["開始日", "終了日"]].apply(
    pd.to_datetime, format='%Y/%m/%d %H:%M'
)

# Cast the completion flag to an integer column
df_prep_udemy_activity["マーク済み修了"] = (
    df_prep_udemy_activity["マーク済み修了"].astype(int)
)

In [None]:
# career
df_prep_career = df_career.copy()
# Strip newlines and spaces from the column names first; otherwise the
# sanitizer below would turn each of them into an underscore.
df_prep_career.columns = df_prep_career.columns.str.replace('\n', '', regex=True)
df_prep_career.columns = df_prep_career.columns.str.replace(' ', '', regex=True)
# Sanitize the remaining non-word characters in the column names
df_prep_career = clean_feature_names(df_prep_career)

# Convert every column after the first to the integer each answer starts with
# (assumes column 0 is an identifier and the rest are answer strings — TODO
# confirm). The whole leading run of digits is extracted rather than only the
# first character, so an answer starting with "10" becomes 10, not 1.
# A value without a leading digit still fails loudly: it extracts as NaN and
# int(NaN) raises ValueError.
target_cols = df_prep_career.columns[1:]
for col in target_cols:
    df_prep_career[col] = df_prep_career[col].str.extract(r'^(\d+)', expand=False).map(int)

In [None]:
# dx: sanitize the column names on a copy so the raw frame stays untouched
df_prep_dx = clean_feature_names(df_dx.copy())

# Parse the training date column into datetime
df_prep_dx['研修実施日'] = pd.to_datetime(
    df_prep_dx["研修実施日"],
    format='%Y-%m-%d %H:%M:%S',
)

In [None]:
# hr: sanitize the column names on a copy so the raw frame stays untouched
df_prep_hr = clean_feature_names(df_hr.copy())

# datetime型に変換
def extract_start_date(x):
    """Parse the start date out of a raw date string.

    Recognised shapes, checked in this order:
      * contains both '-' and '/': the text before the first '-' is parsed
        as '%Y/%m/%d'
      * contains '-' only: the whole string is parsed as '%Y-%m-%d %H:%M:%S'
      * contains ',': the text before the first ',' is parsed as '%Y/%m/%d'

    Args:
        x: Raw date string.

    Returns:
        A ``datetime`` for a recognised shape; otherwise a message is printed
        and ``x`` itself is returned unchanged.

    Raises:
        ValueError: If the selected part does not match its expected format.
    """
    has_hyphen = '-' in x

    if has_hyphen and ('/' in x):
        first_part = x.split('-')[0]
        return datetime.strptime(first_part, '%Y/%m/%d')

    if has_hyphen:
        return datetime.strptime(x, '%Y-%m-%d %H:%M:%S')

    if ',' in x:
        first_part = x.split(',')[0]
        return datetime.strptime(first_part, '%Y/%m/%d')

    print(f"Unexpected format: {x}")
    return x

def extract_end_date(x, fallback_days=10):
    """Parse the end date out of a raw date string.

    Recognised shapes, checked in this order:
      * contains both '-' and '/': the text after the last '-' is parsed
        as '%Y/%m/%d'
      * contains '-' only: the whole string is parsed as '%Y-%m-%d %H:%M:%S'
      * contains ',': the text after the last ',' is parsed as '%Y/%m/%d'

    When the selected part cannot be parsed, the end date falls back to the
    start date (via ``extract_start_date``) plus ``fallback_days`` days, and a
    message is printed.

    Args:
        x: Raw date string.
        fallback_days: Number of days added to the start date when the end
            part cannot be parsed. Defaults to 10.

    Returns:
        A ``datetime`` for a recognised shape (or the fallback); when no shape
        matches, a message is printed and ``x`` itself is returned unchanged.
    """
    try:
        if ('-' in x) and ('/' in x):
            result = datetime.strptime(x.split('-')[-1], '%Y/%m/%d')

        elif '-' in x:
            result = datetime.strptime(x, '%Y-%m-%d %H:%M:%S')

        elif ',' in x:
            result = datetime.strptime(x.split(',')[-1], '%Y/%m/%d')

        else:
            print(f"Unexpected format: {x}")
            result = x
    except ValueError:
        # Only a strptime mismatch is recoverable here. A bare `except:` would
        # also swallow KeyboardInterrupt / SystemExit, and any other error
        # (e.g. a non-string input) would be raised again by
        # extract_start_date anyway.
        result = extract_start_date(x) + timedelta(days=fallback_days)
        print(f"Error parsing date: {x}, using fallback: {result}")

    return result


# Derive the start / end columns from the raw '実施日' string of every row.
# Both helpers return the raw string unchanged for an unrecognised format, so
# these columns can end up holding a mix of datetime and str values.
df_prep_hr['実施開始日'] = df_prep_hr['実施日'].apply(extract_start_date)
df_prep_hr['実施終了日'] = df_prep_hr['実施日'].apply(extract_end_date)

In [None]:
# overtime_work_by_month: sanitize the column names on a copy of the raw frame
df_prep_overtime_work_by_month = clean_feature_names(df_overtime_work_by_month.copy())

# Parse the 'date' column into datetime
df_prep_overtime_work_by_month['date'] = pd.to_datetime(
    df_prep_overtime_work_by_month['date'],
    format='%Y-%m-%d',
)

In [None]:
# position_history: sanitize the column names on a copy of the raw frame
df_prep_position_history = clean_feature_names(df_position_history.copy())

# データ出力

In [None]:
# Write every preprocessed frame to DIR_INTERIM as a pickle.
# to_pickle does not create missing directories, so make sure the output
# directory exists first; otherwise a fresh checkout fails on the first write.
os.makedirs(DIR_INTERIM, exist_ok=True)
df_prep_train.to_pickle(os.path.join(DIR_INTERIM, "df_prep_train.pkl"))
df_prep_test.to_pickle(os.path.join(DIR_INTERIM, "df_prep_test.pkl"))
df_prep_udemy_activity.to_pickle(os.path.join(DIR_INTERIM, "df_prep_udemy_activity.pkl"))
df_prep_career.to_pickle(os.path.join(DIR_INTERIM, "df_prep_career.pkl"))
df_prep_dx.to_pickle(os.path.join(DIR_INTERIM, "df_prep_dx.pkl"))
df_prep_hr.to_pickle(os.path.join(DIR_INTERIM, "df_prep_hr.pkl"))
df_prep_overtime_work_by_month.to_pickle(os.path.join(DIR_INTERIM, "df_prep_overtime_work_by_month.pkl"))
df_prep_position_history.to_pickle(os.path.join(DIR_INTERIM, "df_prep_position_history.pkl"))
