In [1]:
import pandas as pd

# Load the cleaned train/test tables produced by the previous pipeline stage.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/processed/train_clean.csv",
        "../data/processed/test_clean.csv",
    )
)

In [2]:
# Example: label-encode the categorical (object-dtype) variables.
from sklearn.preprocessing import LabelEncoder

cat_cols = train.select_dtypes(include=['object']).columns
for col in cat_cols:
    le = LabelEncoder()
    # Values are cast to str first, so NaN becomes the literal label 'nan'.
    train[col] = le.fit_transform(train[col].astype(str))
    if col in test.columns:
        # LabelEncoder.transform raises ValueError for labels that were not
        # present when fitting. Map through the fitted classes instead and send
        # labels that only occur in test to the sentinel code -1, so the train
        # encoding stays exactly what fit_transform produced.
        code_by_label = {label: code for code, label in enumerate(le.classes_)}
        test[col] = (
            test[col].astype(str).map(code_by_label).fillna(-1).astype('int64')
        )

# Example: add a new feature (credit amount relative to total income).
# A zero income would turn the ratio into +/-inf; treat that denominator as
# missing so the ratio becomes NaN instead.
train['CREDIT_INCOME_RATIO'] = (
    train['AMT_CREDIT'] / train['AMT_INCOME_TOTAL'].replace(0, float('nan'))
)
test['CREDIT_INCOME_RATIO'] = (
    test['AMT_CREDIT'] / test['AMT_INCOME_TOTAL'].replace(0, float('nan'))
)

In [5]:
# Missing flag: for every train column that contains nulls, add a 0/1 indicator
# column named <col>_MISSING_FLAG to train, and to test when test has the column.
missing_flags_train = {}
missing_flags_test = {}
for col in train.columns:
    if train[col].isnull().sum() > 0:
        missing_flags_train[col + '_MISSING_FLAG'] = train[col].isnull().astype(int)
        if col in test.columns:
            missing_flags_test[col + '_MISSING_FLAG'] = test[col].isnull().astype(int)

train = pd.concat([train, pd.DataFrame(missing_flags_train)], axis=1)
if missing_flags_test:
    test = pd.concat([test, pd.DataFrame(missing_flags_test)], axis=1)

# Outlier flag: 0/1 indicator per numeric feature using the 1.5 * IQR rule.
# The bounds are computed on train only and reused for test.
num_cols = train.select_dtypes(include=['float64', 'int64']).columns.drop('TARGET')
# The *_MISSING_FLAG columns appended above are numeric too; leave them out so
# no "<col>_MISSING_FLAG_OUTLIER" columns are derived from generated indicators.
num_cols = num_cols[~num_cols.isin(list(missing_flags_train))]

outlier_flags_train = {}
outlier_flags_test = {}
for col in num_cols:
    Q1 = train[col].quantile(0.25)
    Q3 = train[col].quantile(0.75)
    IQR = Q3 - Q1
    # NOTE: when IQR is 0 both bounds equal Q1, so every value different from
    # Q1 is flagged.
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    outlier_flags_train[col + '_OUTLIER'] = ((train[col] < lower) | (train[col] > upper)).astype(int)
    if col in test.columns:
        outlier_flags_test[col + '_OUTLIER'] = ((test[col] < lower) | (test[col] > upper)).astype(int)

train = pd.concat([train, pd.DataFrame(outlier_flags_train)], axis=1)
if outlier_flags_test:
    test = pd.concat([test, pd.DataFrame(outlier_flags_test)], axis=1)

In [6]:
# Identify the categorical (object-dtype) columns.
cat_cols = train.select_dtypes(include=['object']).columns

# Remember whether test carries its own TARGET column before aligning, so a
# placeholder created by the alignment below can be removed again.
test_has_target = 'TARGET' in test.columns

# Give every categorical column the same category set in both frames, taken
# from train. Encoding the frames independently with drop_first=True can drop a
# different baseline category in each frame when their category sets differ,
# which silently corrupts the aligned test columns. With a shared dtype both
# frames produce identical dummy columns; test values not seen in train become
# NaN and therefore encode as all zeros.
for col in cat_cols:
    shared_dtype = train[col].astype('category').dtype
    train[col] = train[col].astype(shared_dtype)
    if col in test.columns:
        test[col] = test[col].astype(shared_dtype)

# One-hot encoding for train and test.
train = pd.get_dummies(train, columns=cat_cols, drop_first=True)
test = pd.get_dummies(
    test,
    columns=[col for col in cat_cols if col in test.columns],
    drop_first=True,
)

# Make sure train and test expose the same columns in the same order; columns
# missing from test are created and filled with 0.
train, test = train.align(test, join='left', axis=1, fill_value=0)

# The left join also creates an all-zero TARGET column in test when test had
# none; drop that placeholder so it is never mistaken for real labels.
if not test_has_target:
    test = test.drop(columns='TARGET', errors='ignore')

In [7]:
# Separate the label vector from the feature matrix.
y_train = train['TARGET']
X_train = train.drop(columns='TARGET')

# errors='ignore' keeps this from raising when test has no TARGET column.
X_test = test.drop(columns=['TARGET'], errors='ignore')

In [8]:
# Persist the feature-engineered tables, without the index column.
for frame, destination in (
    (train, "../data/processed/train_fe.csv"),
    (test, "../data/processed/test_fe.csv"),
):
    frame.to_csv(destination, index=False)