In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, fbeta_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, RobustScaler
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error
!pip install optuna
import optuna

### Settings
# Lift pandas' display limits so that every column and every row is printed.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

### Load the data set
train_transaction = pd.read_csv(r"/content/train_transaction.csv")
train_identity = pd.read_csv(r"/content/train_identity.csv")
# Left join on the transaction key: every transaction row is kept.
train_left = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')

### Missing-value ratio per column, in percent
# Bare expression: its value is only rendered when it ends a notebook cell.
train_left.isna().sum().div(len(train_left)).mul(100)

### Drop columns with too many missing values
threshold = 40  # percent of missing rows above which a column is dropped
missing_percentage = train_left.isna().sum().div(len(train_left)).mul(100)
# 'DeviceType' is exempt from the threshold and is always kept.
too_sparse = missing_percentage > threshold
not_device_type = missing_percentage.index != 'DeviceType'
columns_to_drop = missing_percentage[too_sparse & not_device_type].index
# Two separate drops, as before: first the sparse columns, then 'P_emaildomain'.
train_left = train_left.drop(columns=columns_to_drop).drop(columns=['P_emaildomain'])

### Değişken türlerini belirleme
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """
    Split the column names of a dataframe into categorical, numeric and cardinal groups.

    A column is "categorical-like" when its dtype is object, a pandas string
    dtype or a pandas Categorical; every other dtype is "numeric-like".
    Numeric-like columns with few distinct values are reported as categorical,
    and categorical-like columns with many distinct values are reported as
    cardinal. A summary of the group sizes is printed.

    Parameters
    ------
        dataframe: DataFrame
            Dataframe whose column names are classified
        cat_th: int, optional
            A numeric-like column with fewer than ``cat_th`` distinct values
            is treated as categorical
        car_th: int, optional
            A categorical-like column with more than ``car_th`` distinct
            values is treated as cardinal

    Returns
    ------
        cat_cols: list
            Categorical columns: categorical-like columns that are not
            cardinal, followed by the low-cardinality numeric-like columns
        num_cols: list
            Numeric-like columns with at least ``cat_th`` distinct values
        cat_but_car: list
            Categorical-like columns with more than ``car_th`` distinct values
    """

    def _is_categorical_like(series):
        # Comparing the dtype with "O" alone misses `category` and pandas
        # string dtypes, which would then be reported (and scaled) as numeric.
        dtype = series.dtype
        return (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )

    categorical_like = []
    num_but_cat = []
    cat_but_car = []
    num_cols = []

    # Single pass: nunique() is the expensive call, so do it once per column.
    for col in dataframe.columns:
        series = dataframe[col]
        n_unique = series.nunique()
        if _is_categorical_like(series):
            if n_unique > car_th:
                cat_but_car.append(col)
            else:
                categorical_like.append(col)
        elif n_unique < cat_th:
            num_but_cat.append(col)
        else:
            num_cols.append(col)

    # Same ordering as before: categorical-like columns first, then the
    # numeric-looking categoricals.
    cat_cols = categorical_like + num_but_cat

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')

    return cat_cols, num_cols, cat_but_car

# Classify the columns once. The helper prints its own summary, so a second
# call would only duplicate that output and repeat the per-column scan.
cat_cols, num_cols, cat_but_car = grab_col_names(train_left)

### Unique values of every variable
# Printed in the same order as before: categorical, cardinal, then numeric columns.
for col in [*cat_cols, *cat_but_car, *num_cols]:
    print(f"{col} değişkeninin benzersiz değerleri: {train_left[col].unique()}")

### Fill missing values and one-hot encode
# Missing categories become an explicit 'unknown' level, which then gets its
# own dummy column in get_dummies.
unknown_fill_cols = ['card4', 'card6', 'M6', 'DeviceType']
train_left[unknown_fill_cols] = train_left[unknown_fill_cols].fillna('unknown')
train_left = pd.get_dummies(train_left, columns=['ProductCD', 'card4', 'card6', 'DeviceType', 'M6'])

# Median-impute every numeric column; fillna aligns the medians by column name.
numeric_columns = train_left.select_dtypes(include=[np.number]).columns
train_left[numeric_columns] = train_left[numeric_columns].fillna(train_left[numeric_columns].median())

# Standardise the continuous features only. num_cols also contains the row key
# 'TransactionID'; scaling it would overwrite the identifier with z-scores, so
# the key (and the target, should it ever land in num_cols) is excluded.
scale_cols = [col for col in num_cols if col not in ('TransactionID', 'isFraud')]
scaler = StandardScaler()
if scale_cols:
    train_left[scale_cols] = scaler.fit_transform(train_left[scale_cols])
# NOTE(review): the medians and the scaler are fitted on all rows before the
# train/test split further down, so hold-out rows influence the preprocessing
# statistics. Fitting them on the training partition only would remove that leak.

### Build the test set
test_identity = pd.read_csv(r"/content/test_identity.csv")
test_transaction = pd.read_csv(r"/content/test_transaction.csv")
test_left = test_transaction.merge(test_identity, on='TransactionID', how='left')

# Encode the categoricals exactly like the training frame BEFORE comparing
# columns. train_left is already one-hot encoded here, so comparing it with the
# raw test columns would classify 'ProductCD', 'card4', ... as "extra" and drop
# every categorical feature from the test set.
test_unknown_fill_cols = ['card4', 'card6', 'M6', 'DeviceType']
test_left[test_unknown_fill_cols] = test_left[test_unknown_fill_cols].fillna('unknown')
test_left = pd.get_dummies(test_left, columns=['ProductCD', 'card4', 'card6', 'DeviceType', 'M6'])

train_columns = set(train_left.columns)
test_columns = set(test_left.columns)

# Columns that exist in the test set but not in the training set
extra_columns = test_columns - train_columns
# Training features that the test set lacks (the target is expected to be absent).
missing_columns = train_columns - test_columns - {'isFraud'}
if missing_columns:
    # NOTE(review): a raw test column that is merely named differently from its
    # train counterpart would also show up here - check the list against the data.
    print(f"Test set lacks {len(missing_columns)} training columns, filled with 0: {sorted(missing_columns)}")

# Keep exactly the training feature columns, in training order. reindex drops
# the extra columns and creates the missing ones (e.g. unseen dummy levels) as 0.
feature_columns = [col for col in train_left.columns if col != 'isFraud']
test_left = test_left.reindex(columns=feature_columns, fill_value=0)

# Standardise with the scaler fitted on train (never refit on test). The scaler
# must receive the same columns it was fitted on; NaNs pass through transform.
scaler_inputs = list(getattr(scaler, 'feature_names_in_', num_cols))
if scaler_inputs and hasattr(scaler, 'mean_'):
    scaled_values = pd.DataFrame(
        scaler.transform(test_left[scaler_inputs]),
        columns=scaler_inputs,
        index=test_left.index,
    )
    # The row key stays raw so predictions can still be matched to transactions.
    keep_scaled = [col for col in scaler_inputs if col != 'TransactionID']
    test_left[keep_scaled] = scaled_values[keep_scaled]

# Impute with the TRAINING medians. train_left is already imputed and scaled;
# neither step moves a column's median relative to its values, so the medians
# of train_left are the correct fill values for the scaled test frame.
train_numeric_columns = train_left.select_dtypes(include=[np.number]).columns
test_numeric_columns = [
    col for col in test_left.select_dtypes(include=[np.number]).columns
    if col in train_numeric_columns and col != 'TransactionID'
]
test_left[test_numeric_columns] = test_left[test_numeric_columns].fillna(
    train_left[test_numeric_columns].median()
)

### Build the model
y = train_left["isFraud"]
X = train_left.drop(["TransactionID", "isFraud"], axis=1)

# Stratify on the target so both partitions keep the same class ratio.
# (Fraud labels are presumably rare; an unstratified split lets the positive
# rate drift between train and test.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=17, stratify=y
)

from sklearn.ensemble import RandomForestClassifier

# Random-forest baseline with default hyper-parameters.
rf_model = RandomForestClassifier(random_state=46).fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))
# Accuracy alone says little when one class dominates, so the positive-class
# metrics are reported as well. zero_division=0 gives 0 rather than a warning
# when no positives are predicted.
print("Precision: ", precision_score(y_test, y_pred, zero_division=0))
print("Recall: ", recall_score(y_test, y_pred, zero_division=0))
print("F2: ", fbeta_score(y_test, y_pred, beta=2, zero_division=0))

### XGBoost ile modelleme
def fit_xgbm(X_train, y_train, X_test, y_test, n_trials=12, verbose=False,
             early_stopping_rounds=200, random_state=42):
    """
    Tune an XGBoost classifier with Optuna, then fit and evaluate the best configuration.

    Workflow:
      1. Each Optuna trial is scored by 5-fold cross-validated accuracy on the
         training data.
      2. The best parameters are cross-validated once more on the training data
         to obtain the ``train_*`` metrics (out-of-fold accuracy, precision,
         recall and F2).
      3. The final model is fitted on 80% of the training data and early-stopped
         on the remaining 20%. The test set is never used for fitting or early
         stopping.
      4. The ``test_*`` metrics are computed from the final model's predictions
         on ``X_test``.

    Parameters
    ------
        X_train, y_train: DataFrame / Series (or numpy arrays)
            Training features and binary labels
        X_test, y_test: DataFrame / Series (or numpy arrays)
            Hold-out features and binary labels, used for evaluation only
        n_trials: int, optional
            Number of Optuna trials
        verbose: bool or int, optional
            Forwarded to ``XGBClassifier.fit``. Off by default, because
            per-round logging of every fold of every trial produces more
            output than a notebook can keep.
        early_stopping_rounds: int or None, optional
            Early-stopping patience set on every model
        random_state: int, optional
            Seed for the fold shuffling, the Optuna sampler, the models and the
            final validation split

    Returns
    ------
        best_model: XGBClassifier
            Model fitted with the best parameters
        metrics: dict
            ``train_accuracy/precision/recall/f2`` (cross-validated on the
            training data) and ``test_accuracy/precision/recall/f2`` (final
            model on the test set)
        test_predictions: ndarray
            Predictions of ``best_model`` for ``X_test``
    """

    def _as_pandas(features, target):
        # The fold logic below indexes with .iloc, so wrap raw numpy inputs.
        if isinstance(features, np.ndarray):
            features = pd.DataFrame(features)
        if isinstance(target, np.ndarray):
            target = pd.Series(target)
        return features, target

    X_train, y_train = _as_pandas(X_train, y_train)
    X_test, y_test = _as_pandas(X_test, y_test)

    kfold = KFold(n_splits=5, shuffle=True, random_state=random_state)

    def _make_model(params):
        model = XGBClassifier(**params, random_state=random_state)
        model.set_params(early_stopping_rounds=early_stopping_rounds)
        return model

    def _score(y_true, y_hat):
        # zero_division=0 yields 0 rather than a warning when a fold has no
        # predicted (or no actual) positives.
        return (
            accuracy_score(y_true, y_hat),
            precision_score(y_true, y_hat, zero_division=0),
            recall_score(y_true, y_hat, zero_division=0),
            fbeta_score(y_true, y_hat, beta=2, zero_division=0),
        )

    def _cross_validate(params):
        """Return mean (accuracy, precision, recall, F2) over the training folds."""
        fold_scores = []
        for fit_index, val_index in kfold.split(X_train):
            X_fit_cv, X_val_cv = X_train.iloc[fit_index], X_train.iloc[val_index]
            y_fit_cv, y_val_cv = y_train.iloc[fit_index], y_train.iloc[val_index]
            model = _make_model(params)
            # The validation fold drives early stopping and is then scored, as before.
            model.fit(X_fit_cv, y_fit_cv, eval_set=[(X_val_cv, y_val_cv)], verbose=verbose)
            fold_scores.append(_score(y_val_cv, model.predict(X_val_cv)))
        return np.mean(fold_scores, axis=0)

    def objective(trial):
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 5000),
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
            "max_depth": trial.suggest_int("max_depth", 3, 18),
            "subsample": trial.suggest_float("subsample", 0.2, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
            "reg_alpha": trial.suggest_float("reg_alpha", 0, 10.0),
            "reg_lambda": trial.suggest_float("reg_lambda", 0.1, 10.0),
        }
        # NOTE(review): accuracy is kept as the tuning objective to match the
        # previous behaviour. If the positive class is rare, the F2 score
        # computed alongside it is probably the better target.
        return float(_cross_validate(params)[0])

    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=random_state),
    )
    study.optimize(objective, n_trials=n_trials)

    best_params = study.best_params
    print("Best Parameters for XGBoost: ", best_params)

    # Out-of-fold metrics of the best configuration on the training data.
    train_accuracy, train_precision, train_recall, train_f2 = _cross_validate(best_params)

    # Final model: early stopping needs a validation set, and it must not be the
    # test set (that would tune the number of trees on the data used for the
    # reported score). Carve the validation rows out of the training data.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=random_state
    )
    best_model = _make_model(best_params)
    best_model.fit(X_fit, y_fit, eval_set=[(X_val, y_val)], verbose=verbose)

    # Test metrics come from the trained model's predictions on unseen rows,
    # not from re-training on folds of the test set.
    test_predictions = best_model.predict(X_test)
    test_accuracy, test_precision, test_recall, test_f2 = _score(y_test, test_predictions)

    metrics = {
        'train_accuracy': float(train_accuracy),
        'train_precision': float(train_precision),
        'train_recall': float(train_recall),
        'train_f2': float(train_f2),
        'test_accuracy': float(test_accuracy),
        'test_precision': float(test_precision),
        'test_recall': float(test_recall),
        'test_f2': float(test_f2)
    }

    return best_model, metrics, test_predictions

# Tune and fit the XGBoost model, then unpack what fit_xgbm returns:
# the fitted model, the metrics dictionary and the test-set predictions.
xgb_results = fit_xgbm(X_train, y_train, X_test, y_test)
best_model, metrics, test_predictions = xgb_results

print("Model Metrics: ", metrics)


Observations: 44195
Variables: 183
cat_cols: 84
num_cols: 99
cat_but_car: 0
num_but_cat: 79
Observations: 44195
Variables: 183
cat_cols: 84
num_cols: 99
cat_but_car: 0
num_but_cat: 79
ProductCD değişkeninin benzersiz değerleri: ['W' 'H' 'C' 'S' 'R']
card4 değişkeninin benzersiz değerleri: ['discover' 'mastercard' 'visa' 'american express' nan]
card6 değişkeninin benzersiz değerleri: ['credit' 'debit' nan 'debit or credit' 'charge card']
M6 değişkeninin benzersiz değerleri: ['T' 'F' nan]
DeviceType değişkeninin benzersiz değerleri: [nan 'mobile' 'desktop']
isFraud değişkeninin benzersiz değerleri: [0 1]
C3 değişkeninin benzersiz değerleri: [ 0.  1.  8.  3.  2. 16.  4. nan]
V12 değişkeninin benzersiz değerleri: [ 1.  0. nan  2.  3.]
V13 değişkeninin benzersiz değerleri: [ 1.  0. nan  2.  3.  4.]
V14 değişkeninin benzersiz değerleri: [ 1. nan  0.]
V15 değişkeninin benzersiz değerleri: [ 0. nan  1.  2.  3.  4.  5.  6.  7.]
V19 değişkeninin benzersiz değerleri: [ 1. nan  0.  2.  3.  4.  5. 

[I 2024-08-05 20:36:28,786] A new study created in memory with name: no-name-6d206fa1-d443-4c03-8a4f-9e1e08fc7a07


V321 değişkeninin benzersiz değerleri: [0.00000000e+00 7.58874969e+01 7.74000000e+02 5.00000000e+01
 1.07949997e+02 2.00000000e+02 4.00000000e+01 1.39000000e+02
 5.90000000e+01 1.00000000e+02 1.40000000e+02 3.00000000e+02
 4.90000000e+01 5.00000000e+02 1.60500000e+02 8.39950012e+02
 1.50000000e+02 1.17900002e+02 3.00000000e+03 3.00000000e+01
 6.00000000e+01 1.13949997e+02 6.50000000e+02 8.09627991e+01
 4.29500008e+01 2.26000000e+02 8.75000000e+01 1.60118008e+01
 8.59000015e+01 4.50000000e+02 7.70000000e+01 7.59499969e+01
 4.49500008e+01 1.28850006e+02 3.35000000e+02 1.60000000e+02
 1.64850006e+02 1.37000000e+02 5.79500008e+01 1.17000000e+02
 1.49919998e+02 2.24949997e+02 1.97000000e+02 2.57000000e+02
 9.00000000e+02 1.37400000e+03 1.35000000e+03 1.82500000e+03
 1.94000000e+03 3.09950012e+02 4.59950012e+02 1.25500000e+02
 7.50000000e+02 2.40000000e+01 9.00000000e+01 2.50000000e+01
 2.06400000e+03 6.70000000e+02 9.70000000e+02 2.51400000e+03
 7.50000000e+01 3.15000000e+02 4.18779999e+02 

[I 2024-08-05 20:38:32,502] Trial 0 finished with value: 0.982447609721115 and parameters: {'n_estimators': 965, 'learning_rate': 0.246342127867564, 'max_depth': 14, 'subsample': 0.724915450717025, 'colsample_bytree': 0.3940558155772566, 'reg_alpha': 1.855392781366243, 'reg_lambda': 5.827927327752525}. Best is trial 0 with value: 0.982447609721115.


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[4908]	validation_0-logloss:0.08899
[4909]	validation_0-logloss:0.08898
[4910]	validation_0-logloss:0.08898
[4911]	validation_0-logloss:0.08897
[4912]	validation_0-logloss:0.08897
[4913]	validation_0-logloss:0.08897
[4914]	validation_0-logloss:0.08897
[4915]	validation_0-logloss:0.08897
[4916]	validation_0-logloss:0.08896
[4917]	validation_0-logloss:0.08896
[4918]	validation_0-logloss:0.08896
[4919]	validation_0-logloss:0.08895
[4920]	validation_0-logloss:0.08895
[4921]	validation_0-logloss:0.08895
[4922]	validation_0-logloss:0.08895
[4923]	validation_0-logloss:0.08894
[4924]	validation_0-logloss:0.08894
[4925]	validation_0-logloss:0.08894
[4926]	validation_0-logloss:0.08893
[4927]	validation_0-logloss:0.08893
[4928]	validation_0-logloss:0.08892
[4929]	validation_0-logloss:0.08892
[4930]	validation_0-logloss:0.08892
[4931]	validation_0-logloss:0.08892
[4932]	validation_0-logloss:0.08891
[4933]	validation_0-logloss:0.08891

[I 2024-08-05 21:16:27,486] Trial 1 finished with value: 0.9774372762122219 and parameters: {'n_estimators': 4954, 'learning_rate': 0.001659470808527906, 'max_depth': 16, 'subsample': 0.41373665492134626, 'colsample_bytree': 0.6744642116478846, 'reg_alpha': 7.853860189760976, 'reg_lambda': 9.53433878744025}. Best is trial 0 with value: 0.982447609721115.


[0]	validation_0-logloss:0.18932
[1]	validation_0-logloss:0.18520
[2]	validation_0-logloss:0.18103
[3]	validation_0-logloss:0.17701
[4]	validation_0-logloss:0.17325
[5]	validation_0-logloss:0.16964
[6]	validation_0-logloss:0.16612
[7]	validation_0-logloss:0.16260
[8]	validation_0-logloss:0.15943
[9]	validation_0-logloss:0.15634
[10]	validation_0-logloss:0.15332
[11]	validation_0-logloss:0.15042
[12]	validation_0-logloss:0.14787
[13]	validation_0-logloss:0.14526
[14]	validation_0-logloss:0.14277
[15]	validation_0-logloss:0.14037
[16]	validation_0-logloss:0.13798
[17]	validation_0-logloss:0.13579
[18]	validation_0-logloss:0.13365
[19]	validation_0-logloss:0.13145
[20]	validation_0-logloss:0.12948
[21]	validation_0-logloss:0.12765
[22]	validation_0-logloss:0.12577
[23]	validation_0-logloss:0.12402
[24]	validation_0-logloss:0.12236
[25]	validation_0-logloss:0.12071
[26]	validation_0-logloss:0.11903
[27]	validation_0-logloss:0.11758
[28]	validation_0-logloss:0.11605
[29]	validation_0-loglos

[I 2024-08-05 21:26:56,703] Trial 2 finished with value: 0.9825769392189494 and parameters: {'n_estimators': 3943, 'learning_rate': 0.03623022980602116, 'max_depth': 12, 'subsample': 0.6683212672420529, 'colsample_bytree': 0.4858968440895482, 'reg_alpha': 1.2597079317501036, 'reg_lambda': 4.784139122520956}. Best is trial 2 with value: 0.9825769392189494.


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[130]	validation_0-logloss:0.14536
[131]	validation_0-logloss:0.14510
[132]	validation_0-logloss:0.14482
[133]	validation_0-logloss:0.14455
[134]	validation_0-logloss:0.14429
[135]	validation_0-logloss:0.14403
[136]	validation_0-logloss:0.14377
[137]	validation_0-logloss:0.14351
[138]	validation_0-logloss:0.14326
[139]	validation_0-logloss:0.14299
[140]	validation_0-logloss:0.14272
[141]	validation_0-logloss:0.14247
[142]	validation_0-logloss:0.14222
[143]	validation_0-logloss:0.14195
[144]	validation_0-logloss:0.14170
[145]	validation_0-logloss:0.14145
[146]	validation_0-logloss:0.14120
[147]	validation_0-logloss:0.14095
[148]	validation_0-logloss:0.14071
[149]	validation_0-logloss:0.14046
[150]	validation_0-logloss:0.14021
[151]	validation_0-logloss:0.13997
[152]	validation_0-logloss:0.13973
[153]	validation_0-logloss:0.13949
[154]	validation_0-logloss:0.13926
[155]	validation_0-logloss:0.13903
[156]	validation_0-loglos

[I 2024-08-05 21:45:38,085] Trial 3 finished with value: 0.978213075584699 and parameters: {'n_estimators': 1710, 'learning_rate': 0.004398525811844021, 'max_depth': 11, 'subsample': 0.3893591097010711, 'colsample_bytree': 0.8627346632714383, 'reg_alpha': 6.329564186699267, 'reg_lambda': 5.654779004872801}. Best is trial 2 with value: 0.9825769392189494.


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[434]	validation_0-logloss:0.12981
[435]	validation_0-logloss:0.12974
[436]	validation_0-logloss:0.12967
[437]	validation_0-logloss:0.12960
[438]	validation_0-logloss:0.12953
[439]	validation_0-logloss:0.12946
[440]	validation_0-logloss:0.12939
[441]	validation_0-logloss:0.12932
[442]	validation_0-logloss:0.12925
[443]	validation_0-logloss:0.12918
[444]	validation_0-logloss:0.12910
[445]	validation_0-logloss:0.12904
[446]	validation_0-logloss:0.12897
[447]	validation_0-logloss:0.12890
[448]	validation_0-logloss:0.12883
[449]	validation_0-logloss:0.12875
[450]	validation_0-logloss:0.12869
[451]	validation_0-logloss:0.12862
[452]	validation_0-logloss:0.12855
[453]	validation_0-logloss:0.12847
[454]	validation_0-logloss:0.12841
[455]	validation_0-logloss:0.12834
[456]	validation_0-logloss:0.12827
[457]	validation_0-logloss:0.12821
[458]	validation_0-logloss:0.12815
[459]	validation_0-logloss:0.12808
[460]	validation_0-loglos

[I 2024-08-05 22:04:42,278] Trial 4 finished with value: 0.9754654780562995 and parameters: {'n_estimators': 2717, 'learning_rate': 0.0029787601049300937, 'max_depth': 3, 'subsample': 0.40781675474137935, 'colsample_bytree': 0.31430290597862154, 'reg_alpha': 6.176140067712053, 'reg_lambda': 1.7503184351416832}. Best is trial 2 with value: 0.9825769392189494.


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[2250]	validation_0-logloss:0.09083
[2251]	validation_0-logloss:0.09083
[2252]	validation_0-logloss:0.09082
[2253]	validation_0-logloss:0.09082
[2254]	validation_0-logloss:0.09081
[2255]	validation_0-logloss:0.09081
[2256]	validation_0-logloss:0.09080
[2257]	validation_0-logloss:0.09079
[2258]	validation_0-logloss:0.09078
[2259]	validation_0-logloss:0.09077
[2260]	validation_0-logloss:0.09076
[2261]	validation_0-logloss:0.09076
[2262]	validation_0-logloss:0.09075
[2263]	validation_0-logloss:0.09074
[2264]	validation_0-logloss:0.09073
[2265]	validation_0-logloss:0.09072
[2266]	validation_0-logloss:0.09071
[2267]	validation_0-logloss:0.09070
[2268]	validation_0-logloss:0.09069
[2269]	validation_0-logloss:0.09068
[2270]	validation_0-logloss:0.09068
[2271]	validation_0-logloss:0.09067
[2272]	validation_0-logloss:0.09066
[2273]	validation_0-logloss:0.09066
[2274]	validation_0-logloss:0.09065
[2275]	validation_0-logloss:0.09065

[I 2024-08-05 22:47:29,918] Trial 5 finished with value: 0.9793121542981307 and parameters: {'n_estimators': 3625, 'learning_rate': 0.0015800201666213048, 'max_depth': 16, 'subsample': 0.690170474498798, 'colsample_bytree': 0.21853298954985834, 'reg_alpha': 2.1844595302740855, 'reg_lambda': 2.52899037176916}. Best is trial 2 with value: 0.9825769392189494.


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[2568]	validation_0-logloss:0.06392
[2569]	validation_0-logloss:0.06391
[2570]	validation_0-logloss:0.06391
[2571]	validation_0-logloss:0.06391
[2572]	validation_0-logloss:0.06392
[2573]	validation_0-logloss:0.06392
[2574]	validation_0-logloss:0.06392
[2575]	validation_0-logloss:0.06392
[2576]	validation_0-logloss:0.06393
[2577]	validation_0-logloss:0.06393
[2578]	validation_0-logloss:0.06393
[2579]	validation_0-logloss:0.06392
[2580]	validation_0-logloss:0.06393
[2581]	validation_0-logloss:0.06392
[2582]	validation_0-logloss:0.06392
[2583]	validation_0-logloss:0.06392
[2584]	validation_0-logloss:0.06391
[2585]	validation_0-logloss:0.06391
[2586]	validation_0-logloss:0.06391
[2587]	validation_0-logloss:0.06391
[2588]	validation_0-logloss:0.06391
[2589]	validation_0-logloss:0.06391
[2590]	validation_0-logloss:0.06391
[2591]	validation_0-logloss:0.06390
[2592]	validation_0-logloss:0.06390
[2593]	validation_0-logloss:0.06390