In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

def split_and_save_data(file_path):
    """Split a CSV file into train/test subsets and save them next to the source.

    The file is read with ``pd.read_csv`` and split 70% / 30% with
    ``train_test_split`` (``random_state=42``). Both subsets are written
    without the index to ``<stem>_train<ext>`` and ``<stem>_test<ext>``,
    where ``<stem>``/``<ext>`` come from splitting ``file_path`` at its
    extension.

    Args:
        file_path: Path of the CSV file to split (``str`` or path-like).

    Returns:
        tuple[str, str]: ``(train_path, test_path)`` of the files written.
    """
    import os  # local import: this cell does not import os at the top

    # Load the data
    df = pd.read_csv(file_path)

    # Split the data into 70% train and 30% test
    train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

    # Build the output names from the extension only. A plain
    # str.replace('.csv', ...) would rewrite every '.csv' occurrence in the
    # path (including directory names) and, when the path has no lowercase
    # '.csv' at all, would leave both names equal to the source path and
    # overwrite the input file.
    stem, ext = os.path.splitext(file_path)
    train_path = f"{stem}_train{ext}"
    test_path = f"{stem}_test{ext}"

    # Save train and test data in the same directory as the source
    train_df.to_csv(train_path, index=False)
    test_df.to_csv(test_path, index=False)

    print(f"Eğitim verisi kaydedildi: {train_path}")
    print(f"Test verisi kaydedildi: {test_path}")

    return train_path, test_path

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split


def load_data(file_path, threshold=1000):
    """Load the transaction CSV, keep the modelling columns and add the target.

    Args:
        file_path: Path of the CSV file to read.
        threshold: ``TransactionAmount`` value above which a row is labelled
            as a high transaction. Defaults to 1000, the value this notebook
            originally hard-coded.

    Returns:
        pandas.DataFrame: The five selected feature columns, in the order
        listed below, plus an integer ``HighTransaction`` column that is 1
        where ``TransactionAmount > threshold`` and 0 otherwise.
    """
    # Keep only the columns needed, to keep the problem simple
    selected_columns = ["TransactionAmount", "CustomerAge", "TransactionDuration", "LoginAttempts", "AccountBalance"]
    features = pd.read_csv(file_path)[selected_columns]

    # Target column: whether TransactionAmount exceeds the threshold.
    # assign() returns a new frame, so nothing is written into a column
    # slice of the frame that was read from disk.
    is_high = features["TransactionAmount"] > threshold
    return features.assign(HighTransaction=is_high.astype(int))


def preprocess_data(df):
    """Return a copy of ``df`` with every row containing a missing value removed.

    The input frame is left untouched: an in-place ``dropna`` would silently
    change the caller's object, which makes notebook cells non-idempotent
    when they are re-run.

    Args:
        df: pandas.DataFrame to clean.

    Returns:
        pandas.DataFrame: A new frame without rows that contain missing
        values. The original index labels of the surviving rows are kept.
    """
    # Required preprocessing step: remove rows with missing data
    return df.dropna()


def split_data(df, target="HighTransaction", test_size=0.3, random_state=42,
               stratify=False, exclude_features=()):
    """Split a frame into train/test features and target.

    With the default arguments this behaves exactly like the original
    notebook cell: every column except ``HighTransaction`` is a feature and
    30% of the rows go to the test set with ``random_state=42``.

    NOTE(review): ``load_data`` in this notebook derives ``HighTransaction``
    directly from ``TransactionAmount``. Leaving ``TransactionAmount`` in the
    features lets a model recover the target trivially (target leakage).
    Pass ``exclude_features=["TransactionAmount"]`` to remove it.

    Args:
        df: pandas.DataFrame containing the features and the target column.
        target: Name of the target column.
        test_size: Passed through to ``train_test_split``.
        random_state: Passed through to ``train_test_split``.
        stratify: When true, the split is stratified on the target so that
            both parts keep the class proportions. Off by default to preserve
            the original split.
        exclude_features: Extra column names to drop from the features.

    Returns:
        The result of ``train_test_split(X, y, ...)``, which the caller in
        this notebook unpacks as ``X_train, X_test, y_train, y_test``.
    """
    X = df.drop(columns=[target, *exclude_features])
    y = df[target]
    return train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )


In [3]:
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score

def evaluate_model(model, X_test, y_test):
    """Evaluate the leader of a trained AutoML run on held-out data.

    Prints the H2O performance object, the prediction frame, the accuracy
    and a text classification report.

    Args:
        model: Object exposing a ``leader`` attribute with
            ``model_performance`` and ``predict`` methods (the notebook passes
            the value returned by ``train_model``).
        X_test: pandas.DataFrame of test features.
        y_test: pandas.Series of test labels; its ``name`` is used as the
            target column name.

    Returns:
        tuple: ``(accuracy, report)`` where ``accuracy`` comes from
        ``accuracy_score`` and ``report`` is the dictionary form of
        ``classification_report``.
    """
    # Local import: this cell never imports h2o itself, so without it the
    # function only works if another cell has already put `h2o` in scope.
    import h2o

    target = y_test.name

    # Convert the test data to an H2OFrame and mark the target as categorical
    test_data = h2o.H2OFrame(pd.concat([X_test, y_test], axis=1))
    test_data[target] = test_data[target].asfactor()

    leader = model.leader

    # Evaluate the model on the test data
    performance = leader.model_performance(test_data)
    print(performance)

    # Predictions on the test data
    predictions = leader.predict(test_data).as_data_frame()
    print(predictions)

    # Score the predicted labels against the true labels
    y_pred = predictions['predict'].astype(int)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    print("Accuracy Score:", accuracy)
    print("Classification Report:\n", classification_report(y_test, y_pred))

    return accuracy, report

In [6]:
!pip install h2o
import h2o
from h2o.automl import H2OAutoML
import pandas as pd
import pickle


def train_model(X_train, y_train, balance_classes=True, class_sampling_factors=None):
    """Train an H2O AutoML run on the given training data.

    Args:
        X_train: pandas.DataFrame of training features.
        y_train: pandas.Series of training labels; its ``name`` becomes the
            response column.
        balance_classes: Forwarded to ``H2OAutoML``.
        class_sampling_factors: Forwarded to ``H2OAutoML``.

    Returns:
        The trained ``H2OAutoML`` object.
    """
    # Initialise H2O before any frame is created
    h2o.init()

    target = y_train.name

    # Build the training H2OFrame and make the target column categorical
    frame = h2o.H2OFrame(pd.concat([X_train, y_train], axis=1))
    frame[target] = frame[target].asfactor()

    # Runtime and model count are capped to limit model complexity and
    # training time (the original author's guard against overfitting).
    automl_options = {
        "max_runtime_secs": 180,
        "seed": 42,
        "stopping_metric": 'logloss',
        "stopping_rounds": 5,
        "max_models": 5,
        "balance_classes": balance_classes,
        "class_sampling_factors": class_sampling_factors,
    }
    aml = H2OAutoML(**automl_options)
    aml.train(y=target, training_frame=frame)

    return aml

def save_model(model, model_path):
    """Save the leader model of an AutoML run and report where it was written.

    Args:
        model: Object exposing a ``leader`` attribute (the notebook passes
            the value returned by ``train_model``).
        model_path: Location handed to ``h2o.save_model`` as ``path``. The
            notebook output shows the model being written *inside* this
            location under a generated model id, so it acts as a directory.

    Returns:
        The path returned by ``h2o.save_model``. Returning it lets callers
        record the real location instead of only the directory they passed.
    """
    saved_path = h2o.save_model(model=model.leader, path=model_path, force=True)
    print(f"Model saved to {saved_path}")
    return saved_path




[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
from src.model import train_model, save_model
from src.evaluate import evaluate_model
import numpy as np
import os


def dynamic_class_weight_training(X_train, y_train, X_test, y_test, results_path="results.txt", models_dir="models"):
    """Train one model per class-sampling factor until the class-1 F1 is acceptable.

    Weights are tried range by range. At most five weights are taken from
    each range: first ``np.linspace(0.5, 5.0, 10)``, then the fallback range
    ``np.linspace(5.5, 10.0, 5)``. Every trained model is saved under
    ``models_dir`` and one line per attempt is appended to ``results_path``.
    The search stops at the first model whose class-1 F1 lies inside the
    acceptable range.

    NOTE(review): the model is chosen using ``X_test``/``y_test``, so the
    reported scores are not an unbiased estimate; consider a separate
    validation split.

    Args:
        X_train, y_train: Training features and labels, forwarded to
            ``train_model``.
        X_test, y_test: Evaluation features and labels, forwarded to
            ``evaluate_model``.
        results_path: Text file the per-attempt results are appended to.
        models_dir: Directory under which each model is saved (created if
            missing).

    Returns:
        The first model whose class-1 F1 is within the acceptable range;
        otherwise the model with the highest class-1 F1 seen, or ``None`` if
        no attempt scored above 0.
    """
    # Create the directory the model files are saved to
    os.makedirs(models_dir, exist_ok=True)

    acceptable_f1_range = (0.75, 0.85)  # F1 range considered acceptable
    max_attempts_per_range = 5
    # The second range is a fallback that is only reached when the first
    # five attempts fail. Rebinding the loop's iterable from inside a `for`
    # loop does not affect the running iteration, so the ranges are walked
    # explicitly here.
    weight_ranges = [
        np.linspace(0.5, 5.0, 10),
        np.linspace(5.5, 10.0, 5),
    ]

    best_model = None
    best_f1_score = 0

    for range_index, class_weights in enumerate(weight_ranges):
        for weight in class_weights[:max_attempts_per_range]:
            print(f"Class weight faktörü: {weight}")
            model = train_model(X_train, y_train, balance_classes=True, class_sampling_factors=[1.0, weight])
            accuracy, report = evaluate_model(model, X_test, y_test)

            # F1 for class 1; treated as 0.0 when class 1 is missing from
            # the report instead of failing with a KeyError.
            f1_score_1 = report.get('1', {}).get('f1-score', 0.0)
            print(f"Sınıf 1 için F1-Score: {f1_score_1}")

            # The saver writes the model inside this location (it is used as
            # a directory, not as a pickle file). Prefer the real path when
            # the saver returns one.
            model_path = os.path.join(models_dir, f"model_weight_{weight:.2f}.pkl")
            model_path = save_model(model, model_path) or model_path
            print(f"Model kaydedildi: {model_path}")

            # Append the results of this attempt to the results file
            with open(results_path, "a") as file:
                file.write(f"Class weight: {weight} # Accuracy: {accuracy} # F1-Score (Class 1): {f1_score_1} # Model Path: {model_path}\n")

            # Stop as soon as the F1 score falls inside the acceptable range
            if acceptable_f1_range[0] <= f1_score_1 <= acceptable_f1_range[1]:
                print(f"Uygun F1-Score elde edildi: {f1_score_1}, Class weight faktörü: {weight}")
                return model

            # Otherwise remember the best-scoring model so far
            if f1_score_1 > best_f1_score:
                best_f1_score = f1_score_1
                best_model = model

        # Announce the switch only when another range is actually coming
        if range_index < len(weight_ranges) - 1:
            print("5 deneme yapıldı, uygun sonuç elde edilemedi. Farklı class weight faktörleri deneniyor.")

    return best_model

In [9]:


def main(data_path=None):
    """Run the pipeline: split the raw CSV, load, clean, split again, train.

    The raw file is first split into train/test CSV files. Only the train
    file is loaded here; it is cleaned, split again into train/test parts and
    handed to ``dynamic_class_weight_training``. The ``*_test.csv`` file
    written by the first step is not read by this function.

    Args:
        data_path: Path of the raw transactions CSV. When omitted, the
            location originally hard-coded in this notebook is used.

    Returns:
        The model returned by ``dynamic_class_weight_training``.
    """
    # Load location: fall back to the notebook's original local path
    if data_path is None:
        data_path = (r"C:\Users\CASPER\Desktop\5_hafta\data\bank_transactions_data_2.csv")

    # Split the raw data and save both parts
    saved_paths = split_and_save_data(data_path)

    # Load the training data. Use the path reported by the split step when
    # it returns one; otherwise derive it the way this notebook always did.
    if saved_paths:
        train_data_path = saved_paths[0]
    else:
        train_data_path = data_path.replace('.csv', '_train.csv')
    df = load_data(train_data_path)

    # Preprocessing
    df = preprocess_data(df)

    # Train/test split of the training file
    X_train, X_test, y_train, y_test = split_data(df)

    # Model training with dynamic class weights
    best_model = dynamic_class_weight_training(X_train, y_train, X_test, y_test)
    print("En iyi model eğitildi ve seçildi.")

    return best_model


if __name__ == "__main__":
    main()

Eğitim verisi kaydedildi: C:\Users\CASPER\Desktop\5_hafta\data\bank_transactions_data_2_train.csv
Test verisi kaydedildi: C:\Users\CASPER\Desktop\5_hafta\data\bank_transactions_data_2_test.csv
Class weight faktörü: 0.5
Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 25.401-b10, mixed mode)
  Starting server from C:\Users\CASPER\AppData\Local\anaconda3\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\CASPER\AppData\Local\Temp\tmpktq_n9no
  JVM stdout: C:\Users\CASPER\AppData\Local\Temp\tmpktq_n9no\h2o_CASPER_started_from_python.out
  JVM stderr: C:\Users\CASPER\AppData\Local\Temp\tmpktq_n9no\h2o_CASPER_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,04 secs
H2O_cluster_timezone:,Europe/Istanbul
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,8 days
H2O_cluster_name:,H2O_from_python_CASPER_twpath
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,10.59 Gb
H2O_cluster_total_cores:,32
H2O_cluster_allowed_cores:,32


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
18:47:41.851: AutoML: XGBoost is not available; skipping it.

███████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
ModelMetricsBinomialGLM: stackedensemble
** Reported on test data. **

MSE: 0.0018885593957084414
RMSE: 0.04345755855669346
LogLoss: 0.012413579636556942
AUC: 0.9999204771371769
AUCPR: 0.9984311714738687
Gini: 0.9998409542743538
Null degrees of freedom: 527
Residual degrees of freedom: 524
Null deviance: 207.6201491265345
Residual deviance: 13.108740096204231
AIC: 21.10874009620423

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.001431181544067482
       0    1    Error    Rate
-----  ---  ---  -------  -----------
0      502  1    0.002    (1.0/503.0)
1      0    25   0        (0.0/25.0)
Total  502  26   0.0019   (1.0/528.0)

Maximum M




 connected.


0,1
H2O_cluster_uptime:,52 secs
H2O_cluster_timezone:,Europe/Istanbul
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,8 days
H2O_cluster_name:,H2O_from_python_CASPER_twpath
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,9.35 Gb
H2O_cluster_total_cores:,32
H2O_cluster_allowed_cores:,32


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |█
18:48:45.225: AutoML: XGBoost is not available; skipping it.

██████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
ModelMetricsBinomialGLM: stackedensemble
** Reported on test data. **

MSE: 0.001893921228908151
RMSE: 0.043519205288104136
LogLoss: 0.023098861701549067
AUC: 0.9998409542743539
AUCPR: 0.9969807737613723
Gini: 0.9996819085487079
Null degrees of freedom: 527
Residual degrees of freedom: 525
Null deviance: 207.6201491265345
Residual deviance: 24.392397956832127
AIC: 30.392397956832127

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.9998349253792045
       0    1    Error    Rate
-----  ---  ---  -------  -----------
0      503  0    0        (0.0/503.0)
1      1    24   0.04     (1.0/25.0)
Total  504  24   0.0019   (1.0/528.0)

Maximum Me




     predict        p0        p1
0          0  0.999996  0.000004
1          1  0.000154  0.999846
2          0  0.999996  0.000004
3          0  0.999996  0.000004
4          0  0.999996  0.000004
..       ...       ...       ...
523        0  0.999996  0.000004
524        0  0.999996  0.000004
525        0  0.999996  0.000004
526        0  0.999996  0.000004
527        0  0.999996  0.000004

[528 rows x 3 columns]
Accuracy Score: 0.9981060606060606
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       503
           1       1.00      0.96      0.98        25

    accuracy                           1.00       528
   macro avg       1.00      0.98      0.99       528
weighted avg       1.00      1.00      1.00       528

Sınıf 1 için F1-Score: 0.9795918367346939
Model saved to C:\Users\CASPER\Desktop\5_hafta\models\model_weight_1.00.pkl\StackedEnsemble_BestOfFamily_1_AutoML_2_20241110_184845
Model kaydedildi: mo

0,1
H2O_cluster_uptime:,3 mins 04 secs
H2O_cluster_timezone:,Europe/Istanbul
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,8 days
H2O_cluster_name:,H2O_from_python_CASPER_twpath
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,9.34 Gb
H2O_cluster_total_cores:,32
H2O_cluster_allowed_cores:,32


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |█
18:50:57.262: AutoML: XGBoost is not available; skipping it.

██████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
ModelMetricsBinomialGLM: stackedensemble
** Reported on test data. **

MSE: 0.001893921228908151
RMSE: 0.043519205288104136
LogLoss: 0.023098861701549067
AUC: 0.9998409542743539
AUCPR: 0.9969807737613723
Gini: 0.9996819085487079
Null degrees of freedom: 527
Residual degrees of freedom: 525
Null deviance: 207.6201491265345
Residual deviance: 24.392397956832127
AIC: 30.392397956832127

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.9998349253792045
       0    1    Error    Rate
-----  ---  ---  -------  -----------
0      503  0    0        (0.0/503.0)
1      1    24   0.04     (1.0/25.0)
Total  504  24   0.0019   (1.0/528.0)

Maximum Me




     predict        p0        p1
0          0  0.999996  0.000004
1          1  0.000154  0.999846
2          0  0.999996  0.000004
3          0  0.999996  0.000004
4          0  0.999996  0.000004
..       ...       ...       ...
523        0  0.999996  0.000004
524        0  0.999996  0.000004
525        0  0.999996  0.000004
526        0  0.999996  0.000004
527        0  0.999996  0.000004

[528 rows x 3 columns]
Accuracy Score: 0.9981060606060606
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       503
           1       1.00      0.96      0.98        25

    accuracy                           1.00       528
   macro avg       1.00      0.98      0.99       528
weighted avg       1.00      1.00      1.00       528

Sınıf 1 için F1-Score: 0.9795918367346939
Model saved to C:\Users\CASPER\Desktop\5_hafta\models\model_weight_1.50.pkl\StackedEnsemble_BestOfFamily_1_AutoML_3_20241110_185057
Model kaydedildi: mo

KeyboardInterrupt: 