# In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

def calculate_haversine(lat1, lon1, lat2, lon2, radius=6371.0):
    """Return the great-circle distance between two sets of coordinates.

    The computation is vectorized with NumPy, so every coordinate argument
    may be a scalar, a sequence, a NumPy array or a pandas Series, as long
    as the four arguments broadcast against each other.

    Args:
        lat1: Latitude(s) of the first point, in degrees.
        lon1: Longitude(s) of the first point, in degrees.
        lat2: Latitude(s) of the second point, in degrees.
        lon2: Longitude(s) of the second point, in degrees.
        radius: Sphere radius; the result is expressed in the same unit.
            Defaults to 6371, the Earth radius in km used originally.

    Returns:
        The Haversine distance(s), in the unit of ``radius``.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    delta_lat = lat2 - lat1
    delta_lon = lon2 - lon1
    a = np.sin(delta_lat / 2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(delta_lon / 2.0)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt(1 - a) return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return radius * c

def generate_time_attributes(data):
    """Add timestamp-derived columns to ``data`` in place and return it.

    The ``trans_date`` and ``trans_time`` columns are joined with a space and
    parsed into a ``timestamp`` column. From it the function derives ``hour``,
    ``weekday`` (``dt.dayofweek``) and a sine/cosine encoding of the hour on a
    24-hour circle (``hour_sine`` / ``hour_cosine``).
    """
    date_and_time = data['trans_date'] + ' ' + data['trans_time']
    data['timestamp'] = pd.to_datetime(date_and_time)

    stamp = data['timestamp'].dt
    data['hour'] = stamp.hour
    data['weekday'] = stamp.dayofweek

    # Map the hour onto a circle so that 23:00 and 00:00 end up adjacent.
    hour_angle = 2 * np.pi * data['hour'] / 24
    data['hour_sine'] = np.sin(hour_angle)
    data['hour_cosine'] = np.cos(hour_angle)
    return data

def compute_distance_feature(data):
    """Store the Haversine distance between the two coordinate pairs.

    Reads ``lat``/``long`` and ``merch_lat``/``merch_long`` from ``data``,
    writes the result of ``calculate_haversine`` to a new ``distance_km``
    column in place, and returns the same frame.
    """
    first_lat, first_lon = data['lat'], data['long']
    merch_lat, merch_lon = data['merch_lat'], data['merch_long']
    data['distance_km'] = calculate_haversine(first_lat, first_lon, merch_lat, merch_lon)
    return data

def robust_label_encoding(train_set, test_set):
    """Integer-encode every object-dtype column of ``train_set`` in both frames.

    For each object column of ``train_set`` the values are cast to ``str``,
    the sorted set of distinct strings seen in train and test combined forms
    the classes, and each value is replaced by the position of its class in
    that sorted list. This is the same mapping a label encoder fitted on
    train + test would produce.

    Columns that exist only in ``train_set`` are encoded from the train values
    alone instead of raising ``KeyError``. Object columns that exist only in
    ``test_set`` are left untouched.

    Both frames are modified in place and returned as ``(train_set, test_set)``.
    """
    for col in train_set.select_dtypes(include='object').columns:
        train_values = train_set[col].astype(str).to_numpy(dtype=str)
        has_test_col = col in test_set.columns

        if has_test_col:
            test_values = test_set[col].astype(str).to_numpy(dtype=str)
            # Fit on the union so categories seen only in test still get a code.
            classes = np.unique(np.concatenate([train_values, test_values]))
        else:
            classes = np.unique(train_values)

        # `classes` is sorted and contains every value, so searchsorted returns
        # the exact index of each value, i.e. its label code.
        train_set[col] = np.searchsorted(classes, train_values)
        if has_test_col:
            test_set[col] = np.searchsorted(classes, test_values)
    return train_set, test_set

print("Step 1: Loading Data...")
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

print("Step 2: Feature Engineering...")
train_df = generate_time_attributes(train_df)
test_df = generate_time_attributes(test_df)

train_df = compute_distance_feature(train_df)
test_df = compute_distance_feature(test_df)

# Capture the submission ids BEFORE label encoding: if `id` is a string
# column, encoding would replace it with integer codes and the submission
# file would no longer carry the original identifiers.
test_ids = test_df['id'].copy()

# Drop irrelevant features first, so identifier-like columns such as
# `trans_num` are not label-encoded only to be discarded afterwards.
drop_columns = ['trans_num', 'trans_date', 'trans_time', 'timestamp', 'id']
train_df = train_df.drop(columns=drop_columns, errors='ignore')
test_df = test_df.drop(columns=drop_columns, errors='ignore')

# Robust Label Encoding for the remaining categorical features
train_df, test_df = robust_label_encoding(train_df, test_df)

# Define features and target
X = train_df.drop(columns=['is_fraud'], errors='ignore')
y = train_df['is_fraud']
X_test = test_df[X.columns]  # Ensure test columns align with train

# Handle missing values. Assign the result instead of using inplace=True:
# X_test is a column selection of test_df, and an in-place fill on it can
# trigger pandas' SettingWithCopyWarning.
X = X.fillna(-1)
X_test = X_test.fillna(-1)

print("Step 3: Setting up Cross-Validation...")
n_folds = 5
kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
lgb_params = {'n_estimators': 500, 'learning_rate': 0.03, 'num_leaves': 31, 'random_state': 42, 'class_weight': 'balanced'}


def lgb_f1_metric(y_true, y_pred):
    """Custom LightGBM evaluation metric: F1 of the positive class at 0.5.

    'f1' is not one of LightGBM's built-in metric names, so it has to be
    supplied as a callable. The scikit-learn API expects the callable to
    return ``(eval_name, eval_result, is_higher_better)``.

    NOTE(review): assumes ``y_pred`` holds positive-class probabilities, which
    is what LightGBM documents for the built-in binary objective; confirm for
    the installed version.
    """
    actual_positive = np.asarray(y_true) == 1
    predicted_positive = np.asarray(y_pred) >= 0.5
    true_positive = np.sum(actual_positive & predicted_positive)
    # F1 = 2TP / (2TP + FP + FN) = 2TP / (#predicted positive + #actual positive)
    denominator = predicted_positive.sum() + actual_positive.sum()
    score = 2.0 * true_positive / denominator if denominator else 0.0
    return 'f1', score, True


# Cross-validation training
out_of_fold_preds = np.zeros(len(X))  # one out-of-fold probability per training row
test_predictions = []  # one array of test probabilities per fold

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f"Fold {fold + 1}")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = lgb.LGBMClassifier(**lgb_params)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric=lgb_f1_metric)

    # Keep positive-class probabilities; the decision threshold is tuned later.
    out_of_fold_preds[val_idx] = model.predict_proba(X_val)[:, 1]
    test_predictions.append(model.predict_proba(X_test)[:, 1])

print("Step 4: Finding Best Threshold...")
# The defaults survive only if no candidate threshold scores an F1 above 0.
best_threshold, best_f1 = 0.5, 0
for threshold in np.linspace(0.01, 0.99, 100):
    # Binarize the out-of-fold probabilities at the candidate cut-off.
    preds = (out_of_fold_preds >= threshold).astype(int)
    score = f1_score(y, preds)
    if not score > best_f1:
        continue
    # Strict improvement only, so the lowest threshold wins on ties.
    best_f1, best_threshold = score, threshold

print("Step 5: Generating Final Predictions...")
# Average the per-fold test probabilities, then apply the tuned threshold.
final_test_preds = (np.mean(test_predictions, axis=0) >= best_threshold).astype(int)


# Save Submission
submission = pd.DataFrame({'id': test_ids, 'is_fraud': final_test_preds})
submission.to_csv('submission.csv', index=False)
print(f"Best Threshold from CV: {best_threshold:.3f}, Best OOF F1-score: {best_f1:.5f}")
