In [2]:
import pandas as pd

# Read the complete training CSV into memory as a single DataFrame.
data = pd.read_csv(filepath_or_buffer='fraudtrain.csv')

In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from scipy.sparse import csr_matrix, vstack, hstack
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from collections import Counter

# Function to process data in chunks and convert to sparse matrix
def process_data_chunks(file_path, chunk_size, scaler, encoder,
                        numerical_features=None, categorical_features=None,
                        include_time_features=False):
    """Read a CSV in chunks and stack the transformed features into one sparse matrix.

    Each chunk's numerical columns are passed through ``scaler.transform`` and
    its categorical columns through ``encoder.transform``; the two results are
    joined column-wise, and all chunks are then stacked row-wise.

    Parameters
    ----------
    file_path : str or path-like
        CSV file to read.
    chunk_size : int
        Number of rows per chunk handed to ``pd.read_csv(chunksize=...)``.
    scaler : object
        Already-fitted transformer exposing ``transform`` for the numerical columns.
    encoder : object
        Already-fitted transformer exposing ``transform`` for the categorical
        columns; its result must be accepted by ``scipy.sparse.hstack``.
    numerical_features : list of str, optional
        Columns to scale. Defaults to ``['amt', 'lat', 'long', 'city_pop']``.
    categorical_features : list of str, optional
        Columns to encode. Defaults to ``['merchant', 'category', 'gender']``.
    include_time_features : bool, default False
        When True, two extra columns (hour of day and day of week, derived from
        ``trans_date_trans_time``, unscaled) are appended after the encoded
        columns. The default keeps the output layout at scaled numerical
        columns followed by encoded columns.

    Returns
    -------
    scipy sparse matrix
        One row per CSV row, in file order.
    """
    if numerical_features is None:
        numerical_features = ['amt', 'lat', 'long', 'city_pop']
    if categorical_features is None:
        categorical_features = ['merchant', 'category', 'gender']

    sparse_chunks = []

    for chunk in pd.read_csv(file_path, chunksize=chunk_size):
        # Use the transformed values directly instead of writing them back
        # into the chunk, so the chunk's columns are never mutated.
        scaled_numeric = scaler.transform(chunk[numerical_features])
        encoded_features = encoder.transform(chunk[categorical_features])

        blocks = [csr_matrix(scaled_numeric), encoded_features]

        if include_time_features:
            # Parse the timestamp column once and derive both features from it.
            timestamps = pd.to_datetime(chunk['trans_date_trans_time'])
            time_frame = pd.DataFrame({
                'trans_hour': timestamps.dt.hour,
                'trans_day_of_week': timestamps.dt.dayofweek,
            })
            blocks.append(csr_matrix(time_frame.astype(float).values))

        # Column-wise join of this chunk's feature groups.
        sparse_chunks.append(hstack(blocks))

    # Row-wise concatenation of all chunks into a single sparse matrix.
    final_sparse_matrix = vstack(sparse_chunks)

    return final_sparse_matrix

# Location of the training CSV and the chunk size used for the full pass
file_path = 'fraudtrain.csv'
chunk_size = 100000

# Load initial chunk to fit the scaler and encoder
initial_chunk = pd.read_csv(file_path, nrows=100000)
# Parse the timestamp column once and derive both time features from it
initial_timestamps = pd.to_datetime(initial_chunk['trans_date_trans_time'])
initial_chunk['trans_hour'] = initial_timestamps.dt.hour
initial_chunk['trans_day_of_week'] = initial_timestamps.dt.dayofweek

# Initialize and fit scaler and encoder
numerical_features = ['amt', 'lat', 'long', 'city_pop']
categorical_features = ['merchant', 'category', 'gender']
scaler = StandardScaler()
# Sparse output is OneHotEncoder's default; the explicit `sparse=` keyword is
# omitted because recent scikit-learn releases no longer accept it.
encoder = OneHotEncoder(handle_unknown='ignore')
# NOTE(review): both transformers are fitted on the first 100000 rows only.
# With handle_unknown='ignore', categories that first appear later in the file
# are encoded as all zeros, and the scaling statistics come from this subset.
scaler.fit(initial_chunk[numerical_features])
encoder.fit(initial_chunk[categorical_features])

# Process the full dataset in chunks
final_sparse_matrix = process_data_chunks(file_path, chunk_size, scaler, encoder)

# Load target variable
data = pd.read_csv(file_path, usecols=['is_fraud'])
y = data['is_fraud']

# The features and the target come from two separate reads of the same file;
# fail early if their row counts ever diverge.
assert final_sparse_matrix.shape[0] == len(y), "feature matrix and target have different row counts"

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(final_sparse_matrix, y, test_size=0.2, random_state=42)

# Handling imbalanced data with SMOTE (applied to the training portion only)
print("Original training dataset shape:", Counter(y_train))
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)



Original training dataset shape: Counter({0: 1031354, 1: 5986})


In [None]:
import optuna
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

def objective(trial):
    """Optuna objective: train an XGBoost classifier and return its validation ROC AUC.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on the held-out
        20% of ``X_train_res`` / ``y_train_res``.
    """
    param = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        # suggest_float replaces the deprecated suggest_loguniform / suggest_uniform.
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'random_state': 42,
    }

    # Split the resampled training data into fit and validation parts.
    # Local names are distinct from the notebook-level X_train / y_train so
    # the globals are not shadowed inside this function.
    # NOTE(review): the validation part is drawn from SMOTE-resampled data, so
    # it contains synthetic rows; the AUC may be optimistic compared with
    # untouched data - confirm this is intended.
    X_fit, X_valid, y_fit, y_valid = train_test_split(
        X_train_res, y_train_res, test_size=0.2, random_state=42
    )

    model = xgb.XGBClassifier(**param)
    model.fit(X_fit, y_fit, eval_set=[(X_valid, y_valid)], verbose=False)

    # Score with positive-class probabilities, not hard labels.
    preds = model.predict_proba(X_valid)[:, 1]
    auc = roc_auc_score(y_valid, preds)

    return auc

# Run optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, n_jobs=-1)

print("Completed Run Optimization")

# Get best parameters
best_params = study.best_params
print(f"Best parameters: {best_params}")

# Train the final model with the best parameters
best_xgb_model = xgb.XGBClassifier(**best_params, random_state=42, use_label_encoder=False, eval_metric='logloss')
best_xgb_model.fit(X_train_res, y_train_res)

print("Model fitted")
# Predict on the test set: hard labels for the classification report,
# positive-class probabilities for the ranking metric
y_pred = best_xgb_model.predict(X_test)
y_proba = best_xgb_model.predict_proba(X_test)[:, 1]


# Evaluate the model
print(classification_report(y_test, y_pred))
# ROC AUC is computed from scores; passing the 0/1 predictions would reduce
# the curve to a single operating point.
print(f'ROC AUC Score: {roc_auc_score(y_test, y_proba)}')
