In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import f1_score, classification_report
from xgboost import XGBClassifier
from datetime import datetime


def haversine(lat1, lon1, lat2, lon2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    Every argument may be a scalar or an array-like (NumPy array, pandas
    Series); the usual NumPy broadcasting rules apply.

    Args:
        lat1, lon1: latitude / longitude of the first point, in degrees.
        lat2, lon2: latitude / longitude of the second point, in degrees.

    Returns:
        The distance in kilometers, with the broadcast shape of the inputs.
    """
    R = 6371  # Radius of earth in kilometers

    # Convert decimal degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2

    # Rounding can leave `a` a few ulps above 1 for (near-)antipodal points,
    # which would make arcsin return NaN; clamp it to its valid range.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))

    return R * c


class FraudDetectionModel:
    """
    End-to-end fraud classifier: loads the train/test CSVs, engineers
    features, tunes an XGBoost classifier with a grid search and predicts
    the test set.

    Attributes populated by ``__init__`` / ``preprocess_data``:
        train_df, test_df: preprocessed frames. An ``id`` column present in
            the input is carried over so callers can build a submission.
        fraud_counts: per-``cc_num`` label counts and ``fraud_score``
            computed from the training data.
        X, y: training feature matrix and target.
        X_test: test feature matrix, with the same columns as ``X``.
    """

    # Model inputs, in the column order handed to the classifier.
    _FEATURES = (
        'amt', 'gender', 'category', 'job', 'state', 'city_pop',
        'hour', 'day', 'month', 'weekday',
        'age', 'haversine_distance', 'fraud_score',
    )
    # String columns that are turned into integer codes.
    _CATEGORICAL = ('category', 'state', 'job')
    _GENDER_CODES = {'F': 0, 'M': 1}
    # Code given to a category value that was never seen while fitting.
    _UNSEEN_CODE = -1

    def __init__(self, train_path, test_path):
        """
        Initialize the fraud detection model with training and test data

        Args:
            train_path: path of the training CSV (must contain ``is_fraud``).
            test_path: path of the test CSV.
        """
        # Load the data
        self.train_df = pd.read_csv(train_path)
        self.test_df = pd.read_csv(test_path)

        # State learned from the training data and reused on the test data
        self.fraud_counts = None
        self._category_codes = None
        self._impute_values = None
        self.preprocess_data()

    def preprocess_data(self):
        """
        Preprocess both frames and derive ``X``, ``y`` and ``X_test``.

        The training frame is processed first so that everything learned
        from it (fraud counts, category codes, imputation medians) is
        applied unchanged to the test frame.
        """
        # Preprocess training data and get fraud counts
        self.train_df, self.fraud_counts = self.enhanced_preprocess(self.train_df, is_train=True)

        # Preprocess test data using the fraud counts from training
        self.test_df = self.enhanced_preprocess(self.test_df, is_train=False, fraud_counts=self.fraud_counts)

        # 'id' is a row identifier, not a signal: keep it in the frames for
        # the submission file but never hand it to the model.
        self.X = self.train_df.drop(columns=['is_fraud', 'id'], errors='ignore')
        self.y = self.train_df['is_fraud']

        # Prepare test data features (same columns as self.X)
        self.X_test = self.test_df.drop(columns=['id'], errors='ignore')

    def enhanced_preprocess(self, data, is_train=True, fraud_counts=None):
        """
        Enhanced preprocessing function

        Args:
            data: raw frame holding the columns read below (``cc_num``,
                ``trans_date``, ``trans_time``, ``dob``, ``lat``, ``long``,
                ``merch_lat``, ``merch_long``, the model features and, for
                training data, ``is_fraud``).
            is_train: when True, fraud counts, category codes and imputation
                medians are (re)learned from ``data``.
            fraud_counts: per-``cc_num`` counts to merge in; required when
                ``is_train`` is False, ignored otherwise.

        Returns:
            ``(frame, fraud_counts)`` when ``is_train`` is True, otherwise
            just the frame. The frame holds the model features as floats,
            plus ``is_fraud`` for training data and ``id`` when the input
            had one.

        Raises:
            ValueError: if ``is_train`` is False and no ``fraud_counts`` is
                supplied.
        """
        # Keep the 'id' values by position: the merge below resets the index,
        # so re-attaching a Series by label could misalign rows.
        ids = data['id'].to_numpy() if 'id' in data.columns else None

        if is_train:
            # NOTE(review): fraud_score is built from the labels of the very
            # rows it is merged onto, so it leaks the target into the
            # training features and likely inflates CV scores — consider
            # out-of-fold counts.
            fraud_counts = self._compute_fraud_counts(data)
        elif fraud_counts is None:
            raise ValueError("fraud_counts is required when is_train is False")

        # Merge fraud counts into the data; cards unknown to fraud_counts get
        # NaN and are imputed later. validate= guards against duplicated
        # cc_num rows silently multiplying transactions.
        data = data.merge(fraud_counts, on='cc_num', how='left', validate='many_to_one')

        data = self._add_engineered_features(data)

        # Select features
        columns = list(self._FEATURES)
        if is_train:
            columns.append('is_fraud')
        data = self._encode_and_impute(data[columns], fit=is_train)

        # Add 'id' column back if it exists
        if ids is not None:
            data['id'] = ids

        if is_train:
            return data, fraud_counts
        return data

    @staticmethod
    def _compute_fraud_counts(data):
        """Count legitimate / fraudulent rows per ``cc_num`` and score each card."""
        labels = data['is_fraud']
        cards = data['cc_num']
        # Counting each class separately keeps both columns present even when
        # the data contains only one class.
        counts = pd.DataFrame({
            'is_fraud_0_count': (labels == 0).groupby(cards).sum(),
            'is_fraud_1_count': (labels == 1).groupby(cards).sum(),
        }).rename_axis('cc_num').reset_index()
        counts['fraud_score'] = (counts['is_fraud_0_count'] * 10) - (counts['is_fraud_1_count'] * 50)
        return counts

    @staticmethod
    def _add_engineered_features(data):
        """Return ``data`` plus the age, calendar and distance feature columns."""
        trans_datetime = pd.to_datetime(data['trans_date'] + ' ' + data['trans_time'])
        dob = pd.to_datetime(data['dob'], errors='coerce')
        return data.assign(
            age=(trans_datetime - dob).dt.days / 365.25,
            hour=trans_datetime.dt.hour,
            day=trans_datetime.dt.day,
            month=trans_datetime.dt.month,
            weekday=trans_datetime.dt.weekday,
            # Distance between cardholder and merchant
            haversine_distance=haversine(
                data['lat'], data['long'], data['merch_lat'], data['merch_long']
            ),
        )

    def _encode_and_impute(self, data, fit):
        """
        Encode gender/categorical columns as numbers and fill missing values.

        With ``fit=True`` (or when nothing has been learned yet) the category
        codes and per-column medians are learned from ``data`` and stored on
        the instance; otherwise the stored ones are applied, so a category
        keeps the same code in train and test and unseen categories map to
        ``_UNSEEN_CODE``.
        """
        data = data.copy()  # never write into the caller's frame / a slice
        data['gender'] = data['gender'].map(self._GENDER_CODES)

        codes = getattr(self, '_category_codes', None)
        fill_values = getattr(self, '_impute_values', None)
        refit = fit or codes is None or fill_values is None

        if refit:
            # Sorted string labels -> 0..n-1
            codes = {
                col: {
                    label: code
                    for code, label in enumerate(sorted(data[col].astype(str).unique()))
                }
                for col in self._CATEGORICAL
            }
        for col in self._CATEGORICAL:
            data[col] = data[col].astype(str).map(codes[col]).fillna(self._UNSEEN_CODE)

        data = data.astype(float)
        if refit:
            fill_values = data.median()
            self._category_codes = codes
            self._impute_values = fill_values

        # Median imputation; medians for columns absent from `data` are ignored.
        return data.fillna(fill_values)

    def create_preprocessing_pipeline(self):
        """
        Create preprocessing pipeline

        Placeholder: the pipeline-based preprocessing was removed, so this
        always returns None. Preprocessing happens in ``enhanced_preprocess``.
        """
        return None

    def find_best_hyperparameters(self, X, y):
        """
        Grid-search XGBoost hyperparameters, scoring by F1 with 3-fold CV.

        Args:
            X: training feature matrix.
            y: training target.

        Returns:
            The ``best_estimator_`` of the fitted grid search (a one-step
            pipeline wrapping the classifier).
        """
        param_grid = {
            'classifier__n_estimators': [300, 500, 550, 600],
            'classifier__max_depth': [None, 3, 10],
            'classifier__learning_rate': [0.1, 0.2, 0.3],
        }
        pipeline = Pipeline([
            ('classifier', XGBClassifier(eval_metric='logloss', random_state=42))
        ])
        grid_search = GridSearchCV(pipeline, param_grid, scoring='f1', cv=3)
        grid_search.fit(X, y)
        print(f"Best Parameters: {grid_search.best_params_}")
        return grid_search.best_estimator_

    def train_and_predict(self, random_state=42):
        """
        Tune, fit and evaluate the model, then predict the test set.

        Args:
            random_state: seed for the cross-validation split used in the
                evaluation step.

        Returns:
            ``(best_model, predictions)`` where ``predictions`` are the
            model's outputs for ``self.X_test``.
        """
        print("Finding the best hyperparameters...")
        best_model = self.find_best_hyperparameters(self.X, self.y)
        print("Training the best model...")
        # Fit explicitly so this step does not depend on the search having
        # already refit the estimator it returned.
        best_model.fit(self.X, self.y)
        predictions = best_model.predict(self.X_test)
        print("Evaluating the model...")
        self.evaluate_model(best_model, self.X, self.y, random_state=random_state)
        return best_model, predictions

    def evaluate_model(self, model, X, y, random_state=42):
        """
        Print the mean and standard deviation of the F1 score over a
        shuffled, stratified 10-fold cross-validation.

        Args:
            model: estimator to evaluate (cloned and refit per fold by
                ``cross_val_score``).
            X: feature matrix.
            y: target.
            random_state: seed for the fold shuffling.
        """
        cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)
        f1_scores = cross_val_score(model, X, y, cv=cv, scoring='f1')
        print(f"Mean F1 Score: {f1_scores.mean():.4f}")
        print(f"Standard Deviation: {f1_scores.std():.4f}")


# Main execution
def main():
    """
    Build the fraud model from 'train.csv' / 'test.csv', train it and
    predict the test set.

    Returns:
        ``(fraud_model, best_model, predictions)`` on success. If anything
        raises, the error is printed and ``(None, None, None)`` is returned,
        so callers can always unpack three values.
    """
    try:
        fraud_model = FraudDetectionModel('train.csv', 'test.csv')
        best_model, predictions = fraud_model.train_and_predict()
    except Exception as e:
        # Top-level boundary: report the failure and signal it with Nones
        # instead of falling through and returning a bare None.
        print(f"An error occurred: {e}")
        return None, None, None
    return fraud_model, best_model, predictions

if __name__ == "__main__":
    # main() can come back without a usable result after a failure; normalise
    # that to three Nones so the unpacking below never raises.
    result = main()
    fraud_model, best_model, predictions = result if result is not None else (None, None, None)

    # Explicit None checks: truthiness of a model object depends on whatever
    # __len__/__bool__ it happens to define.
    if fraud_model is not None and best_model is not None and predictions is not None:
        # Extract the 'id' column from the test dataset
        test_ids = fraud_model.test_df['id']

        # Create submission DataFrame
        submission_df = pd.DataFrame({
            'id': test_ids,
            'is_fraud': predictions
        })

        # Save submission file
        submission_df.to_csv('submission.csv', index=False)
        print("Submission file 'submission.csv' has been created.")
    else:
        print("Failed to generate submission file due to earlier errors.")

Finding the best hyperparameters...
Best Parameters: {'classifier__learning_rate': 0.2, 'classifier__max_depth': None, 'classifier__n_estimators': 500}
Training the best model...
Evaluating the model...
Mean F1 Score: 0.9800
Standard Deviation: 0.0020
Submission file 'submission.csv' has been created.


Submission file 'submission.csv' has been created.
