In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Read the four competition CSV files (identity + transaction tables for the
# train and test splits) in one pass; the names on the left line up one-to-one
# with the paths listed below.
train_identity, train_transaction, test_identity, test_transaction = (
    pd.read_csv(csv_path)
    for csv_path in (
        'D:/MLPR/ieee/train_identity.csv',
        'D:/MLPR/ieee/train_transaction.csv',
        'D:/MLPR/ieee/test_identity.csv',
        'D:/MLPR/ieee/test_transaction.csv',
    )
)


In [4]:
# Attach the identity table to the transaction table for each split.
# A left join keeps every transaction row; transactions without a matching
# TransactionID in the identity table get NaN in the identity columns.
train_data = train_transaction.merge(
    train_identity,
    on='TransactionID',
    how='left',
)

test_data = test_transaction.merge(
    test_identity,
    on='TransactionID',
    how='left',
)


In [5]:
# Fill missing values with a sentinel so no NaN remains in either frame.
train_data.fillna(-999, inplace=True)
test_data.fillna(-999, inplace=True)

# Convert categorical features to numerical.
# The integer codes are derived from the TRAINING categories and the same
# mapping is then applied to the test frame. Encoding each frame on its own
# (astype('category').cat.codes per frame) would number the categories
# independently, so one string could receive different codes in train and
# test. Test values never seen in training become -1.
categorical_features = ['DeviceType', 'DeviceInfo']  # Add more categorical features if needed
for feature in categorical_features:
    train_encoded = train_data[feature].astype('category')
    test_data[feature] = test_data[feature].astype(train_encoded.dtype).cat.codes
    train_data[feature] = train_encoded.cat.codes

# Define features and target variable.
# TransactionID is an identifier, not a predictor, so it is dropped from both
# feature matrices; isFraud is the label and only exists in the training data.
X = train_data.drop(columns=['TransactionID', 'isFraud'])
y = train_data['isFraud']
X_test = test_data.drop(columns=['TransactionID'])


In [6]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [9]:
# Show the per-column dtypes of the raw identity table, then of the raw
# transaction table, to see which columns are stored as objects.
for raw_frame in (train_identity, train_transaction):
    print(raw_frame.dtypes)

TransactionID      int64
id_01            float64
id_02            float64
id_03            float64
id_04            float64
id_05            float64
id_06            float64
id_07            float64
id_08            float64
id_09            float64
id_10            float64
id_11            float64
id_12             object
id_13            float64
id_14            float64
id_15             object
id_16             object
id_17            float64
id_18            float64
id_19            float64
id_20            float64
id_21            float64
id_22            float64
id_23             object
id_24            float64
id_25            float64
id_26            float64
id_27             object
id_28             object
id_29             object
id_30             object
id_31             object
id_32            float64
id_33             object
id_34             object
id_35             object
id_36             object
id_37             object
id_38             object
DeviceType        object


In [12]:
# Print the column index of each feature matrix, labelled by split.
for heading, frame in (
    ("Training Data Columns:", X),
    ("\nTesting Data Columns:", X_test),
):
    print(heading)
    print(frame.columns)

# Set differences in both directions expose columns that exist in only one
# of the two frames.
train_column_set = set(X.columns)
test_column_set = set(X_test.columns)
missing_in_test = train_column_set - test_column_set
missing_in_train = test_column_set - train_column_set

print(f"\nColumns in training data but not in testing data: {missing_in_test}")
print(f"Columns in testing data but not in training data: {missing_in_train}")

Training Data Columns:
Index(['TransactionDT', 'TransactionAmt', 'ProductCD', 'card1', 'card2',
       'card3', 'card4', 'card5', 'card6', 'addr1',
       ...
       'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38',
       'DeviceType', 'DeviceInfo'],
      dtype='object', length=432)

Testing Data Columns:
Index(['TransactionDT', 'TransactionAmt', 'ProductCD', 'card1', 'card2',
       'card3', 'card4', 'card5', 'card6', 'addr1',
       ...
       'id-31', 'id-32', 'id-33', 'id-34', 'id-35', 'id-36', 'id-37', 'id-38',
       'DeviceType', 'DeviceInfo'],
      dtype='object', length=432)

Columns in training data but not in testing data: {'id_29', 'id_03', 'id_13', 'id_25', 'id_21', 'id_26', 'id_36', 'id_38', 'id_17', 'id_35', 'id_31', 'id_12', 'id_18', 'id_27', 'id_28', 'id_33', 'id_15', 'id_01', 'id_07', 'id_22', 'id_20', 'id_16', 'id_30', 'id_02', 'id_05', 'id_08', 'id_19', 'id_06', 'id_10', 'id_24', 'id_23', 'id_04', 'id_34', 'id_11', 'id_32', 'id_09', 'id_14',

In [13]:
# Align columns.
# The test identity columns are spelled with a hyphen ('id-31') while the
# training columns use an underscore ('id_31'). Reindexing straight onto
# X.columns would therefore discard every real test identity column and
# replace it with a constant -999 column. Normalise the spelling first so the
# reindex only reorders columns and fills those that are genuinely absent.
X_test = X_test.rename(columns=lambda name: name.replace('-', '_'))

# Surface anything that will still be filled with the sentinel instead of
# silently inventing constant columns.
columns_absent_from_test = [col for col in X.columns if col not in X_test.columns]
if columns_absent_from_test:
    print(f"Columns filled with -999 in X_test: {columns_absent_from_test}")

X_test = X_test.reindex(columns=X.columns, fill_value=-999)

# Verify columns
print("Aligned Testing Data Columns:")
print(X_test.columns)


Aligned Testing Data Columns:
Index(['TransactionDT', 'TransactionAmt', 'ProductCD', 'card1', 'card2',
       'card3', 'card4', 'card5', 'card6', 'addr1',
       ...
       'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38',
       'DeviceType', 'DeviceInfo'],
      dtype='object', length=432)


In [14]:
# Identify columns with non-numeric data (object dtype) in the training features.
non_numeric_columns = X.select_dtypes(include=['object']).columns
print(f"Non-numeric columns: {non_numeric_columns}")

# Convert categorical features to numeric.
# The category -> integer mapping is learned from the training column and
# reused for the test column. Calling astype('category').cat.codes on each
# frame separately numbers the categories independently, so the same value
# can end up with different codes in X and X_test. Training codes are
# unchanged by this; test values unseen in training are encoded as -1.
for feature in non_numeric_columns:
    train_encoded = X[feature].astype('category')
    if feature in X_test.columns:
        X_test[feature] = X_test[feature].astype(train_encoded.dtype).cat.codes
    X[feature] = train_encoded.cat.codes

# Fill remaining missing values
X.fillna(-999, inplace=True)
X_test.fillna(-999, inplace=True)

# Ensure that all data is numeric
print(X.dtypes)
print(X_test.dtypes)

Non-numeric columns: Index(['id_15', 'id_16', 'id_23', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31',
       'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38'],
      dtype='object')
TransactionDT       int64
TransactionAmt    float64
ProductCD            int8
card1               int64
card2             float64
                   ...   
id_36                int8
id_37                int8
id_38                int8
DeviceType           int8
DeviceInfo          int16
Length: 432, dtype: object
TransactionDT       int64
TransactionAmt    float64
ProductCD            int8
card1               int64
card2             float64
                   ...   
id_36                int8
id_37                int8
id_38                int8
DeviceType           int8
DeviceInfo          int16
Length: 432, dtype: object


In [17]:
# Detect the object-dtype columns inside this cell. They were previously taken
# from variables that are only assigned in later cells, so the cell raised a
# NameError when the notebook was run top to bottom on a fresh kernel.
non_numeric_columns_X = X.select_dtypes(include=['object']).columns
non_numeric_columns_X_test = X_test.select_dtypes(include=['object']).columns

# Convert categorical features to numeric in X, and apply the SAME
# category -> code mapping to the matching X_test column so a given value has
# one code in both frames (test values unseen in training become -1).
for feature in non_numeric_columns_X:
    train_encoded = X[feature].astype('category')
    if feature in X_test.columns:
        X_test[feature] = X_test[feature].astype(train_encoded.dtype).cat.codes
    X[feature] = train_encoded.cat.codes

# Object columns that are non-numeric only in X_test have no training mapping
# to reuse; encode them on their own so X_test ends up fully numeric.
for feature in non_numeric_columns_X_test.difference(non_numeric_columns_X):
    X_test[feature] = X_test[feature].astype('category').cat.codes

# Fill remaining missing values
X.fillna(-999, inplace=True)
X_test.fillna(-999, inplace=True)

# Verify that all data is numeric
print("X Data Types:")
print(X.dtypes)
print("\nX_test Data Types:")
print(X_test.dtypes)

X Data Types:
TransactionDT       int64
TransactionAmt    float64
ProductCD            int8
card1               int64
card2             float64
                   ...   
id_36                int8
id_37                int8
id_38                int8
DeviceType           int8
DeviceInfo          int16
Length: 432, dtype: object

X_test Data Types:
TransactionDT       int64
TransactionAmt    float64
ProductCD            int8
card1               int64
card2             float64
                   ...   
id-36                int8
id-37                int8
id-38                int8
DeviceType           int8
DeviceInfo          int16
Length: 432, dtype: object


In [19]:
# Give both feature matrices the same column spelling by turning every
# hyphen in a column label into an underscore (literal replace, no regex).
for frame in (X, X_test):
    frame.columns = frame.columns.str.replace('-', '_', regex=False)

# With the labels normalised, list whichever columns are still object dtype
# in each frame.
non_numeric_columns_X = X.select_dtypes(include=['object']).columns
non_numeric_columns_X_test = X_test.select_dtypes(include=['object']).columns

print(f"Non-numeric columns in X: {non_numeric_columns_X}")
print(f"Non-numeric columns in X_test: {non_numeric_columns_X_test}")

Non-numeric columns in X: Index([], dtype='object')
Non-numeric columns in X_test: Index([], dtype='object')


In [20]:
# Compare the two column indexes (labels and order). On a mismatch, report
# which training columns X_test lacks and which X_test columns are unknown
# to X; otherwise confirm the match.
if not X.columns.equals(X_test.columns):
    missing_cols = X.columns[~X.columns.isin(X_test.columns)].tolist()
    extra_cols = X_test.columns[~X_test.columns.isin(X.columns)].tolist()

    print(f"Missing columns in X_test: {missing_cols}")
    print(f"Extra columns in X_test: {extra_cols}")
else:
    print("Columns in X and X_test match.")

Columns in X and X_test match.


In [22]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# End-to-end cell: rebuild the feature matrices from the merged frames, encode
# them, fit a random forest, report validation AUC and write the submission.
# Relies on train_data and test_data from the earlier merge cells.

# Define features and target variable
X = train_data.drop(columns=['TransactionID', 'isFraud'])
y = train_data['isFraud']
X_test = test_data.drop(columns=['TransactionID'])

# Ensure consistency in column names (test identity columns use 'id-NN',
# training columns use 'id_NN').
X.columns = X.columns.str.replace('-', '_', regex=False)
X_test.columns = X_test.columns.str.replace('-', '_', regex=False)

# Identify non-numeric columns
non_numeric_columns_X = X.select_dtypes(include=['object']).columns
non_numeric_columns_X_test = X_test.select_dtypes(include=['object']).columns

print(f"Non-numeric columns in X: {non_numeric_columns_X}")
print(f"Non-numeric columns in X_test: {non_numeric_columns_X_test}")

# Convert categorical features to numeric.
# The category -> integer mapping is learned from the training column and then
# applied unchanged to the test column. Encoding each frame separately with
# astype('category').cat.codes numbers the categories independently, so the
# model would score test rows whose codes mean something different from the
# codes it was trained on. Training codes are identical to what the separate
# encoding produced; test values unseen in training become -1.
for feature in non_numeric_columns_X:
    train_encoded = X[feature].astype('category')
    if feature in X_test.columns:
        X_test[feature] = X_test[feature].astype(train_encoded.dtype).cat.codes
    X[feature] = train_encoded.cat.codes

# Fill remaining missing values with -999
X.fillna(-999, inplace=True)
X_test.fillna(-999, inplace=True)

# Check if columns in X_test match those in X
if not X.columns.equals(X_test.columns):
    missing_cols = [col for col in X.columns if col not in X_test.columns]
    extra_cols = [col for col in X_test.columns if col not in X.columns]

    if missing_cols:
        print(f"Missing columns in X_test: {missing_cols}")
    if extra_cols:
        print(f"Extra columns in X_test: {extra_cols}")
else:
    print("Columns in X and X_test match.")

# Split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Validate the model: column 1 of predict_proba is the probability of the
# positive class (isFraud == 1), which is what ROC AUC is computed from.
y_val_pred = model.predict_proba(X_val)[:, 1]
auc_score = roc_auc_score(y_val, y_val_pred)
print(f'Validation AUC Score: {auc_score}')

# Predict on the test set
test_predictions = model.predict_proba(X_test)[:, 1]

# Prepare submission DataFrame (one fraud probability per test TransactionID)
submission = pd.DataFrame({
    'TransactionID': test_data['TransactionID'],
    'isFraud': test_predictions
})

# Save to CSV
submission.to_csv('submissionFraud.csv', index=False)


Non-numeric columns in X: Index(['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain', 'M1',
       'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_12', 'id_15',
       'id_16', 'id_23', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_33',
       'id_34', 'id_35', 'id_36', 'id_37', 'id_38'],
      dtype='object')
Non-numeric columns in X_test: Index(['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain', 'M1',
       'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_12', 'id_15',
       'id_16', 'id_23', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_33',
       'id_34', 'id_35', 'id_36', 'id_37', 'id_38'],
      dtype='object')
Columns in X and X_test match.
Validation AUC Score: 0.9346783032634545
