In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression  # Updated import
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler  # Added StandardScaler for Logistic Regression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import numpy as np

# Load the CSV files into DataFrames for training and testing
train_data = pd.read_csv("/content/fraudTrain.csv")
test_data = pd.read_csv("/content/fraudTest.csv")

# Select specific columns for training and testing
selected_cols = ['merchant', 'cc_num', 'category', 'amt', 'gender', 'city_pop', 'dob', 'state','job','trans_num', 'merch_lat','merch_long', 'is_fraud']

X_train = train_data[selected_cols[:-1]]  # Features for training
y_train = train_data[selected_cols[-1]]  # Target for training

X_test = test_data[selected_cols[:-1]]  # Features for testing
y_test = test_data[selected_cols[-1]]  # Target for testing




In [None]:
if y_train.isnull().values.any():
    # Impute missing values in y_train
    target_imputer = SimpleImputer(strategy='most_frequent')
    y_train = target_imputer.fit_transform(pd.DataFrame(y_train))

# Impute missing values with mean for numeric columns
numeric_cols = X_train.select_dtypes(include='number').columns.tolist()
numeric_imputer = SimpleImputer(strategy='mean')
X_train[numeric_cols] = numeric_imputer.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = numeric_imputer.transform(X_test[numeric_cols])

# Encode categorical columns using OneHotEncoder with handle_unknown='ignore'
categorical_cols = X_train.select_dtypes(exclude='number').columns.tolist()
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', StandardScaler(), numeric_cols)  # Scaling numeric columns for Logistic Regression
    ],
    remainder='passthrough'
)

X_train_encoded = preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)

if y_test.isnull().values.any():
    # Impute missing values in y_test if there are any
    target_imputer = SimpleImputer(strategy='most_frequent')
    y_test = target_imputer.fit_transform(pd.DataFrame(y_test))

# Fit the classifier using X_train and y_train
clf = LogisticRegression(random_state=0)  # Logistic Regression Classifier
clf.fit(X_train_encoded, y_train)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[numeric_cols] = numeric_imputer.fit_transform(X_train[numeric_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[numeric_cols] = numeric_imputer.transform(X_test[numeric_cols])
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logi

In [None]:

print("Number of columns:", X_train.shape[1])
print("Number of features:", X_train_encoded.shape[1])

y_pred = clf.predict(X_test_encoded)
print(y_pred)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

print(len(X_train.columns))
print(X_train.columns)

unique_classes = np.unique(y_train).astype(str)
unique_classes_train = np.unique(y_train).astype(str)
print(unique_classes_train)

unique_classes_test = np.unique(y_test).astype(str)
print(unique_classes_test)


conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Number of columns: 12
Number of features: 1050783
[0 0 0 ... 0 0 0]
Accuracy: 0.995639882746496
Precision: 0.017361111111111112
Recall: 0.002331002331002331
F1-score: 0.004110152075626798
12
Index(['merchant', 'cc_num', 'category', 'amt', 'gender', 'city_pop', 'dob',
       'state', 'job', 'trans_num', 'merch_lat', 'merch_long'],
      dtype='object')
['0' '1']
['0' '1']
Confusion Matrix:
[[553291    283]
 [  2140      5]]
