<a href="https://colab.research.google.com/github/pradeepsai7/OIBSIP/blob/main/Project_4_Proposal_Level_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

DATASET-1

In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import IsolationForest

# Dataset location. The Colab sample CSV is used as a stand-in here; point
# this at a different CSV if another dataset is intended.
file = "sample_data/california_housing_train.csv"

df = pd.read_csv(file)

# Normalise the headers (lower-case, surrounding whitespace removed) so the
# label lookup below does not depend on how the CSV spelled them.
df.columns = df.columns.str.lower().str.strip()

# Candidate target columns, in priority order: the fraud-style label names
# come first, with 'median_house_value' as the fallback that matches the
# housing sample file.
possible_labels = ["class", "fraud", "is_fraud", "median_house_value"]

# Take the first candidate that is actually present, or None if none is.
label = next((name for name in possible_labels if name in df.columns), None)

if label is None:
    # Nothing usable as a target: stop here rather than fail later.
    raise ValueError("No suitable label column found in the dataset.")

# Fill gaps in the int64/float64 columns with each column's own median.
# The selection is made on the whole frame, so a numeric label column is
# imputed as well.
num_cols = df.select_dtypes(include=["int64", "float64"]).columns
column_medians = df[num_cols].median()
df[num_cols] = df[num_cols].fillna(column_medians)

# Target is the detected label column; every other column is a feature.
y = df[label]
X = df.drop(columns=[label])

# Standardise the features. The scaler is fitted on all rows of X.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Unsupervised outlier pass over the scaled features; random_state pins the
# forest so repeated runs flag the same rows.
iso = IsolationForest(contamination=0.05, random_state=42)
raw_flags = iso.fit_predict(X_scaled)

# Recode the forest's 1 / -1 output as 0 / 1, so that 1 marks the rows that
# came back as -1.
df["anomaly"] = pd.Series(raw_flags, index=df.index).map({1: 0, -1: 1})

# Hold out 20% of the rows; random_state pins the shuffle so the split is
# reproducible. NOTE(review): X_test / y_test are not used in the code shown
# here -- confirm whether an evaluation step is intended.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000)

# LogisticRegression needs a discrete target, so derive a 0/1 target from y.
if y.nunique() <= 2:
    # The label is already two-class (e.g. a 0/1 fraud flag). Thresholding it
    # at its median would send every row to 0 whenever the larger value is
    # the majority class, and fit() would then fail on single-class data.
    # Map the larger label value to 1 instead; this equals `y > median`
    # whenever that expression produced two classes.
    median_y = None  # no threshold is involved for an already-binary label
    y_binary = (y == y.max()).astype(int)
else:
    # Continuous / multi-valued label (e.g. 'median_house_value'): split at
    # the median so the model can run. This is a workaround to obtain a
    # classification target, not a meaningful fraud label.
    median_y = y.median()
    y_binary = (y > median_y).astype(int)

# y_train keeps the original row labels, so its index selects exactly the
# binarized targets that correspond to the rows in X_train.
model.fit(X_train, y_binary.loc[y_train.index])

# Score every row with the fitted classifier.
all_predictions = model.predict(X_scaled)
df["fraud_prediction"] = all_predictions

# Re-score only the first 25 scaled rows as a small "stream" batch.
stream = X_scaled[:25]
stream_predictions = model.predict(stream)

# .loc slices by label and includes the end label, so ":24" addresses the
# rows labelled 0..24 -- the same 25 rows as the stream under the default
# integer index produced by read_csv. All other rows are left unset in
# this column.
df.loc[:24, "realtime_prediction"] = stream_predictions

# Persist the frame, including the added prediction columns, without
# writing the index.
df.to_csv("fraud_detection_results.csv", index=False)