In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.impute import SimpleImputer

# Load the dataset
df = pd.read_csv('financial_anomaly_data.csv')

# Handle missing values: replace NaN in 'Amount' with the column mean (kept simple on purpose)
imputer = SimpleImputer(strategy='mean')
df['Amount'] = imputer.fit_transform(df[['Amount']])

# Select relevant features for anomaly detection
features = ['Amount']

# Initialize and fit the Isolation Forest model.
# - contamination: expected share of anomalies; adjust based on your dataset.
# - random_state: Isolation Forest builds randomised trees, so without a fixed
#   seed a different set of rows is flagged on every Restart & Run All.
model = IsolationForest(contamination=0.05, random_state=42)
model.fit(df[features])

# Predict anomalies (candidate fraudulent activities).
# The filter below relies on predict() labelling anomalies as -1 and inliers as 1.
df['Anomaly'] = model.predict(df[features])

# Display flagged anomalies
fraudulent_transactions = df[df['Anomaly'] == -1]
print(fraudulent_transactions)




               Timestamp TransactionID AccountID    Amount   Merchant  \
3       01-01-2023 08:03       TXN1438      ACC6     87.87  MerchantE   
4       01-01-2023 08:04       TXN1338      ACC6    716.56  MerchantI   
7       01-01-2023 08:07        TXN841      ACC7   1336.36  MerchantI   
15      01-01-2023 08:15         TXN65      ACC9  98688.82  MerchantH   
16      01-01-2023 08:16        TXN648      ACC8   1417.84  MerchantC   
...                  ...           ...       ...       ...        ...   
216856  31-05-2023 22:16       TXN1938     ACC10  98388.15  MerchantI   
216904  31-05-2023 23:04       TXN1075      ACC9  98376.93  MerchantJ   
216905  31-05-2023 23:05       TXN1099      ACC9   2464.21  MerchantE   
216932  31-05-2023 23:32        TXN582      ACC5  97969.69  MerchantA   
216934  31-05-2023 23:34       TXN1273     ACC11  99077.99  MerchantG   

       TransactionType       Location  Anomaly  
3             Purchase         London       -1  
4             Purchase   

**Why this approach and evaluation over other methods**:
I chose the Isolation Forest algorithm because it is effective at detecting anomalies, scales to high-dimensional datasets, and does not rely on assumptions about the distribution of normal data. Alternative methods that could be evaluated include One-Class SVM, Local Outlier Factor, and clustering-based approaches.

**Features and Feature Engineering:**
Features considered may include 'Amount', 'TransactionType', and potentially 'Merchant'. Feature engineering can involve scaling numerical features, encoding categorical features, or creating new features based on domain knowledge.

In [None]:
# Example of one-hot encoding categorical features
categorical_columns = ['TransactionType', 'Merchant']

# Resolve each requested name to the actual column label, ignoring case.
# pd.get_dummies matches labels exactly, so a purely case-insensitive check
# is not enough: the real label has to be passed on.
actual_by_lower = {str(label).lower(): label for label in df.columns}

columns_to_encode = []
for col in categorical_columns:
    actual = actual_by_lower.get(col.lower())
    if actual is None:
        # Report the problem and skip the column instead of letting
        # get_dummies fail with a KeyError further down.
        print(f"Column '{col}' not found in the dataset.")
    else:
        columns_to_encode.append(actual)

# Encode only the columns that are really present; with nothing to encode,
# df_encoded is simply a copy of df so later cells can still rely on it.
if columns_to_encode:
    df_encoded = pd.get_dummies(df, columns=columns_to_encode)
else:
    df_encoded = df.copy()


## **Predicting Spend for All Transaction Types in June:**

In [None]:
# Parse the raw 'Timestamp' strings into a datetime column named 'timestamp'.
# The values are day-first (e.g. '31-05-2023 22:16'). Without an explicit format,
# ambiguous dates such as '06-01-2023' can be read month-first (as 1 June instead
# of 6 January), which puts rows in the wrong month.
df['timestamp'] = pd.to_datetime(df['Timestamp'], format='%d-%m-%Y %H:%M')

# Total observed spend per transaction type for transactions dated in June.
# NOTE(review): this sums recorded transactions; it is not a forecast. The rows
# shown earlier run from 01-01-2023 to 31-05-2023, so this may well be empty
# once dates are parsed correctly - confirm against the full dataset.
is_june = df['timestamp'].dt.month == 6
june_transactions = df[is_june]
monthly_spend = june_transactions.groupby('TransactionType')['Amount'].sum()
print(monthly_spend)


TransactionType
Purchase      1.179190e+08
Transfer      1.207762e+08
Withdrawal    1.188524e+08
Name: Amount, dtype: float64


## **Testing Model Effectiveness**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# NOTE: 'Anomaly' holds the labels that the Isolation Forest itself assigned,
# not independently verified fraud labels. The report below therefore measures
# how consistently a model trained on 80% of the data reproduces those labels
# on the remaining 20%. It is a stability check, not detection accuracy.
X_train, X_test, y_train, y_test = train_test_split(df[features], df['Anomaly'], test_size=0.2, random_state=42)

# Fit a separate estimator so the full-data `model` is not silently replaced
# by one trained on the split; seed it so the report is reproducible.
eval_model = IsolationForest(contamination=model.contamination, random_state=42)
eval_model.fit(X_train)
predictions = eval_model.predict(X_test)

print(classification_report(y_test, predictions))




              precision    recall  f1-score   support

          -1       0.95      0.95      0.95      2141
           1       1.00      1.00      1.00     41348

    accuracy                           0.99     43489
   macro avg       0.97      0.97      0.97     43489
weighted avg       0.99      0.99      0.99     43489

