<a href="https://colab.research.google.com/github/prashantmalan/NSE/blob/main/DQ_20241014.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def read_data(transaction_url, classification_url):
    """Load the transaction table and the column-classification table.

    Args:
        transaction_url: Location of the transactions CSV; anything
            ``pandas.read_csv`` accepts (URL, path or file-like object).
        classification_url: Location of the column-classification CSV,
            same accepted forms.

    Returns:
        A ``(transaction_data, column_classification)`` tuple of DataFrames,
        in that order.
    """
    # The transactions file is read first, then the classification file.
    return pd.read_csv(transaction_url), pd.read_csv(classification_url)

def preprocess_data(transaction_data, column_classification):
    """Clean the raw transactions and select the amount columns to analyse.

    Cleaning steps, in order:

    1. Drop every row in which the string form of any cell contains ';'.
    2. Renumber the index from 0.
    3. Remove ',' and '+' from every string cell. Values stay strings; no
       numeric conversion happens here.
    4. Fill amount columns that are entirely NaN with 0.

    The frame passed in is left untouched; a new frame is returned.

    Args:
        transaction_data: DataFrame of raw transactions.
        column_classification: DataFrame with a 'Column' column holding
            column names and a 'Type' column holding their category.
            Columns whose category is 'Amt' are treated as numeric features.

    Returns:
        A ``(cleaned, numeric_features)`` tuple: the cleaned DataFrame and
        the list of 'Amt' column names that actually exist in it, in the
        order they appear in ``column_classification``.
    """
    def _has_semicolon(value):
        return ';' in str(value)

    def _strip_separators(value):
        # Only genuine strings are edited; numbers and NaN pass through.
        if isinstance(value, str):
            return value.replace(',', '').replace('+', '')
        return value

    # Per-column Series.map replaces DataFrame.applymap, which has been
    # deprecated since pandas 2.1.
    malformed = transaction_data.apply(
        lambda column: column.map(_has_semicolon)
    ).any(axis=1)

    # Filtering followed by a non-inplace reset_index yields an independent
    # frame, so the assignments below never write into a slice of the input.
    cleaned = transaction_data.loc[~malformed].reset_index(drop=True)

    # A single pass strips both separators. Removing ',' and '+' cannot
    # create a ';', so the ';' filter does not need to be repeated afterwards.
    cleaned = cleaned.apply(lambda column: column.map(_strip_separators))

    # Feature Engineering
    feature_categories = column_classification.set_index('Column')['Type'].to_dict()
    numeric_features = [
        name
        for name, category in feature_categories.items()
        if category == 'Amt' and name in cleaned.columns
    ]

    # Amount columns with no values at all are filled with 0.
    all_nan_columns = [
        name for name in numeric_features if cleaned[name].isna().all()
    ]
    if all_nan_columns:
        cleaned[all_nan_columns] = 0

    return cleaned, numeric_features

def predict_anomalies(transaction_data, numeric_features):
    """Run a separate isolation-forest pass over each numeric feature.

    Every feature is processed on its own as a single-column frame:
    missing values are mean-imputed, the column is standardised, and an
    IsolationForest (contamination 0.01, random_state 42) is fitted and
    used to predict on that same column.

    Args:
        transaction_data: Cleaned transactions. Must contain every name in
            ``numeric_features`` plus 'Product name' and 'Action type'.
        numeric_features: Names of the columns to score.

    Returns:
        DataFrame sharing ``transaction_data``'s index, with one column of
        ``fit_predict`` output per feature followed by the 'Product name'
        and 'Action type' columns copied from ``transaction_data``.
    """
    # One pipeline object is reused; fit_predict refits it for each feature.
    detector = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('isolation_forest', IsolationForest(contamination=0.01, random_state=42)),
    ])

    labels_by_feature = {
        name: detector.fit_predict(transaction_data[[name]].copy())
        for name in numeric_features
    }
    result = pd.DataFrame(labels_by_feature, index=transaction_data.index)

    # Descriptive columns are carried along for later display.
    for descriptor in ('Product name', 'Action type'):
        result[descriptor] = transaction_data[descriptor]
    return result

def plot_anomalies(transaction_data, anomaly_scores_df, numeric_features):
    """Show a grid of scatter plots, one per feature, coloured by label.

    Each subplot plots the feature's values against the row index. Rows
    labelled 1 in ``anomaly_scores_df`` are drawn in green ('Normal') and
    rows labelled -1 in red ('Anomalous'). Hovering a point shows its
    product name, action type and value.

    Args:
        transaction_data: DataFrame holding the feature values.
        anomaly_scores_df: DataFrame with one 1 / -1 label column per
            feature plus 'Product name' and 'Action type' columns. It is
            used as a boolean mask on ``transaction_data``, so both frames
            are expected to share the same index.
        numeric_features: Names of the features to plot. When empty,
            nothing is drawn and the function returns immediately.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    n_features = len(numeric_features)
    if n_features == 0:
        # make_subplots rejects rows=0, so there is nothing sensible to draw.
        return

    n_cols = 4
    n_rows = int(np.ceil(n_features / n_cols))

    # plotly requires vertical_spacing <= 1 / (rows - 1); keep the usual
    # 0.05 gap and only shrink it when the grid is tall enough to need it.
    vertical_spacing = 0.05
    if n_rows > 1:
        vertical_spacing = min(vertical_spacing, 1.0 / (n_rows - 1))

    fig = make_subplots(
        rows=n_rows,
        cols=n_cols,
        subplot_titles=numeric_features,
        horizontal_spacing=0.05,
        vertical_spacing=vertical_spacing,
    )

    hover_columns = ['Product name', 'Action type']
    hover_template = (
        'Product name: %{customdata[0]}<br>'
        'Action type: %{customdata[1]}<br>'
        'Value: %{y}<extra></extra>'
    )
    # (label value, trace name, marker colour); normal points are added
    # first so anomalous points are drawn on top of them.
    label_styles = ((1, 'Normal', 'green'), (-1, 'Anomalous', 'red'))

    for i, feature in enumerate(numeric_features):
        row = i // n_cols + 1
        col = i % n_cols + 1
        for label, trace_name, colour in label_styles:
            selected = anomaly_scores_df[feature] == label
            fig.add_trace(go.Scatter(
                x=anomaly_scores_df.index[selected],
                y=transaction_data[feature][selected],
                mode='markers',
                name=trace_name,
                customdata=anomaly_scores_df[selected][hover_columns],
                hovertemplate=hover_template,
                marker=dict(size=8, color=colour, opacity=0.7)
            ), row=row, col=col)

    fig.update_layout(
        title='Anomaly Scores for Each Feature',
        showlegend=False,
        hovermode='closest',
        font=dict(size=10),
        autosize=False,
        width=1500,
        height=1200
    )
    fig.show()

# Example usage: run the full load -> clean -> score -> plot sequence on
# the two sample CSV files hosted on GitHub.
TRANSACTION_URL = 'https://raw.githubusercontent.com/prashantmalan/NSE/main/Pm_CFTC_2017649.csv'
CLASSIFICATION_URL = 'https://raw.githubusercontent.com/prashantmalan/NSE/main/col_cat_5.csv'

transaction_data, column_classification = read_data(TRANSACTION_URL, CLASSIFICATION_URL)
transaction_data, numeric_features = preprocess_data(
    transaction_data, column_classification
)
anomaly_scores_df = predict_anomalies(transaction_data, numeric_features)
plot_anomalies(transaction_data, anomaly_scores_df, numeric_features)