<a href="https://colab.research.google.com/github/prashantmalan/Chatbot/blob/main/DQ_20241014.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Remote CSV sources: the raw transaction records, and the table that assigns
# a type label to each transaction column (its 'Column' and 'Type' fields are
# read further down).
TRANSACTIONS_CSV_URL = 'https://raw.githubusercontent.com/prashantmalan/NSE/main/Pm_CFTC_2017649.csv'
COLUMN_TYPES_CSV_URL = 'https://raw.githubusercontent.com/prashantmalan/NSE/main/col_cat_5.csv'

# Fetch both files, in that order, with pandas' default CSV settings.
transaction_data, column_classification = (
    pd.read_csv(csv_url) for csv_url in (TRANSACTIONS_CSV_URL, COLUMN_TYPES_CSV_URL)
)

# Clean the data.
#
# The element-wise work is done column by column with `Series.map` instead of
# `DataFrame.applymap`, which is deprecated in recent pandas releases.


def _contains_semicolon(value):
    """Return True when the string form of *value* contains a semicolon."""
    return ';' in str(value)


# Translation table that deletes thousands separators and explicit plus signs.
_SEPARATOR_REMOVAL = str.maketrans('', '', ',+')


def _strip_separators(value):
    """Remove ',' and '+' from string cells; return any other value unchanged."""
    return value.translate(_SEPARATOR_REMOVAL) if isinstance(value, str) else value


# Drop every row in which at least one cell contains ';', then renumber the
# remaining rows from 0 so positional and label-based indexing agree.
rows_with_semicolon = transaction_data.apply(
    lambda column: column.map(_contains_semicolon)
).any(axis=1)
transaction_data = transaction_data[~rows_with_semicolon].reset_index(drop=True)

# Strip ',' and '+' from string cells in a single pass. Removing characters
# cannot introduce a ';', so the row filter above does not need to be repeated.
transaction_data = transaction_data.apply(lambda column: column.map(_strip_separators))

# Feature engineering.
#
# Map each column name to its type label from the classification table, then
# keep the columns labelled 'Amt' that are actually present in the data
# (classification-table order is preserved).
feature_categories = column_classification.set_index('Column')['Type'].to_dict()
numeric_features = [
    column
    for column, category in feature_categories.items()
    if category == 'Amt' and column in transaction_data.columns
]
if not numeric_features:
    # Fail here with a clear message rather than deep inside scikit-learn.
    raise ValueError(
        "None of the columns classified as 'Amt' are present in the transaction data"
    )

# Work on an explicit copy: the frame may be the result of a boolean-mask
# selection, and assigning columns to such a selection can raise pandas'
# SettingWithCopyWarning.
transaction_data = transaction_data.copy()

# The cleaning step only strips ',' and '+' from strings; it does not change
# their type. Convert the amount columns to real numbers here. Cells that
# still cannot be parsed become NaN and are filled by the imputer below.
for column in numeric_features:
    transaction_data[column] = pd.to_numeric(transaction_data[column], errors='coerce')

# Columns with no usable value at all have no mean to impute, and
# SimpleImputer's mean strategy discards such columns by default, which would
# shift the column positions that later code relies on. Fill them with the
# constant 0 instead.
numeric_frame = transaction_data[numeric_features]
all_nan_columns = numeric_frame.columns[numeric_frame.isna().all()].tolist()
if all_nan_columns:
    transaction_data[all_nan_columns] = 0

# Replace the remaining missing values with each column's mean.
imputer = SimpleImputer(strategy='mean')
imputed_data = imputer.fit_transform(transaction_data[numeric_features])

# Standardise each feature; column i of `numeric_data` corresponds to
# numeric_features[i].
scaler = StandardScaler()
numeric_data = scaler.fit_transform(imputed_data)

# Fit one Isolation Forest per numeric feature and record its prediction for
# every row. `fit_predict` returns labels rather than continuous scores: the
# plotting code treats 1 as normal and -1 as anomalous.
feature_labels = {}
for column_position, feature in enumerate(numeric_features):
    # Column `column_position` of the scaled matrix belongs to this feature;
    # the estimator expects a 2-D (n_rows, 1) array.
    feature_data = numeric_data[:, column_position].reshape(-1, 1)

    # contamination=0.01 sets the expected share of outliers per feature;
    # the fixed random_state keeps runs reproducible.
    iso_forest = IsolationForest(contamination=0.01, random_state=42)
    feature_labels[feature] = iso_forest.fit_predict(feature_data)

# Build the frame in one step rather than inserting columns one at a time,
# which fragments the DataFrame when there are many features.
anomaly_scores_df = pd.DataFrame(feature_labels, index=transaction_data.index)

# Carry over the descriptive columns shown in the plot hover text.
anomaly_scores_df['Product name'] = transaction_data['Product name']
anomaly_scores_df['Action type'] = transaction_data['Action type']

# Lay the features out on a grid that is four subplots wide, with as many rows
# as are needed to hold them all.
n_features = len(numeric_features)
n_cols = 4
n_rows = int(np.ceil(n_features / n_cols))

# A fixed 0.05 gap between rows is fine for small grids, but plotly rejects a
# vertical spacing larger than 1 / (rows - 1), and well before that limit the
# gaps leave almost no height for the plots themselves. Cap the gap with
# plotly's own default formula (0.3 / rows) so tall grids stay valid and
# readable. max(..., 1) only guards the division; an empty grid is still
# rejected by make_subplots.
row_gap = min(0.05, 0.3 / max(n_rows, 1))

# Create the grid; each subplot is titled with its feature name.
fig = make_subplots(
    rows=n_rows,
    cols=n_cols,
    subplot_titles=numeric_features,
    horizontal_spacing=0.05,
    vertical_spacing=row_gap,
)

# Hover text and the columns that feed its customdata[0] / customdata[1] slots.
hover_text = 'Product name: %{customdata[0]}<br>Action type: %{customdata[1]}<br>Value: %{y}<extra></extra>'
hover_columns = ['Product name', 'Action type']

# (label, trace name, marker colour): normal points are drawn first, then the
# anomalous ones on top of them.
trace_styles = ((1, 'Normal', 'green'), (-1, 'Anomalous', 'red'))

for position, feature in enumerate(numeric_features):
    # Fill the grid row by row; plotly's row/col numbers are 1-based.
    grid_row, grid_col = divmod(position, n_cols)

    for label, trace_name, marker_color in trace_styles:
        # Rows whose label for this feature matches the current trace.
        selected = anomaly_scores_df[feature] == label

        fig.add_trace(
            go.Scatter(
                x=anomaly_scores_df.index[selected],
                y=transaction_data[feature][selected],
                mode='markers',
                name=trace_name,
                customdata=anomaly_scores_df[selected][hover_columns],
                hovertemplate=hover_text,
                marker=dict(size=8, color=marker_color, opacity=0.7),
            ),
            row=grid_row + 1,
            col=grid_col + 1,
        )

# Figure-wide presentation settings: fixed 1500x1200 canvas (autosize off),
# no legend, small font, and hover that picks the closest point.
figure_layout = {
    'title': 'Anomaly Scores for Each Feature',
    'showlegend': False,
    'hovermode': 'closest',
    'font': {'size': 10},
    'autosize': False,
    'width': 1500,
    'height': 1200,
}
fig.update_layout(**figure_layout)

# Per-subplot axis titles are not set: each subplot is already labelled with
# its feature name through `subplot_titles`. The loop that used to sit here
# only computed grid positions for disabled update calls, so it was removed.

# Render the figure.
fig.show()