In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM



# Re-read raw data and clean non-numeric issues
df = pd.read_csv('/content/data (5) (1) (1) (1) (1)(internship-data-1).csv')
df['time'] = pd.to_datetime(df['time'], errors='coerce')
df = df.dropna(subset=['time'])  # Drop rows where time couldn't be parsed

# Feature columns
features = [
    'Cyclone_Inlet_Gas_Temp',
    'Cyclone_Material_Temp',
    'Cyclone_Outlet_Gas_draft',
    'Cyclone_cone_draft',
    'Cyclone_Gas_Outlet_Temp',
    'Cyclone_Inlet_Draft'
]

# Convert all feature columns to numeric, coerce errors to NaN, then drop
for col in features:
    df[col] = pd.to_numeric(df[col], errors='coerce')

df = df.dropna(subset=features)
df = df.sort_values('time')

# Create rolling and diff features
for col in features:
    df[f'{col}_rolling_mean'] = df[col].rolling(window=5).mean()
    df[f'{col}_rolling_std'] = df[col].rolling(window=5).std()
    df[f'{col}_diff'] = df[col].diff()

df = df.dropna()  # Drop rows with NaN due to rolling/diff

X = df.drop(columns=['time'])

# Run anomaly detection models
iso = IsolationForest(contamination=0.02, random_state=0)
df['iso_pred'] = iso.fit_predict(X)

svm = OneClassSVM(nu=0.02, kernel='rbf', gamma='scale')
df['svm_pred'] = svm.fit_predict(X)

elliptic = EllipticEnvelope(contamination=0.02, random_state=0)
df['elliptic_pred'] = elliptic.fit_predict(df[features])

lof = LocalOutlierFactor(n_neighbors=20, contamination=0.02)
df['lof_pred'] = lof.fit_predict(X)

# Convert predictions: 1 = anomaly, 0 = normal
for col in ['iso_pred', 'svm_pred', 'elliptic_pred', 'lof_pred']:
    df[col] = df[col].apply(lambda x: 1 if x == -1 else 0)

df['anomaly_score'] = df[['iso_pred', 'svm_pred', 'elliptic_pred', 'lof_pred']].sum(axis=1)
df['is_anomaly'] = df['anomaly_score'] >= 2

# Save anomalies to CSV
anomalies_df = df[df['is_anomaly']]
anomalies_path = "/content/preh/anomalies_cleaned.csv"
anomalies_df.to_csv(anomalies_path, index=False)

# Save feature summary
summary_stats = df.describe().transpose()
summary_path = "/content/preh/feature_summary_cleaned.csv"
summary_stats.to_csv(summary_path)

# Generate plots for each feature with anomalies marked
plot_paths = []
for col in features:
    plt.figure(figsize=(15, 5))
    sns.lineplot(data=df, x='time', y=col, label=col)
    sns.scatterplot(data=df[df['is_anomaly']], x='time', y=col, color='red', label='Anomaly')
    plt.title(f'Anomaly Detection for {col}')
    plt.xticks(rotation=45)
    plt.tight_layout()
    img_path = f'/content/preh/{col}_anomaly_plot_cleaned.png'
    plt.savefig(img_path)
    plt.close()
    plot_paths.append(img_path)

plot_paths += [anomalies_path, summary_path]
plot_paths


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = pd.to_numeric(df[col], errors='coerce')


['/content/preh/Cyclone_Inlet_Gas_Temp_anomaly_plot_cleaned.png',
 '/content/preh/Cyclone_Material_Temp_anomaly_plot_cleaned.png',
 '/content/preh/Cyclone_Outlet_Gas_draft_anomaly_plot_cleaned.png',
 '/content/preh/Cyclone_cone_draft_anomaly_plot_cleaned.png',
 '/content/preh/Cyclone_Gas_Outlet_Temp_anomaly_plot_cleaned.png',
 '/content/preh/Cyclone_Inlet_Draft_anomaly_plot_cleaned.png',
 '/content/preh/anomalies_cleaned.csv',
 '/content/preh/feature_summary_cleaned.csv']