In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from scipy.stats import zscore
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split

from pmdarima import auto_arima
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller


In [None]:
# Load the workbook into the notebook-wide frame `df`; every later cell reads
# from (and several mutate) this frame.
df = pd.read_excel(io="data.xlsx")

# Report (rows, columns) as a quick sanity check that the load succeeded.
loaded_shape = df.shape
print("Dataset loaded with shape:", loaded_shape)


In [None]:
# Two side-by-side exploratory panels drawn through explicit Axes handles:
# left = frequency of each 'type' category, right = spread of 'amount'.
fig, (type_ax, amount_ax) = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: bar chart of category counts for 'type'.
type_counts = df["type"].value_counts()
type_counts.plot(kind='bar', color='skyblue', ax=type_ax)
type_ax.set_title("Distribution of 'type'")
type_ax.set_xlabel("Type")
type_ax.set_ylabel("Count")

# Right panel: box plot of 'amount'.
sns.boxplot(x=df['amount'], ax=amount_ax)
amount_ax.set_title("Box Plot of 'amount'")

fig.tight_layout()
plt.show()


In [None]:
# Make every column numeric and NaN-free so the later modelling cells can
# consume the frame directly:
#   * object columns  -> missing values become the string "None", then the
#                        column is label-encoded to integer codes;
#   * all other columns -> missing values become the sentinel -999.
#
# The filled column is assigned back to `df[col]` instead of calling
# `df[col].fillna(..., inplace=True)`: the in-place call operates on a
# selection of the frame, which emits a FutureWarning on pandas 2.x and does
# not update `df` when Copy-on-Write is enabled.
#
# NOTE(review): the -999 sentinel is also written into "newbalanceOrig" (the
# series modelled later) if that column has gaps, which would distort the
# stationarity test and the forecasts — confirm the target has no missing
# values or handle it separately.
for col in df.columns:
    if df[col].dtype == "object":
        filled = df[col].fillna("None").astype(str)
        # A fresh encoder per column: codes are only meaningful within a column.
        df[col] = LabelEncoder().fit_transform(filled)
    else:
        df[col] = df[col].fillna(-999)

print("Preprocessed Data:\n", df.head())


In [None]:
def check_stationarity(series, alpha=0.05):
    """Return True when the Augmented Dickey-Fuller test rejects a unit root.

    Parameters
    ----------
    series : pandas.Series
        Series to test. Missing values are dropped before the test is run.
    alpha : float, default 0.05
        Significance level. The series is reported as stationary when the
        test's p-value is less than or equal to this threshold.

    Returns
    -------
    bool
        True if the p-value is <= ``alpha``, otherwise False.
    """
    # adfuller returns a tuple; element 1 is the p-value.
    p_value = adfuller(series.dropna())[1]
    return p_value <= alpha

# Column treated as the time series by every later cell in this notebook.
target_col = "newbalanceOrig"

if target_col in df.columns:
    stationary = check_stationarity(df[target_col])
    print(f"{target_col} is {'Stationary' if stationary else 'Non-Stationary'}")

    if not stationary:
        # Replace the target with its first difference. No `.dropna()` on the
        # right-hand side: column assignment re-aligns on the index, so the
        # leading NaN would be re-introduced anyway.
        df[target_col] = df[target_col].diff()

        # Drop the rows that now contain NaN (the first row, produced by
        # diff) and rebuild a contiguous 0-based index. Without the reset the
        # frame keeps labels 1..N-1, which shifts every later plot that mixes
        # index labels with positional x-values.
        df = df.dropna().reset_index(drop=True)

        # NOTE(review): the later cells fit auto_arima and a SARIMAX with
        # d=1, D=1 on this already-differenced column, so differencing may be
        # applied more than once and forecasts are on the differenced scale —
        # confirm that this is intended.


In [None]:
print("\nAuto-ARIMA fitting...")
# Search for ARIMA orders on the target series; trace=True prints each
# candidate model as it is evaluated.
arima_model = auto_arima(df[target_col], trace=True, error_action='ignore', suppress_warnings=True)
print(arima_model.summary())

# Forecast horizon (number of future steps). Also reused by the SARIMAX cell.
n_periods = 10
forecast, conf_int = arima_model.predict(n_periods=n_periods, return_conf_int=True)

# Plot history and forecast on one positional x-axis. Plotting the Series
# directly would use its index labels, while the forecast is placed at
# positions len..len+n_periods-1; the two only line up when the index happens
# to be 0..N-1. Using positions for both keeps the forecast attached to the
# end of the history regardless of the index.
n_obs = len(df[target_col])
history_x = np.arange(n_obs)
forecast_x = np.arange(n_obs, n_obs + n_periods)
conf_bounds = np.asarray(conf_int)  # column 0 = lower bound, column 1 = upper bound

plt.figure(figsize=(10, 4))
plt.plot(history_x, df[target_col].to_numpy(), label='Original Series')
plt.plot(forecast_x, np.asarray(forecast), label='Forecast', color='red')
plt.fill_between(forecast_x, conf_bounds[:, 0], conf_bounds[:, 1], color='pink', alpha=0.3)
plt.title("ARIMA Forecast")
plt.legend()
plt.show()


In [None]:
print("\nSARIMAX Model fitting...")
# Seasonal ARIMA with fixed orders: (p, d, q) = (1, 1, 1) and seasonal
# (P, D, Q, s) = (1, 1, 1, 12). Stationarity and invertibility constraints on
# the estimated parameters are switched off.
# NOTE(review): the seasonal period of 12 is hard-coded — confirm it matches
# the sampling frequency of the data.
sarimax_model = SARIMAX(
    df[target_col],
    order=(1, 1, 1),
    seasonal_order=(1, 1, 1, 12),
    enforce_stationarity=False,
    enforce_invertibility=False
)
sarimax_result = sarimax_model.fit(disp=False)
print(sarimax_result.summary())

# n_periods is the forecast horizon defined in the Auto-ARIMA cell.
forecast_sarimax = sarimax_result.forecast(steps=n_periods)

# Plot history and forecast on one positional x-axis. Plotting the Series
# directly would use its index labels, while the forecast is placed at
# positions len..len+n_periods-1; using positions for both keeps them aligned
# regardless of the frame's index.
n_obs = len(df[target_col])
history_x = np.arange(n_obs)
forecast_x = np.arange(n_obs, n_obs + n_periods)

plt.figure(figsize=(10, 4))
plt.plot(history_x, df[target_col].to_numpy(), label='Original Series')
plt.plot(forecast_x, np.asarray(forecast_sarimax), label='SARIMAX Forecast', color='green')
plt.title("SARIMAX Forecast")
plt.legend()
plt.show()


In [None]:
print("\nZ-Score Outlier Detection:")
# Numeric columns to score. Columns this notebook derives itself (the
# "<col>_zscore" columns written below and the "anomaly" flag written by the
# Isolation Forest cell) are excluded, so re-running this cell does not
# z-score its own output or feed earlier predictions back in as features.
all_numeric = df.select_dtypes(include=[np.number]).columns
is_source_col = np.array(
    [not str(name).endswith("_zscore") and name != "anomaly" for name in all_numeric],
    dtype=bool,
)
numeric_cols = all_numeric[is_source_col]

for col in numeric_cols:
    zscore_col = f"{col}_zscore"
    df[zscore_col] = zscore(df[col])
    # Count rows more than 3 standard deviations from the column mean.
    # NaN z-scores compare as False and are therefore not counted.
    n_outliers = int((df[zscore_col].abs() > 3).sum())
    print(f"Outliers in {col}: {n_outliers}")


In [None]:
print("\nIsolation Forest Anomaly Detection:")
# Feature matrix: the numeric columns selected in the z-score cell, minus
# anything this notebook derived itself (z-score columns and a previously
# computed "anomaly" flag), so a re-run never trains on its own output.
derived_cols = [f"{col}_zscore" for col in numeric_cols] + ["anomaly"]
features = df[numeric_cols].drop(columns=derived_cols, errors='ignore')

# contamination=0.05 asks the model to flag roughly 5% of rows; the fixed
# random_state makes the result reproducible.
iso_model = IsolationForest(n_estimators=100, contamination=0.05, random_state=42)
df['anomaly'] = iso_model.fit_predict(features)  # -1 = anomaly, 1 = normal

print("Anomalies detected:")
print(df['anomaly'].value_counts())

# Plot the series and the flagged points on the same positional x-axis.
# Using index labels for the markers while the line uses positions would
# misplace the markers whenever the frame's index is not 0..N-1.
series_values = df[target_col].to_numpy()
anomaly_positions = np.flatnonzero((df['anomaly'] == -1).to_numpy())

plt.figure(figsize=(10, 4))
plt.plot(series_values, label='Data')
plt.scatter(anomaly_positions, series_values[anomaly_positions], color='red', label='Anomaly')
plt.legend()
plt.title("Anomaly Detection in Time Series")
plt.show()
