In [1]:
# Import necessary libraries
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
# Load the time series data from a CSV file.
# The location can be overridden with the ANOMA_DATA_CSV environment variable so
# the notebook is not tied to one machine; the default is the original path.
DATA_CSV = os.environ.get("ANOMA_DATA_CSV", "C:/Users/admin/Downloads/Anoma_data.csv")
data = pd.read_csv(DATA_CSV)

In [None]:
# Exploratory Data Analysis (EDA)
# Summary statistics as produced by DataFrame.describe()
print(data.describe())
# Number of missing values in each column
print(data.isnull().sum())

# Distribution of the target variable 'y'.
# The column is selected by name through `x=`: on seaborn >= 0.12 the first
# positional parameter of countplot is `data`, so a Series passed positionally
# is not treated as the variable to count.
fig, ax = plt.subplots()
sns.countplot(data=data, x='y', ax=ax)
ax.set(title='Target class distribution', xlabel='y', ylabel='count')
plt.show()

# Pairwise relationships between the columns, coloured by the target variable
sns.pairplot(data, hue='y')
plt.show()


In [None]:
# Feature engineering and feature selection

In [5]:
# Convert the 'date' column to pandas datetime values
parsed_dates = pd.to_datetime(data['date'])
data['date'] = parsed_dates

In [8]:
# Model selection: a random forest classifier.
# The seed is fixed so repeated runs build the same forest.
rf_params = {'random_state': 42}
model = RandomForestClassifier(**rf_params)

In [None]:
# Choose the metrics for the model evaluation (accuracy on the held-out test set is reported in the assessment cells)

In [10]:
# Train/test split
# Target is the 'y' column; predictors are every other column except 'date'.
y = data['y']
X = data.drop(columns=['y', 'date'])
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): train_test_split shuffles rows by default. If row order is
# temporal, confirm that a random (non-chronological) split is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Model selection, training, prediction and assessment

In [11]:
# Fit the classifier on the training portion of the data
model.fit(X=X_train, y=y_train)


In [12]:
# Predict labels for the held-out test rows
y_pred = model.predict(X=X_test)

In [13]:
# Model Assessment
# Overall fraction of test rows whose predicted label matches the true label
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')
# Accuracy alone can look high when one class dominates the data, so also
# report per-class precision, recall and F1 for the same predictions.
print(classification_report(y_test, y_pred))


Accuracy: 99.76%


In [None]:
# Hyperparameter tuning / model improvement

In [14]:
# Hyperparameter tuning with an exhaustive grid search (GridSearchCV)
# Candidate values: 3 forest sizes x 3 depth limits, each scored with 5-fold CV
param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}
# n_jobs=-1 runs the candidate fits in parallel on all available cores
grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# With the default refit=True, GridSearchCV has already refit the best
# parameter combination on the whole training set, so best_estimator_ is
# ready to use and does not need to be fitted again.
best_model = grid_search.best_estimator_
best_y_pred = best_model.predict(X_test)

In [15]:
# Model Assessment after tuning
# Overall fraction of test rows the tuned model labels correctly
best_accuracy = accuracy_score(y_test, best_y_pred)
print(f'Best Model Accuracy: {best_accuracy * 100:.2f}%')
# Accuracy alone can look high when one class dominates the data, so also
# report per-class precision, recall and F1 for the tuned model.
print(classification_report(y_test, best_y_pred))


Best Model Accuracy: 99.76%


In [None]:
# Model deployment plan: persist the tuned model to disk for later use

In [16]:
# Persist the tuned model to disk (joblib is imported in the top import cell).
# NOTE: joblib files are pickle-based; only load them from trusted sources.
joblib.dump(best_model, 'anomaly_detection_model.pkl')


['anomaly_detection_model.pkl']