In [1]:
import joblib
import numpy as np
import pandas as pd
from scipy.stats import zscore
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Step 1: Read data from output.txt
file_path = "../output.txt"

# Every line of the file becomes one row of whitespace-separated floats.
with open(file_path, "r") as file:
    rows = [[float(token) for token in line.split()] for line in file]

# Stack the parsed rows into a single NumPy array.
data = np.array(rows)


In [3]:
# Step 2 (disabled alternative): Create windowed data (sliding window of `window_size` samples)
# features = []
# labels = []
# window_size = 3

# for category_label, category_data in enumerate(data):
#     for i in range(len(category_data) - window_size + 1):  # Sliding window
#         window = category_data[i : i + window_size]  # Take window_size consecutive samples
#         features.append([
#             np.mean(window),  # Mean
#             np.median(window),  # Median
#             np.var(window),  # Variance
#             np.min(window),  # Min
#             np.max(window)   # Max
#         ])
#         labels.append(category_label-1)  # Assign category label

In [4]:
# Step 2: Flatten data (each value becomes its own single-feature sample)
# One [value] feature row per number, walking the rows of `data` in order.
features = [[value] for category_data in data for value in category_data]

# The label of a sample is the index of the row it came from, shifted by -1
# (so the first row of `data` is category -1, the second is 0, and so on).
labels = [
    category_label - 1
    for category_label, category_data in enumerate(data)
    for _ in category_data
]

# Assemble the feature column and its category into one DataFrame.
df = pd.DataFrame(features, columns=["Value"]).assign(Category=labels)

In [5]:
# Convert to DataFrame
# df = pd.DataFrame(features, columns=["Mean", "Median", "Variance", "Min", "Max"])
# df["Category"] = labels


In [6]:
# Step 3: Remove outliers, then make a stratified train/test split
X = df.drop(columns=["Category"])
y = df["Category"]

# Absolute z-score of every feature column; `zscore` is imported together with
# the other dependencies in the notebook's import cell.
z_scores = np.abs(X.apply(zscore))

# A row is kept only if all of its features have |z| < 3. The mask is computed
# once so that X and y are guaranteed to be filtered by exactly the same rows.
inlier_mask = (z_scores < 3).all(axis=1)
X_filtered, y_filtered = X[inlier_mask], y[inlier_mask]

# Hold out 20% of the filtered samples for testing; stratifying on the label
# keeps the class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X_filtered,
    y_filtered,
    test_size=0.2,
    random_state=42,
    stratify=y_filtered,
)

In [7]:
# Step 4: Normalize Features (Optional for Decision Tree)
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both the training and the test split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [8]:
# Step 5: Configure the Decision Tree (it is fitted in the next cell)
tree_params = {
    "max_depth": 5,            # cap on how deep the tree may grow
    "min_samples_split": 10,   # a node needs at least 10 samples to be split
    "min_samples_leaf": 5,     # every leaf must keep at least 5 samples
    "max_features": "sqrt",    # features considered per split: sqrt(n_features)
    "ccp_alpha": 0.01,         # cost-complexity pruning strength
    "random_state": 42,        # fixed seed for reproducible trees
}
clf_dt = DecisionTreeClassifier(**tree_params)
# X = scaler.transform(X)


In [9]:
# Fit the decision tree on the scaled training split.
clf_dt.fit(X=X_train, y=y_train)

In [10]:
# Train a 100-tree random forest on the same scaled training split.
clf_rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
# Keep the fitted estimator as the cell's last expression so it is displayed.
clf_rf

In [11]:

# Step 7: Save the Model
# Destination files for the fitted decision tree and its scaler.
model_path = "../models/dt_classifier_300_median_5.pkl"
scaler_path = "../models/scaler.pkl"

# Persist the tree first, then the scaler that must be applied before it.
for artifact, destination in ((clf_dt, model_path), (scaler, scaler_path)):
    joblib.dump(artifact, destination)

print("Model and scaler saved!")

Model and scaler saved!


##### Inference

In [12]:
import matplotlib.pyplot as plt
import joblib
import numpy as np

In [13]:
# Read test data from file
test_file = "../input.txt"  # Your test data file

# Splitting the whole file on whitespace yields every number in reading order,
# regardless of how the values are distributed over lines.
with open(test_file, "r") as file:
    data = [float(token) for token in file.read().split()]

# One sample per row, one feature per column.
data = np.array(data).reshape(-1, 1)

In [14]:
# Load the classifier and scaler saved by the training section.
# NOTE: joblib.load unpickles arbitrary Python objects; only load artifact
# files that you created yourself or otherwise trust.
clf = joblib.load(model_path)
scaler = joblib.load(scaler_path)

# Scale into a separate variable instead of overwriting `data`. Overwriting
# made this cell non-idempotent: running it a second time would scale the
# already-scaled values and silently produce wrong predictions.
data_scaled = scaler.transform(data)

# Predict one category per input row.
predictions = clf.predict(data_scaled)




In [15]:
# Display the predicted category for each input sample, in input order.
predictions

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1,  1,  2,  2,  0,  1,  2,  1,  1,  1,
        2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  3,  0,  2,  0,  0,  1,  3,
        1,  3,  1,  2,  3,  3,  1,  3,  0,  2,  1,  2,  3,  0,  1,  2,  1,
        0,  0,  2,  0,  1,  0,  2,  0,  0,  1,  0,  2,  1,  0,  2,  0,  0,
        1,  2,  3,  0,  1,  2,  3,  3,  1,  3,  1,  2,  1,  2,  1,  1,  0,
        2,  0,  0,  1,  2,  1,  3,  1,  3,  1,  2,  1,  3,  1,  2,  1,  2,
        1,  2,  1,  2,  1,  2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1])

In [16]:
# Re-read the raw (unscaled) values so they can be shown next to the predictions.
with open(test_file, "r") as file:
    raw_values = [float(token) for row in file for token in row.split()]

data = np.array(raw_values).reshape(-1, 1)  # column vector, one value per row

# Print plain decimals instead of scientific notation.
np.set_printoptions(suppress=True)

# Column 0: raw input value, column 1: predicted category.
paired = np.column_stack((data, predictions))
print(paired)

[[16793.    -1.]
 [19321.    -1.]
 [19254.    -1.]
 [19199.    -1.]
 [19024.    -1.]
 [18659.    -1.]
 [19247.    -1.]
 [19268.    -1.]
 [19213.    -1.]
 [19201.    -1.]
 [19035.    -1.]
 [19324.    -1.]
 [19140.    -1.]
 [19224.    -1.]
 [19190.    -1.]
 [19231.    -1.]
 [19136.    -1.]
 [19260.    -1.]
 [19205.    -1.]
 [19370.    -1.]
 [19185.    -1.]
 [19316.    -1.]
 [19264.    -1.]
 [19333.    -1.]
 [18984.    -1.]
 [14250.     1.]
 [11879.     2.]
 [11854.     2.]
 [15873.     0.]
 [14153.     1.]
 [11869.     2.]
 [14209.     1.]
 [14188.     1.]
 [14262.     1.]
 [11913.     2.]
 [ 9732.     3.]
 [15755.     0.]
 [14197.     1.]
 [11867.     2.]
 [ 9964.     3.]
 [15940.     0.]
 [14261.     1.]
 [11883.     2.]
 [ 9751.     3.]
 [ 9807.     3.]
 [15991.     0.]
 [11868.     2.]
 [15949.     0.]
 [15898.     0.]
 [14158.     1.]
 [ 9790.     3.]
 [14268.     1.]
 [ 9960.     3.]
 [14301.     1.]
 [11920.     2.]
 [ 9966.     3.]
 [ 9612.     3.]
 [14181.     1.]
 [ 9919.     3