In [3]:
import math
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

In [4]:
def get_amp_phase(data, position, width, is_calibrated=False, calibration=None, lookup_size=100):
    """
    Calculate the amplitude and phase of the Lissajous curve.

    The window [position - width // 2, position + width // 2] is clipped to the
    data. Inside it the first occurrences of the X minimum/maximum and the
    Y minimum/maximum are located, and the two most distant points are then
    searched between those extrema. Amplitude is the Euclidean distance
    between that pair; phase is the angle of the segment joining them.

    :param data: Sequence of (x, y) pairs [(x1, y1), (x2, y2), ...] representing the data points.
    :param position: Central position (index) in the data to analyze.
    :param width: Window width around the position.
    :param is_calibrated: Whether to apply calibration to amplitude.
    :param calibration: Dictionary with calibration factors {'voltage': float, 'amplitude': float}.
        Only used when is_calibrated is true and the dictionary is non-empty.
    :param lookup_size: Maximum index span searched on each side of the pair search;
        a wider span is shrunk to lookup_size around its midpoint.
    :return: Tuple (amplitude, phase) where amplitude is the distance and phase is in
        degrees, normalized to [0, 360). If no two candidate points are a positive
        distance apart (e.g. a single point or a constant signal) the result is (0.0, 0.0).
    :raises IndexError: If data is empty or the window start lies past the end of data.
    """
    n_points = len(data)
    if n_points == 0:
        # Same exception type the unguarded data[p1] lookup would raise, but with a clear message.
        raise IndexError("data must contain at least one point")

    p1 = max(0, position - width // 2)
    p2 = min(n_points - 1, position + width // 2)

    # A/B: index of the first X minimum/maximum, C/D: index of the first Y minimum/maximum.
    A = B = C = D = p1
    min_x = max_x = data[p1][0]
    min_y = max_y = data[p1][1]

    # Single pass over the window; strict comparisons keep the first occurrence of each extreme.
    for i in range(p1 + 1, p2 + 1):
        x, y = data[i][0], data[i][1]
        if x < min_x:
            min_x, A = x, i
        if x > max_x:
            max_x, B = x, i
        if y < min_y:
            min_y, C = y, i
        if y > max_y:
            max_y, D = y, i

    # Order each pair so that A <= C (the two minima) and B <= D (the two maxima).
    if C < A:
        A, C = C, A
    if D < B:
        B, D = D, B

    # Limit each search range to lookup_size indices centred on its midpoint.
    if (C - A) > lookup_size:
        A, C = (A + C - lookup_size) // 2, (A + C + lookup_size) // 2
    if (D - B) > lookup_size:
        B, D = (B + D - lookup_size) // 2, (B + D + lookup_size) // 2

    # Find the most distant pair (one point from A..C, one from B..D).
    # The pair starts as a valid index rather than None: when every candidate
    # pair has zero squared distance the loop never updates it, and the function
    # must then report zero amplitude instead of failing on a None index.
    max_dist = 0
    q1 = q2 = p1
    for s1 in range(A, C + 1):
        for s2 in range(B, D + 1):
            dx = data[s1][0] - data[s2][0]
            dy = data[s1][1] - data[s2][1]
            dist = dx**2 + dy**2
            if dist > max_dist:
                max_dist = dist
                q1, q2 = s1, s2

    # Both indices come from ranges inside [p1, p2], so no clamping is needed.
    # Make q1 the later index and q2 the earlier one.
    q1, q2 = max(q1, q2), min(q1, q2)

    # Calculate amplitude and phase
    x1, y1 = data[q1]
    x2, y2 = data[q2]
    amplitude = math.sqrt((x2 - x1)**2 + (y2 - y1)**2)
    if is_calibrated and calibration:
        amplitude *= calibration['voltage'] / calibration['amplitude']

    # dx is later-minus-earlier while dy is earlier-minus-later; this orientation
    # is kept exactly as found (presumably a display/axis convention - confirm).
    dx = x1 - x2
    dy = y2 - y1
    phase = math.degrees(math.atan2(dy, dx))
    phase = phase % 360  # Normalize phase to [0, 360)

    return amplitude, phase

def extract_features(slice_df, x_col="0", y_col="1"):
    """
    Extract amplitude and phase features for one slice of data.

    The whole slice is used as the analysis window: the window is centred on
    the middle row and its width equals the number of rows.

    :param slice_df: DataFrame containing the x and y data points of the slice.
    :param x_col: Name of the column holding the x values (default "0").
    :param y_col: Name of the column holding the y values (default "1").
    :return: NumPy array [amplitude, phase] as computed by get_amp_phase.
    """
    # Prepare data as a list of tuples [(x, y), ...]
    data = list(zip(slice_df[x_col], slice_df[y_col]))

    # Centre the window on the middle of the slice and span its full length.
    n_rows = len(data)
    amplitude, phase = get_amp_phase(data, n_rows // 2, n_rows)

    return np.array([amplitude, phase])

In [5]:
# Load the CSV and show which columns it provides.
defects_csv_path = "saved/data_with_defects.csv"
df = pd.read_csv(defects_csv_path)
print(df.columns)

Index(['0', '1', '2', '3', '4', '5', '6', '7', 'slice_number', 'phase_0',
       'phase_1', 'phase_2'],
      dtype='object')


In [13]:
# Keep only columns '2' and '3' plus the slice_number / phase_0 annotations,
# then show the reduced frame and the distinct slice numbers in the full data.
unused_columns = ['0', '1', '4', '5', '6', '7', 'phase_1', 'phase_2']
first_pair_df = df.drop(columns=unused_columns)
print(first_pair_df)
print(df['slice_number'].unique())

             2         3  slice_number  phase_0
0    -5.371949  1.999640           NaN      NaN
1    -5.383067  2.006006           NaN      NaN
2    -5.380915  2.008247           NaN      NaN
3    -5.383067  2.006006           NaN      NaN
4    -5.374011  1.993006           NaN      NaN
...        ...       ...           ...      ...
5047 -0.858898 -0.118907           NaN      NaN
5048 -0.858809 -0.123301           NaN      NaN
5049 -0.865354 -0.125632           NaN      NaN
5050 -0.865354 -0.125632           NaN      NaN
5051 -0.869747 -0.125722           NaN      NaN

[5052 rows x 4 columns]
[nan -1.  1.  4.  2.  3.  5.]


In [14]:
# Binary label: 1 for every row that carries a slice_number, 0 where it is NaN.
# NOTE(review): slice_number also takes the value -1, and those rows are
# labelled 1 as well - confirm that this is intended.
has_slice_number = first_pair_df['slice_number'].notna()
first_pair_df['target'] = has_slice_number.astype(int)
print(first_pair_df)
print(first_pair_df['target'].unique())
# Number of positive rows.
print(int((first_pair_df['target'] == 1).sum()))

             2         3  slice_number  phase_0  target
0    -5.371949  1.999640           NaN      NaN       0
1    -5.383067  2.006006           NaN      NaN       0
2    -5.380915  2.008247           NaN      NaN       0
3    -5.383067  2.006006           NaN      NaN       0
4    -5.374011  1.993006           NaN      NaN       0
...        ...       ...           ...      ...     ...
5047 -0.858898 -0.118907           NaN      NaN       0
5048 -0.858809 -0.123301           NaN      NaN       0
5049 -0.865354 -0.125632           NaN      NaN       0
5050 -0.865354 -0.125632           NaN      NaN       0
5051 -0.869747 -0.125722           NaN      NaN       0

[5052 rows x 5 columns]
[0 1]
258


In [15]:
# Features are columns '2' and '3'; the label is the binary `target` column.
X = first_pair_df[['2', '3']]
y = first_pair_df['target']

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
# NOTE(review): only about 5% of the rows are positive; consider passing
# stratify=y so both splits keep that ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The estimator is a gradient-boosting classifier. The variable keeps the name
# `rf` because later cells refer to it.
rf = GradientBoostingClassifier(n_estimators=300, max_depth=5, random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
# Label corrected: this used to say "Random Forest" although the model is gradient boosting.
print("Gradient Boosting Test Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Random Forest Test Accuracy: 0.9802176063303659
Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       959
           1       0.90      0.69      0.78        52

    accuracy                           0.98      1011
   macro avg       0.94      0.84      0.89      1011
weighted avg       0.98      0.98      0.98      1011



In [16]:
# Re-score the test set with a custom probability cut-off for class 1
# instead of the labels returned by rf.predict.
# NOTE(review): the cut-off is evaluated on the same test split it is reported on.
decision_threshold = 0.15
y_pred_probs = rf.predict_proba(X_test)[:, 1]
y_pred = np.where(y_pred_probs > decision_threshold, 1, 0)

test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Test Accuracy: 0.9812067260138477
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       959
           1       0.84      0.79      0.81        52

    accuracy                           0.98      1011
   macro avg       0.91      0.89      0.90      1011
weighted avg       0.98      0.98      0.98      1011



In [None]:

# 1. Load the new data
new_df = pd.read_csv("saved/test copy.csv")

# 2. Select the same feature columns, in the same order, that the model was
#    fitted on (['2', '3']). The previous selection ['2', '1'] fed a different
#    column to the model and did not match the fitted feature names.
X_new = new_df[['2', '3']]

# 3. Predict the probability of class 1 for every row (a threshold can be applied later)
y_new_pred_proba = rf.predict_proba(X_new)[:, 1]

# 4. Add the 'defect_proba_2' column to the DataFrame
new_df['defect_proba_2'] = y_new_pred_proba

# 5. Save the updated DataFrame back to the same file (overwrites the input CSV)
new_df.to_csv("saved/test copy.csv", index=False)