In [16]:
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
# used chatGPT

In [17]:
# Load features and labels
features_array = np.load('x_data.npy', allow_pickle=True)
labels_array = np.load('y_label.npy', allow_pickle=True)


In [18]:
columns_to_remove = [
   'font_size', 'right_gaze_point_in_user_x', 'left_gaze_point_in_user_z', 
   'left_gaze_point_on_display_area_y', 'right_gaze_point_on_display_area_x', 
   'right_gaze_point_in_user_z', 'right_gaze_point_in_user_y', 
   'left_gaze_point_in_user_x', 'right_pupil_diameter', 
   'left_gaze_origin_in_trackbox_z', 'right_gaze_origin_in_trackbox_y', 
   'left_gaze_origin_in_trackbox_x', 'right_gaze_origin_in_user_z', 
   'left_gaze_origin_in_user_x', 'left_gaze_origin_in_user_y', 
   'left_gaze_origin_validity', 'right_gaze_origin_validity', 
   'device_time_stamp', 'left_gaze_point_validity', 
   'right_gaze_point_validity', 'left_pupil_validity', 'right_pupil_validity'
]


In [19]:
processed_dataframes = []
for df in features_array:
    #Drop the columns if they exist, ignore errors if some columns don't exist
    new_df = df.drop(columns=columns_to_remove, errors='ignore')
    processed_dataframes.append(new_df)

features_array = np.array(processed_dataframes, dtype=object)

In [20]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

lengths = [df.size for df in features_array]
k = int(np.median(lengths))
# Example of handling non-numeric data before flattening
processed_features = []
encoder = OneHotEncoder(sparse=False)  # Initialize one-hot encoder

for df in features_array:
    # Check and transform non-numeric columns if necessary
    for col in df.columns:
        if df[col].dtype == object:
            # Assuming the non-numeric data is categorical and not text like 'Times New Roman'
            transformed = encoder.fit_transform(df[[col]])
            df = pd.concat([df.drop(col, axis=1), pd.DataFrame(transformed)], axis=1)
    
    # Flatten and standardize lengths as before
    flattened = df.values.flatten()
    # Use a fixed length 'k' determined as before
    if len(flattened) > k:
        processed_features.append(flattened[:k])
    else:
        processed_features.append(np.pad(flattened, (0, k - len(flattened)), 'constant'))

X = np.array(processed_features)
y = labels_array

In [21]:
print("NaNs in X:", np.isnan(X).any())
print("Infs in X:", np.isinf(X).any())

# If there are any, you might want to consider replacing them
if np.isnan(X).any() or np.isinf(X).any():
    # Replace NaNs with the mean of the column
    col_mean = np.nanmean(X, axis=0)  # Mean ignoring NaNs
    # Find indices where NaN values are
    inds = np.where(np.isnan(X))
    # Replace NaNs with the mean of each column
    X[inds] = np.take(col_mean, inds[1])

    # Replace infinities with large finite numbers
    X[np.isinf(X)] = 1e+18  # You might choose a suitable finite number

    # Re-check
    print("NaNs in X after replacement:", np.isnan(X).any())
    print("Infs in X after replacement:", np.isinf(X).any())


NaNs in X: True
Infs in X: False
NaNs in X after replacement: False
Infs in X after replacement: False


In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=224182)


In [23]:
# Initialize the SVM classifier
svm_model = SVC(kernel='poly')  # You can choose other kernels like 'rbf'

# Train the model
svm_model.fit(X_train, y_train)


SVC(kernel='poly')

In [24]:
# Predict the labels for the test set
y_pred = svm_model.predict(X_test)

# Evaluate the model's performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.4349593495934959
Classification Report:
                         precision    recall  f1-score   support

      immersive - easy       0.00      0.00      0.00         6
      immersive - hard       0.00      0.00      0.00         4
    immersive - normal       0.67      0.33      0.44        12
       skimming - easy       0.00      0.00      0.00        51
       skimming - hard       0.00      0.00      0.00        66
     skimming - normal       0.43      0.99      0.60       104
skimming - really hard       0.00      0.00      0.00         3

              accuracy                           0.43       246
             macro avg       0.16      0.19      0.15       246
          weighted avg       0.22      0.43      0.28       246



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
from collections import Counter

# Count the occurrences of each label
label_counts = Counter(labels_array)
label_counts

Counter({'immersive - normal': 53,
         'immersive - easy': 27,
         'immersive - hard': 34,
         'skimming - normal': 450,
         'skimming - easy': 278,
         'skimming - hard': 360,
         'immersive - really hard': 2,
         'skimming - really hard': 24})

In [26]:
450/len(labels_array)

0.36644951140065146