In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler


In [25]:
# Load the data
df = pd.read_csv('behavior_result.csv')
df.head()

Unnamed: 0,filename,windowswitch_count_gpt,windowswitch_speed_gpt,totaltime_gpt,click_count_gpt,total_mouse_movement_gpt,mousewheel_count_gpt,total_mousewheel_distance_gpt,average_mousewheel_distance_gpt,med_mousewheel_distance_gpt,...,keypress_count_tasksheet,highlight_count_tasksheet,average_highlight_length_tasksheet,med_highlight_length_tasksheet,idle_count_tasksheet,med_idle_duration_tasksheet,total_idle_duration_tasksheet,score_AI,score_alone,score_overreliance
0,A2.json,3,205.303,615.909,47,53245.778417,145,14499.999833,99.999999,99.999999,...,53,2,49.0,49.0,4,2005.0,8022,32,61,-29
1,A3.json,4,201.14325,804.573,22,26918.727912,312,26904.0,86.230769,102.0,...,438,5,195.8,2.0,29,2002.0,58043,61,42,19
2,A4.json,22,42.957364,945.062,33,26747.643438,69,4390.535278,63.630946,24.888306,...,92,12,233.583333,24.0,53,2002.0,106222,51,45,6
3,A5.json,2,25.4055,50.811,6,5755.784963,24,2900.0,120.833333,100.0,...,0,0,0.0,0.0,7,2013.0,14819,65,55,10
4,A6.json,9,92.944333,836.499,19,17445.667936,524,52500.0,100.19084,100.0,...,1,14,183.857143,142.0,18,2011.5,36165,30,40,-10


In [26]:
# EDA
df.describe()

Unnamed: 0,windowswitch_count_gpt,windowswitch_speed_gpt,totaltime_gpt,click_count_gpt,total_mouse_movement_gpt,mousewheel_count_gpt,total_mousewheel_distance_gpt,average_mousewheel_distance_gpt,med_mousewheel_distance_gpt,copy_count_gpt,...,keypress_count_tasksheet,highlight_count_tasksheet,average_highlight_length_tasksheet,med_highlight_length_tasksheet,idle_count_tasksheet,med_idle_duration_tasksheet,total_idle_duration_tasksheet,score_AI,score_alone,score_overreliance
count,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,...,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0
mean,11.590909,73.909041,678.572727,41.090909,39521.31693,541.636364,25804.390102,74.463182,61.999227,4.318182,...,92.772727,18.272727,293.401453,235.977273,27.545455,2016.681818,57862.0,48.636364,47.818182,0.818182
std,8.404415,61.056609,302.83403,35.730351,27751.214115,479.12505,21228.54331,49.007149,47.830196,6.847915,...,135.305626,15.802899,383.3103,449.812222,16.343711,34.249972,34666.123524,14.039215,10.445488,21.214632
min,0.0,0.0,50.811,2.0,5352.501218,24.0,2900.0,3.412552,2.0,0.0,...,0.0,0.0,0.0,0.0,4.0,2001.0,8022.0,22.0,24.0,-40.0
25%,4.25,34.089208,509.46825,10.0,18797.375237,195.25,7594.499658,17.527517,8.0,0.25,...,19.25,5.5,70.526099,23.125,13.0,2002.0,26949.5,37.25,40.0,-10.75
50%,8.5,46.336582,778.911,29.0,31040.914363,458.0,16677.5,99.999999,99.999999,1.5,...,68.5,15.0,163.29021,25.0,30.0,2007.5,63088.0,50.5,47.0,0.5
75%,19.0,89.763967,911.0415,65.5,58061.730352,604.25,43900.0,112.526126,100.0,3.0,...,82.25,25.0,245.645833,125.875,39.5,2011.0,83042.0,62.5,55.0,16.75
max,27.0,205.303,1203.343,118.0,105616.203855,1937.0,63800.0,139.30131,133.333325,28.0,...,548.0,55.0,1339.0,1339.0,53.0,2128.0,118304.0,69.0,67.0,41.0


In [27]:
# Determine what features might need to be transformed
# The features that are not normally distributed might need to be transformed
# We can use the Shapiro-Wilk test to determine if a feature is normally distributed
features = df.columns[1:-4]
transformed_features = []
for feature in features:
    stat, p = stats.shapiro(df[feature])
    print(f'{feature}: p-value = {p}')
    if p < 0.005:
        transformed_features.append(feature)

print(f'Transformed features: {transformed_features}')


windowswitch_count_gpt: p-value = 0.043588653206825256
windowswitch_speed_gpt: p-value = 0.0008476412040181458
totaltime_gpt: p-value = 0.2679891884326935
click_count_gpt: p-value = 0.022310620173811913
total_mouse_movement_gpt: p-value = 0.10608398914337158
mousewheel_count_gpt: p-value = 0.002519912552088499
total_mousewheel_distance_gpt: p-value = 0.008215234614908695
average_mousewheel_distance_gpt: p-value = 0.004084073938429356
med_mousewheel_distance_gpt: p-value = 0.00013355567352846265
copy_count_gpt: p-value = 6.368940375978127e-06
average_copy_length_gpt: p-value = 1.0316963198420126e-05
med_copy_length_gpt: p-value = 1.5912102071524714e-06
paste_count_gpt: p-value = 0.0012515896232798696
average_paste_length_gpt: p-value = 0.005792492069303989
med_paste_length_gpt: p-value = 0.0003027644124813378
delete_count_gpt: p-value = 0.004361619707196951
keypress_count_gpt: p-value = 0.007714137900620699
highlight_count_gpt: p-value = 0.001662966562435031
average_highlight_length_gpt

In [28]:
# transform the features
for feature in transformed_features:
    df[feature] = np.log(df[feature])


# visualize distribution of features after transformation
# for feature in features:
#     sns.histplot(df[feature])
#     plt.title(feature)
#     plt.show()

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [29]:
# Replace infinities with NaN
df = df.replace([np.inf, -np.inf], np.nan)

# Fill NaN values with the mean of the column
df = df.fillna(df.mean())

  df = df.fillna(df.mean())


In [30]:
# standardize the features
scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features])

In [31]:
# check for multicollinearity
corr = df[features].corr()

# list of features that are highly correlated
highly_correlated_features = []
for i in range(len(corr.columns)):
    for j in range(i):
        if abs(corr.iloc[i, j]) > 0.8:
            highly_correlated_features.append((corr.columns[i], corr.columns[j]))

# randomly select one feature from each pair of highly correlated features
features_to_drop = []
for feature1, feature2 in highly_correlated_features:
    if feature1 not in features_to_drop:
        features_to_drop.append(feature2)

# drop the highly correlated features
df = df.drop(features_to_drop, axis=1)

print(f'Features to drop: {features_to_drop}')

Features to drop: ['click_count_gpt', 'average_mousewheel_distance_gpt', 'average_copy_length_gpt', 'paste_count_gpt', 'click_count_gpt', 'average_highlight_length_gpt', 'totaltime_gpt', 'totaltime_gpt', 'idle_count_gpt', 'totaltime_gpt', 'total_idle_duration_gpt', 'click_count_tasksheet', 'average_mousewheel_distance_gpt', 'med_mousewheel_distance_gpt', 'average_mousewheel_distance_gpt', 'med_mousewheel_distance_gpt', 'average_mousewheel_distance_tasksheet', 'average_copy_length_tasksheet', 'paste_count_tasksheet', 'paste_count_tasksheet', 'average_paste_length_tasksheet', 'click_count_tasksheet', 'total_mouse_movement_tasksheet', 'average_copy_length_tasksheet']


In [32]:
# Multiple linear regression
# remove score_AI score_alone score_overreliance and filename from the features
features = df.columns[1:-4]
X = df[features]
y = df['score_overreliance']

# add a constant to the features
X = sm.add_constant(X)

# fit the model
model = sm.OLS(y, X).fit()

# print the summary
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:     score_overreliance   R-squared:                       0.999
Model:                            OLS   Adj. R-squared:                  0.982
Method:                 Least Squares   F-statistic:                     59.02
Date:                Mon, 12 Aug 2024   Prob (F-statistic):              0.102
Time:                        01:44:13   Log-Likelihood:                -20.089
No. Observations:                  22   AIC:                             82.18
Df Residuals:                       1   BIC:                             105.1
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
co

In [33]:
# create a list of features with p-values less than 0.05
significant_features = model.pvalues[model.pvalues < 0.05].index.tolist()
print(f'Significant features: {significant_features}')

Significant features: ['total_mouse_movement_gpt', 'mousewheel_count_gpt', 'total_mousewheel_distance_gpt', 'med_highlight_length_gpt', 'med_idle_duration_gpt', 'prompts_count_gpt', 'delete_count_tasksheet', 'keypress_count_tasksheet', 'med_highlight_length_tasksheet', 'med_idle_duration_tasksheet']
