In [1]:
import pandas as pd
import numpy as np 
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()

In [2]:
# Read the dataset from 'allinone.csv'; the path is relative to the working directory.
df = pd.read_csv('allinone.csv')

In [3]:
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler, MinMaxScaler
def fit_with_sm(X, y):
    """Fit an ordinary least squares model with statsmodels.

    A constant column is added to ``X`` via ``sm.add_constant`` before the
    regression is set up, so the fit includes an intercept term.

    Parameters
    ----------
    X : regressors, passed to ``sm.add_constant``.
    y : dependent variable, passed to ``sm.OLS`` as the first argument.

    Returns
    -------
    Whatever ``sm.OLS(...).fit()`` returns for the augmented design.
    """
    design = sm.add_constant(X)
    ols = sm.OLS(y, design)
    return ols.fit()

def fit_with_sklearn(X, y):
    """Fit a scikit-learn ``LinearRegression`` with default settings.

    The estimator class is imported inside the function, so scikit-learn is
    only needed when this helper is actually called.

    Parameters
    ----------
    X : feature matrix handed to ``fit``.
    y : target values handed to ``fit``.

    Returns
    -------
    The value returned by ``LinearRegression().fit(X, y)``.
    """
    from sklearn.linear_model import LinearRegression

    estimator = LinearRegression()
    fitted = estimator.fit(X, y)
    return fitted

def fit_with_catboost(X, y):
    """Fit a ``CatBoostRegressor`` with depth 3 and 500 trees, logging disabled.

    The regressor class is imported inside the function, so catboost is only
    needed when this helper is actually called.

    Parameters
    ----------
    X : feature matrix handed to ``fit``.
    y : target values handed to ``fit``.

    Returns
    -------
    The value returned by ``CatBoostRegressor(...).fit(X, y)``.
    """
    from catboost import CatBoostRegressor

    settings = {'depth': 3, 'num_trees': 500, 'verbose': False}
    booster = CatBoostRegressor(**settings)
    return booster.fit(X, y)
def fit_pipeline(df, scaler=None, target='Score', alpha=0.05):
    """Optionally scale ``df``, fit OLS of ``target`` on all other columns, and
    print the coefficients whose p-value is below ``alpha``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the regressors and the ``target`` column. The caller's
        frame is not modified; scaling produces a new frame.
    scaler : object with ``fit_transform``, optional
        When given, ``scaler.fit_transform(df)`` is applied to the whole
        frame, so the target column is scaled together with the features and
        the scaler instance is fitted as a side effect.
    target : str, default 'Score'
        Name of the dependent-variable column.
    alpha : float, default 0.05
        Threshold used to select which p-values are printed.

    Returns
    -------
    The fitted model returned by ``fit_with_sm``.
    """
    if scaler is not None:
        # Pass index=df.index so the scaled frame keeps the caller's row labels;
        # without it pandas would substitute a fresh RangeIndex and the link
        # between rows and their identifiers would be lost.
        df = pd.DataFrame(
            scaler.fit_transform(df),
            columns=df.columns,
            index=df.index,
        )
    model = fit_with_sm(df.drop(columns=[target]), df[target])
    print(model.pvalues[model.pvalues < alpha])
    return model

In [4]:
# Promote the ID column to the row index (set_index also removes it from the
# columns). The guard makes the cell safe to re-run: once the index is already
# named 'ID' the column no longer exists and a second set_index would raise
# KeyError. On a first run a missing 'ID' column still raises, as before.
if df.index.name != 'ID':
    df = df.set_index('ID')
df.columns

Index(['total_focus_time', 'element_switch_count', 'element_switch_speed',
       'totaltime_x', 'click_count_x', 'total_mouse_movement_x',
       'mousewheel_count_x', 'total_mousewheel_distance_x',
       'average_mousewheel_distance_x', 'copy_count_x',
       'average_copy_length_x', 'paste_count_x', 'average_paste_length_x',
       'delete_count_x', 'keypress_count_x', 'highlight_count_x',
       'average_highlight_length_x', 'idle_count_x', 'average_idle_duration_x',
       'total_idle_duration_x', 'prompts_count', 'total_prompts_duration',
       'average_prompts_duration', 'tab_switch_count', 'total_tab_switch_time',
       'average_tab_switch_time', 'totaltime_y', 'click_count_y',
       'total_mouse_movement_y', 'mousewheel_count_y',
       'total_mousewheel_distance_y', 'average_mousewheel_distance_y',
       'copy_count_y', 'average_copy_length_y', 'paste_count_y',
       'average_paste_length_y', 'delete_count_y', 'keypress_count_y',
       'highlight_count_y', 'average_hig

In [5]:
# Drop every column that holds a single distinct value (NaN counts as a value).
# Iterate over a snapshot of the column labels because the frame is modified
# inside the loop.
for col in list(df.columns):
    if df[col].nunique(dropna=False) != 1:
        continue
    df.drop(columns=col, inplace=True)
    print(f'Dropped {col}')

In [12]:
# Mean and standard deviation of the Score column, shown as a (mean, '+-', std) tuple.
# NOTE(review): this cell's execution count (12) is out of sequence with its
# neighbours (5 and 6); re-run the notebook top to bottom to confirm the output.
df['Score'].mean(), '+-', df['Score'].std()

(4.934782608695652, '+-', 3.0796982096955428)

In [6]:
# Report the number of missing values per column, then replace them with 0 in place.
print(df.isnull().sum())
df.fillna(value=0, inplace=True)

total_focus_time                  0
element_switch_count              0
element_switch_speed              0
totaltime_x                       0
click_count_x                     0
total_mouse_movement_x            0
mousewheel_count_x                0
total_mousewheel_distance_x       0
average_mousewheel_distance_x     0
copy_count_x                      0
average_copy_length_x             0
paste_count_x                     0
average_paste_length_x            0
delete_count_x                    0
keypress_count_x                  0
highlight_count_x                 0
average_highlight_length_x        0
idle_count_x                      0
average_idle_duration_x           0
total_idle_duration_x             0
prompts_count                     0
total_prompts_duration            0
average_prompts_duration          3
tab_switch_count                  0
total_tab_switch_time             0
average_tab_switch_time          34
totaltime_y                       0
click_count_y               

In [7]:
# Row count followed by column count.
print(*df.shape)

46 44


In [8]:
# Standardise every column (Score included) and fit OLS of Score on the rest;
# fit_pipeline prints the coefficients it finds significant.
model = fit_pipeline(df, scaler=StandardScaler())

mousewheel_count_x    0.039951
dtype: float64


In [9]:
# Display the full summary table of the fitted OLS model.
# NOTE(review): the summary below reports 46 observations against 42 model
# degrees of freedom (3 residual df) and a condition number of 1.33e+16, so the
# individual p-values should be read with caution.
# NOTE(review): total_focus_time and element_switch_count show identical
# coefficients and standard errors, which suggests the two columns may be
# duplicates or perfectly collinear; verify before interpreting them.
model.summary()

0,1,2,3
Dep. Variable:,Score,R-squared:,0.97
Model:,OLS,Adj. R-squared:,0.554
Method:,Least Squares,F-statistic:,2.333
Date:,"Sat, 19 Oct 2024",Prob (F-statistic):,0.266
Time:,17:13:40,Log-Likelihood:,15.608
No. Observations:,46,AIC:,54.78
Df Residuals:,3,BIC:,133.4
Df Model:,42,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-5.294e-15,0.100,-5.32e-14,1.000,-0.317,0.317
total_focus_time,0.6418,0.390,1.646,0.198,-0.599,1.883
element_switch_count,0.6418,0.390,1.646,0.198,-0.599,1.883
element_switch_speed,-1.2518,1.079,-1.160,0.330,-4.685,2.182
totaltime_x,-6.1303,1.998,-3.069,0.055,-12.488,0.228
click_count_x,-0.8512,0.863,-0.987,0.397,-3.596,1.894
total_mouse_movement_x,0.8474,0.666,1.272,0.293,-1.272,2.967
mousewheel_count_x,1.1985,0.344,3.484,0.040,0.104,2.293
total_mousewheel_distance_x,-1.3256,0.482,-2.748,0.071,-2.861,0.210

0,1,2,3
Omnibus:,4.381,Durbin-Watson:,1.739
Prob(Omnibus):,0.112,Jarque-Bera (JB):,3.213
Skew:,-0.505,Prob(JB):,0.201
Kurtosis:,3.811,Cond. No.,1.33e+16
