In [1]:
import pandas as pd
import numpy as np 
from matplotlib import pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to all subsequent matplotlib figures.
# set_theme() is the preferred spelling; sns.set() is only an alias for it.
sns.set_theme()

In [2]:
# Load the dataset into `df`. The path is relative, so the CSV must sit in the
# kernel's current working directory.
df = pd.read_csv('trip_allinone.csv')

In [3]:
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler, MinMaxScaler
def fit_with_sm(X, y):
    """Fit an ordinary-least-squares regression of ``y`` on ``X`` with statsmodels.

    ``X`` is first passed through ``sm.add_constant`` so the model gets an
    intercept term; the fitted results object from ``sm.OLS(...).fit()`` is
    returned.
    """
    design = sm.add_constant(X)
    ols = sm.OLS(y, design)
    return ols.fit()

def fit_with_sklearn(X, y):
    """Fit a default-configured scikit-learn ``LinearRegression`` on ``X`` and ``y``.

    Returns whatever ``LinearRegression().fit(X, y)`` returns.
    """
    # Imported inside the function, so scikit-learn's linear_model module is
    # only loaded when this helper is actually called.
    from sklearn.linear_model import LinearRegression

    estimator = LinearRegression()
    fitted = estimator.fit(X, y)
    return fitted

def fit_with_catboost(X, y, depth=3, num_trees=500, verbose=False, **extra_params):
    """Fit a ``CatBoostRegressor`` on ``X`` and ``y`` and return the fit result.

    Parameters
    ----------
    X, y
        Training features and target, passed straight to ``fit``.
    depth : int, default 3
        Forwarded to ``CatBoostRegressor(depth=...)``.
    num_trees : int, default 500
        Forwarded to ``CatBoostRegressor(num_trees=...)``.
    verbose : bool, default False
        Forwarded to ``CatBoostRegressor(verbose=...)``.
    **extra_params
        Any further keyword arguments for ``CatBoostRegressor``
        (for example ``random_seed``).

    Calling ``fit_with_catboost(X, y)`` builds the regressor with the same
    settings as before: depth 3, 500 trees, no training log.
    """
    # Imported inside the function so catboost is only required when this
    # helper is actually used.
    from catboost import CatBoostRegressor

    regressor = CatBoostRegressor(
        depth=depth,
        num_trees=num_trees,
        verbose=verbose,
        **extra_params,
    )
    return regressor.fit(X, y)
def fit_pipeline(df, scaler=None, target='Score', alpha=0.05):
    """Optionally scale ``df``, regress ``target`` on every other column, and
    print the terms whose p-value is below ``alpha``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the predictor columns plus the ``target`` column.
    scaler : object with ``fit_transform``, optional
        When given, it is fitted on and applied to the whole frame, the
        target column included, before the regression.
    target : str, default 'Score'
        Name of the dependent-variable column.
    alpha : float, default 0.05
        Significance threshold used only for the printed p-value filter.

    Returns
    -------
    The fitted model returned by ``fit_with_sm``.
    """
    if scaler is not None:
        # Pass index= explicitly: rebuilding the frame from the scaler's output
        # without it would replace the row labels with a fresh 0..n-1 index.
        df = pd.DataFrame(
            scaler.fit_transform(df),
            columns=df.columns,
            index=df.index,
        )
    model = fit_with_sm(df.drop(columns=[target]), df[target])
    print(model.pvalues[model.pvalues < alpha])
    return model

In [4]:
# Use the ID column as the row index and remove it from the columns.
# The guard keeps this cell re-runnable: once ID has been moved into the
# index, a second run would otherwise fail with KeyError on 'ID'.
if 'ID' in df.columns:
    df = df.set_index('ID')
df.columns

Index(['totaltime', 'click_count', 'total_mouse_movement', 'mousewheel_count',
       'total_mousewheel_distance', 'average_mousewheel_distance',
       'copy_count', 'average_copy_length', 'paste_count',
       'average_paste_length', 'delete_count', 'keypress_count',
       'highlight_count', 'average_highlight_length', 'idle_count',
       'average_idle_duration', 'total_idle_duration', 'Score'],
      dtype='object')

In [5]:
# Collect the columns that hold exactly one distinct value, drop them in a
# single call, then report each one. Series.unique() keeps NaN as a value, so
# a column mixing one value with NaN is not treated as constant.
single_valued = [col for col in df.columns if len(df[col].unique()) == 1]
df.drop(columns=single_valued, inplace=True)
for col in single_valued:
    print(f'Dropped {col}')

In [6]:
# Replace every missing value in every column of df with 0, modifying df in place.
# NOTE(review): presumably a NaN here means the event never happened for that
# row, which would make 0 the right fill - confirm against the data source.
df.fillna(0, inplace=True)

In [7]:
# Print "<row count> <column count>" for the cleaned frame.
print(*df.shape)

28 18


In [8]:
# Standardize all columns, then fit the OLS model; fit_pipeline prints the
# terms whose p-value is below 0.05 and returns the fitted model.
model = fit_pipeline(df, scaler=StandardScaler())

mousewheel_count        0.040180
average_paste_length    0.024754
dtype: float64


In [9]:
# Render the full regression report for the fitted model.
# NOTE(review): the saved output shows 28 observations against 17 regressors
# and Prob (F-statistic) = 0.211, so the individual p-values below 0.05
# should be read with caution.
model.summary()

0,1,2,3
Dep. Variable:,Score,R-squared:,0.738
Model:,OLS,Adj. R-squared:,0.291
Method:,Least Squares,F-statistic:,1.653
Date:,"Sat, 19 Oct 2024",Prob (F-statistic):,0.211
Time:,17:12:42,Log-Likelihood:,-21.005
No. Observations:,28,AIC:,78.01
Df Residuals:,10,BIC:,102.0
Df Model:,17,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.053e-16,0.162,1.88e-15,1.000,-0.361,0.361
totaltime,-0.7010,0.414,-1.695,0.121,-1.622,0.220
click_count,-0.5340,0.391,-1.365,0.202,-1.406,0.338
total_mouse_movement,0.2968,0.697,0.426,0.679,-1.257,1.851
mousewheel_count,-1.4621,0.620,-2.357,0.040,-2.844,-0.080
total_mousewheel_distance,1.3144,0.592,2.220,0.051,-0.005,2.633
average_mousewheel_distance,-1.3827,0.647,-2.138,0.058,-2.823,0.058
copy_count,0.3716,0.389,0.955,0.362,-0.496,1.239
average_copy_length,1.7374,3.086,0.563,0.586,-5.138,8.613

0,1,2,3
Omnibus:,1.077,Durbin-Watson:,2.011
Prob(Omnibus):,0.583,Jarque-Bera (JB):,1.042
Skew:,-0.402,Prob(JB):,0.594
Kurtosis:,2.503,Cond. No.,614.0
