In [1]:
import numpy as np
import os
import pandas as pd
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import sys

# Put the sibling ../src directory (resolved against the current working directory)
# at the front of the import path -- presumably where the `innoprod` package lives.
# The membership check keeps repeated runs of this cell from adding duplicates.
module_path = os.path.abspath('../src')
if module_path not in sys.path:
    sys.path.insert(0, module_path)

from innoprod.sheet_tools import get_sheet_dfs
from innoprod.wrangling.msyh_data_sharing import wrangle_roadmaps, wrangle_grants

In [2]:
# Fetch the sheets once, then run each sheet through its dedicated wrangler.
# `data` is indexed by sheet name; only 'Roadmaps' and 'Grants' are used here.
data = get_sheet_dfs()

roadmaps_sheet = data['Roadmaps']
roadmaps_df = wrangle_roadmaps(roadmaps_sheet)

grants_sheet = data['Grants']
grants_df = wrangle_grants(grants_sheet)

In [3]:
# Total amount actually claimed per client: sum 'Actual amount claimed' over all of a
# client's rows in grants_df, giving one row per 'Client ID' (which becomes the index).
grants_totals = (
    grants_df[['Client ID', 'Actual amount claimed']]
    .groupby('Client ID')
    .sum()
    .rename(columns={'Actual amount claimed': 'Total actual amount claimed'})
)

# Attach the per-client total to every roadmaps row via its 'Client ID' (join() is a left
# join by default, so roadmaps rows whose client has no grants get NaN).
# Any existing copy of the total column is dropped first so the cell can be re-run safely:
# without this, a second execution fails inside join() with
# "ValueError: columns overlap but no suffix specified".
roadmaps_df = (
    roadmaps_df
    .drop(columns=grants_totals.columns, errors='ignore')
    .join(grants_totals, on='Client ID')
)

In [4]:
# Independent variables (predictors), grouped by theme. The concatenation order below is
# the column order used for the design matrix.
# Candidates currently left out of the model: 'Number of GAFs', 'Employee Increase (FTE calc)'.
size_cols = [
    'Turnover',
    'Number of FTE Employees (calc)',
]
digital_cols = [
    'Current Digital Readiness Score (refer to PAS:1040)',
    'Do you have a Digital Champion in place?',
]
feedback_cols = [
    'How valuable did you find the involvement of your contact within the programme during the course of the support?',
    'How valuable did you find the GROWTHmapper and its report in identifying the key areas of supporting your business?',
    'How valuable did you find the support you received from the Expert Coach during the course of the programme?',
]
iv_cols = [*size_cols, *digital_cols, *feedback_cols]

# Dependent variable (outcome) column name.
dv_col = 'Total actual amount claimed'

In [5]:
# Complete-case modelling frame: keep only the predictor and outcome columns, then
# discard every row that has a missing value in any of them.
model_cols = [*iv_cols, dv_col]
model_data = roadmaps_df.loc[:, model_cols].dropna()

# Number of rows that survive the complete-case filter.
model_data.shape[0]

112

In [6]:
# Sensitivity check on missing data: for each predictor, report how many complete rows
# would remain if that one predictor were dropped from the model.
# Each printed line reads "<dropped predictor>: <rows remaining>".
for held_out in iv_cols:
    kept_cols = [name for name in iv_cols if name != held_out] + [dv_col]
    n_complete = len(roadmaps_df[kept_cols].dropna())
    print(f'{held_out}: {n_complete}')

Turnover: 138
Number of FTE Employees (calc): 112
Current Digital Readiness Score (refer to PAS:1040): 116
Do you have a Digital Champion in place?: 115
How valuable did you find the involvement of your contact within the programme during the course of the support?: 112
How valuable did you find the GROWTHmapper and its report in identifying the key areas of supporting your business?: 112
How valuable did you find the support you received from the Expert Coach during the course of the programme?: 112


In [7]:
# Design matrix: the predictor columns of the complete-case frame.
X = model_data.loc[:, iv_cols]

# Outcome reshaped to a single-column 2-D array of shape (n_rows, 1).
target_values = model_data[dv_col].values
y = target_values.reshape(-1, 1)

# Ordinary least squares via scikit-learn. fit() returns the estimator itself, so ending
# the cell with `model` displays the same fitted-estimator repr.
model = LinearRegression().fit(X, y)
model

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


In [8]:
# Refit the same regression with statsmodels to get the inferential summary
# (standard errors, t-statistics, p-values, confidence intervals).
# sm.OLS does not add an intercept by itself, so a constant column is added explicitly.
X2 = sm.add_constant(X)

# Every design-matrix column is cast to float before fitting -- presumably because some
# predictors are not stored as floats; TODO confirm which ones.
est = sm.OLS(endog=y, exog=X2.astype(float))
est2 = est.fit()

print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.087
Model:                            OLS   Adj. R-squared:                  0.026
Method:                 Least Squares   F-statistic:                     1.416
Date:                Tue, 17 Feb 2026   Prob (F-statistic):              0.207
Time:                        10:56:27   Log-Likelihood:                -1041.6
No. Observations:                 112   AIC:                             2099.
Df Residuals:                     104   BIC:                             2121.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                                                                                                                          coef    std err          t      P>|t|      [0.025      0.975]
--------------------------

In [9]:
# Pairwise correlation matrix (pandas default method: Pearson) across every column of the
# modelling frame, i.e. all predictors plus the outcome.
ols_corrs = model_data.corr()

# to_csv() does not create missing parent directories, so make sure the target folder
# exists first; otherwise the cell fails on a fresh checkout where 'outputs/' is absent.
os.makedirs('outputs', exist_ok=True)
ols_corrs.to_csv('outputs/ols_corrs.csv')