In [None]:
import numpy as np
import os
import pandas as pd
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import sys

# Put the parent of the current working directory at the front of sys.path
# (only once), presumably so the project-local `innoprod` package imported
# right after this can be resolved when the notebook runs from a subfolder.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.insert(0, module_path)

from innoprod.sheet_tools import get_sheet_dfs
from innoprod.wrangling.msyh_data_sharing import wrangle_roadmaps, wrangle_grants

In [None]:
# Fetch the source sheets; `data` is indexed by sheet name below
# (presumably a dict of DataFrames -- confirm against get_sheet_dfs).
data = get_sheet_dfs()

# Wrangle each sheet with its dedicated helper.
roadmaps_sheet = data['Roadmaps']
roadmaps_df = wrangle_roadmaps(roadmaps_sheet)

grants_sheet = data['Grants']
grants_df = wrangle_grants(grants_sheet)

In [None]:
# Sum the claimed grant amounts per client: one row per 'Client ID', with a
# single column renamed to 'Total actual amount claimed'.
grants_totals = (
    grants_df[['Client ID', 'Actual amount claimed']]
    .groupby('Client ID')
    .sum()
    .rename(columns={'Actual amount claimed': 'Total actual amount claimed'})
)

# Attach the per-client total to the roadmaps (left join on 'Client ID', so
# roadmap rows without a matching client in grants_totals get NaN).
# Any copy of the total column left over from an earlier run of this cell is
# dropped first: without that, re-running the cell raises
# "ValueError: columns overlap but no suffix specified".
roadmaps_df = (
    roadmaps_df
    .drop(columns=list(grants_totals.columns), errors='ignore')
    .join(grants_totals, on='Client ID')
)

In [None]:
# Independent variables (regressors), assembled from two groups.
# Candidates currently left out of the model:
#   'Number of GAFs'
#   'Employee Increase (FTE calc)'
company_cols = [
    'Turnover',
    'Number of FTE Employees (calc)',
    'Current Digital Readiness Score (refer to PAS:1040)',
    'Do you have a Digital Champion in place?',
]
feedback_cols = [
    'How valuable did you find the involvement of your contact within the programme during the course of the support?',
    'How valuable did you find the GROWTHmapper and its report in identifying the key areas of supporting your business?',
    'How valuable did you find the support you received from the Expert Coach during the course of the programme?',
]
iv_cols = company_cols + feedback_cols

# Dependent variable: the per-client grant total column.
dv_col = 'Total actual amount claimed'

In [None]:
# Keep only the regressors plus the target, then discard every row that has a
# missing value in any of them (complete-case data for the regression).
model_cols = iv_cols + [dv_col]
model_data = roadmaps_df[model_cols].dropna()

# Number of complete rows available to the model.
len(model_data)

In [None]:
# For each regressor, report how many complete rows would remain if that
# regressor were left out -- shows which column costs the most rows in dropna().
for col in iv_cols:
    test_cols = [other for other in iv_cols if other != col]
    test_data = roadmaps_df.loc[:, test_cols + [dv_col]].dropna()
    n_complete = len(test_data)
    print(f'{col}: {n_complete}')

In [None]:
# Design matrix: the regressor columns of the complete-case data.
X = model_data.loc[:, iv_cols]
# Target reshaped into a single-column 2-D array.
y = model_data[dv_col].values.reshape(-1, 1)

# Fit ordinary least squares with scikit-learn; fit() returns the estimator
# itself, so `model` is the fitted model and is shown as the cell output.
model = LinearRegression().fit(X, y)
model

In [None]:
# statsmodels does not add an intercept on its own, so add a constant column
# to the regressors before fitting.
X2 = sm.add_constant(X)

# Same regression as above, fitted with statsmodels to get the full
# inference table; the design matrix is cast to float first.
est = sm.OLS(endog=y, exog=X2.astype(float))
est2 = est.fit()

ols_summary = est2.summary()
print(ols_summary)

In [None]:
# Pairwise correlations between all model columns (regressors and target).
ols_corrs = model_data.corr()

# Create the output folder if needed: to_csv() raises OSError when the
# target directory does not exist (e.g. on a fresh checkout).
os.makedirs('outputs', exist_ok=True)
ols_corrs.to_csv('outputs/ols_corrs.csv')