In [1]:
from functools import reduce
import json
import os
import operator
import pandas as pd
import sys

# Make the project's ../src directory importable from this notebook.
module_path = os.path.abspath('../src')
if module_path not in sys.path:
    # Prepend so the local checkout takes precedence over later sys.path entries.
    sys.path.insert(0, module_path)

from innoprod.sheet_tools import get_sheet_dfs
from innoprod.wrangling.wrangling_tools import characterise_df_columnwise, is_non_empty
from innoprod.wrangling.msyh_data_sharing import wrangle_roadmaps

In [2]:
# Load the source sheets; the result is indexed by 'Roadmaps' and 'Grants' below.
data = get_sheet_dfs()

roadmaps_raw = data['Roadmaps']
roadmaps_df = wrangle_roadmaps(roadmaps_raw)

# TODO: write a wrangling function for Grants; until then the sheet is used exactly as loaded.
grants_df = data['Grants']

# Basic Data Characterisation

In [3]:
# Column-wise characterisation of the wrangled Roadmaps data, saved next to the notebook.
roadmaps_characterisation = characterise_df_columnwise(roadmaps_df)
roadmaps_characterisation.to_csv(path_or_buf='characterisation_roadmaps.csv')

In [4]:
# Column-wise characterisation of the Grants data, saved next to the notebook.
grants_characterisation = characterise_df_columnwise(grants_df)
grants_characterisation.to_csv(path_or_buf='characterisation_grants.csv')

In [5]:
# Inspect the Roadmaps sheet as loaded (the input to wrangle_roadmaps), not roadmaps_df.
df = data['Roadmaps']
col = 'Number of PT employees'

# The mask is True where the "equals empty string" flag and the "is NaN" flag agree.
is_blank = df[col] == ''
is_missing = df[col].isna()
mask = is_blank == is_missing

# Number of distinct values among the rows selected by the mask.
df.loc[mask, col].unique().size

np.int64(2)

## Cross-referencing Roadmaps and Grants
The Roadmaps and Grants sheets contain exactly the same set of **Client ID**s:

In [6]:
# Compare the distinct Client IDs on each sheet (duplicates and row order are ignored).
grant_client_ids = set(grants_df['Client ID'])
roadmap_client_ids = set(roadmaps_df['Client ID'])
grant_client_ids == roadmap_client_ids

True

## Do withdrawn grant applications mean clients pull out completely?
This many clients withdrew from the scheme altogether:

In [7]:
# Rows whose application status is anything other than 'Withdrew'.
mask = grants_df['Application Status'] != 'Withdrew'

# Clients that still have at least one such grant row.
clients_with_live_grant = set(grants_df.loc[mask, 'Client ID'])

# Roadmaps clients with no remaining grant row, i.e. every one of their grants was withdrawn.
len(set(roadmaps_df['Client ID']) - clients_with_live_grant)

10

# Does Roadmaps > Number of GAFs match up with the data on Grants?

In [8]:
# Total of the per-client GAF counts on Roadmaps vs. the number of non-null Grant IDs on Grants.
declared_gafs = sum(roadmaps_df['Number of GAFs'])
recorded_grants = grants_df['Grant ID'].count()
declared_gafs == recorded_grants

np.True_

# How many Roadmaps rows have all core data?

In [16]:
# Roadmaps columns that must ALL be non-empty for a row to count as having its core data.
# The names are used verbatim as column keys, so they must match the sheet headers exactly.
core_cols = [
    'Client ID',
    'Number of GAFs',
    'Primary_contact_id',
    'Nature of Business/core activity',
    'Turnover',
    'Enquiry Date',
    'Org Size by Number of FTE (calc)',
    'Number of FTE Employees (calc)',
    'Summary review of Edge Digital diagnostic report & current state and key improvement areas',
    'Current Digital Readiness Score (refer to PAS:1040)',
    # One single (very long) column header, split across lines only for readability.
    (
        'What are the internal barriers to growth? '
        'How do you intend to finance future growth? '
        'Are there sufficient leadership and management skills in the business to achieve your growth? '
        'What opportunities do you have to expand into new markets?'
    ),
    'Main historical barrier',
    'Details of any existing Digital Strategy',
    'Do you have a Digital Champion in place?',
    'Level of current Strategic Digital Skills/knowledge in the business',
    'Level of current Technical Digital Skills/knowledge in the business',
    'Whether the business is already investing/adopting/utilising Industry 4.0 Technologies, with examples',
    'Summary of the identified problems, including Gap Analysis',
    'Key potential industry 4.0 solutions to address the identified problems/gaps',
    'Recommended Action Plan to utilise Industry 4.0 Technology',
    'Overview of qualitative benefits of recommended Action Plan (positive/negative)',
    'Skills and other requirements that will be needed to successfully implement the recommended Action Plan',
    'Application area of technology in the Action Plan',
]

# Requirements/Plans columns; a row needs at least ONE of these to be non-empty.
reqs_plans = [
    'Requirements/Plans: Short Term',
    'Requirements/Plans: Medium Term',
    'Requirements/Plans: Long Term',
]

In [22]:
# A row passes the core check only if every core column is non-empty (as judged by is_non_empty).
core_mask = reduce(
    operator.and_,
    (is_non_empty(roadmaps_df[col]) for col in core_cols),
)

# A row passes the plans check if at least one Requirements/Plans column is non-empty.
reqs_mask = reduce(
    operator.or_,
    (is_non_empty(roadmaps_df[col]) for col in reqs_plans),
)

# Number of Roadmaps rows that satisfy both checks.
mask = core_mask & reqs_mask
sum(mask)

np.int64(90)