In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

# Common parent folder of the three dataset locations used by this notebook.
_dataset_root = "../input/learnplatform-covid19-impact-on-digital-learning"

# Folder of per-district engagement CSVs, and the two lookup tables.
engagement_dir = _dataset_root + "/engagement_data"
districts_path = _dataset_root + "/districts_info.csv"
products_path = _dataset_root + "/products_info.csv"

# Prepare Districts

In [None]:
def load_districts(path:str):
    '''
    Read the district table and keep only fully populated rows.

    A district row with a missing value in any column is discarded; the
    remaining districts are the 'valuable' ones analysed by the notebook.

    Parameters
    ----------
    path : str
        Location of the districts CSV file.

    Returns
    -------
    tuple
        ``(districts, district_ids)``: the filtered DataFrame and its
        'district_id' column as a Series (original row labels are kept).
    '''
    all_districts = pd.read_csv(path)
    complete_districts = all_districts.dropna()
    district_ids = complete_districts['district_id']
    return complete_districts, district_ids

In [None]:
# Keep both the filtered district table and the ids of the districts in it.
districts, valuable_districts = load_districts(districts_path)

# Assemble Engagement Data

In [None]:
def load_engagement(directory:str, districts):
    '''
    Load and concatenate the engagement data of all 'valuable' districts.

    Every file in ``directory`` named ``<district id>.csv`` whose id occurs in
    ``districts`` is read, rows containing any missing value are dropped, and
    a 'district' column holding the id is added. Entries that do not follow
    the ``<integer>.csv`` naming scheme are ignored.

    Parameters
    ----------
    directory : str
        Folder containing one engagement CSV per district.
    districts : pandas.Series or other sized iterable
        Ids of the districts whose engagement data should be kept.

    Returns
    -------
    pandas.DataFrame
        The kept rows of all matching files, ordered by file name, with a
        fresh 0..n-1 index. The columns 'time', 'lp_id', 'pct_access',
        'engagement_index' and 'district' are always present and come first.
        The frame is empty when no file matched.
    '''
    expected_columns = ['time', 'lp_id', 'pct_access', 'engagement_index', 'district']
    # A set gives O(1) membership tests and works for Series, arrays and lists alike.
    wanted_ids = set(districts)
    frames = []
    counter = 0
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the result reproducible between runs.
    for file_name in sorted(os.listdir(directory)):
        stem, extension = os.path.splitext(file_name)
        # Skip anything that is not "<integer>.csv" (checkpoint folders, notes, ...).
        if extension.lower() != '.csv' or not stem.isdecimal():
            continue
        district = int(stem)
        if district not in wanted_ids:
            continue
        counter += 1
        if counter % 8 == 0:
            print(f'{counter}/{len(districts)}')
        frame = pd.read_csv(os.path.join(directory, file_name)).dropna()
        if frame.empty:
            # Nothing to contribute; also keeps empty frames out of pd.concat.
            continue
        frames.append(frame.assign(district=district))
    if not frames:
        return pd.DataFrame(columns=expected_columns)
    # Concatenate once instead of growing a DataFrame inside the loop.
    engagement = pd.concat(frames, ignore_index=True)
    extra_columns = [name for name in engagement.columns if name not in expected_columns]
    return engagement.reindex(columns=expected_columns + extra_columns)

In [None]:
# Read the engagement files of the selected districts into one frame.
engagement_data = load_engagement(engagement_dir, valuable_districts)

# Merge District Data with Engagement Data

In [None]:
# Give the engagement frame the same key name as the district table,
# then attach the district attributes to every engagement row.
engagement_data = engagement_data.rename(columns={"district": "district_id"})
data = engagement_data.merge(districts, on="district_id")

# Merge with Product Data

In [None]:
# Load the product table and rename its key column to match the engagement data.
products = pd.read_csv(products_path).rename(columns={"LP ID": "lp_id"})
# Cast the key to int before joining on it, then attach the product attributes.
data = data.astype({"lp_id": int}).merge(products, on="lp_id")

## Get an Overview

In [None]:
# Bare expression: the notebook renders `data` as this cell's output.
data

In [None]:
# Render floats with two fixed decimals instead of scientific notation.
pd.set_option('display.float_format', '{:.2f}'.format)

# Summary statistics of the merged frame, shown as the cell output.
data.describe()

In [None]:
# Write the merged frame to 'data.csv' (relative path); the row index is left out.
data.to_csv(path_or_buf='data.csv', index=False)