In [None]:
import numpy as np
import pandas as pd
import os
import glob
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from plotly.offline import plot
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Root of the dataset; every input file below is resolved against it.
INPUT_DIR = "../input/learnplatform-covid19-impact-on-digital-learning"

path = INPUT_DIR + "/engagement_data"
# glob returns matches in arbitrary order, so sort them to keep the row order
# of the concatenated frame stable from one run to the next.
files = sorted(glob.glob(path + "/*.csv"))
if not files:
    # Fail here, naming the directory, instead of letting pd.concat raise its
    # generic "No objects to concatenate" error further down.
    raise FileNotFoundError(f"No engagement CSV files found in {path!r}")

data = []
for file in files:
    df_raw = pd.read_csv(file)
    # Record which file each row came from (base name, extension included).
    df_raw["filename"] = os.path.basename(file)
    data.append(df_raw)

df_engagement_data = pd.concat(data, ignore_index=True)
districts_info_df  = pd.read_csv(INPUT_DIR + "/districts_info.csv")
products_info_df   = pd.read_csv(INPUT_DIR + "/products_info.csv")

In [None]:
def missing_df(df):
    """Return the percentage of null entries in each column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df``. The ``value`` column holds the share of
        null entries in percent; the column labels of ``df`` are moved out of
        the index into a regular column by ``reset_index``.
    """
    null_counts = df.isnull().sum()
    null_pct = null_counts / len(df) * 100
    return null_pct.to_frame(name="value").reset_index()
    
# Missing-value percentages for each of the three raw tables, in the order
# districts, products, engagement.
(
    missing_districts_info,
    missing_products_info,
    missing_df_engagement_data,
) = map(missing_df, (districts_info_df, products_info_df, df_engagement_data))

In [None]:
# Strip the ".csv" extension so `filename` holds only the file's base name.
# The dot is escaped, the pattern is anchored to the end of the string, and
# regex=True is spelled out: an unescaped "." is a wildcard when the pattern
# is treated as a regex, and the default of `regex` differs between pandas
# versions.
df_engagement_data["filename"] = df_engagement_data["filename"].str.replace(
    r"\.csv$", "", regex=True
)
df_engagement_data

In [None]:
# One pie chart per table. The table name goes into the title so the three
# otherwise identically titled figures can be told apart.
missing_tables = {
    "districts_info": missing_districts_info,
    "products_info": missing_products_info,
    "engagement_data": missing_df_engagement_data,
}
for table_name, missing_data in missing_tables.items():
    fig = px.pie(
        missing_data,
        names="index",    # column labels of the inspected table
        values="value",   # percentage of missing entries per column
        title=f"% of Missing values ({table_name})",
        width=800,
        height=400,
        color_discrete_sequence=px.colors.sequential.RdBu,
    )

    fig.update_traces(textposition='inside', textinfo='value+label')
    fig.show()

In [None]:
# Display the engagement frame again, now that ".csv" has been stripped from `filename`.
df_engagement_data

In [None]:
# Both columns serve as join keys in the merge that follows; cast each to str.
for frame, key_column in (
    (df_engagement_data, "filename"),
    (districts_info_df, "district_id"),
):
    frame[key_column] = frame[key_column].astype(str)

In [None]:
# Attach the district attributes to every engagement row. A left join keeps
# engagement rows even when no district matches their `filename`.
raw_df_eng_dist = df_engagement_data.merge(
    districts_info_df,
    how="left",
    left_on="filename",
    right_on="district_id",
)

In [None]:
# Percentage of missing values per column of the engagement/district merge.
missing_raw_df_eng_dist = missing_df(raw_df_eng_dist)
# Bare last expression so the notebook renders the result.
missing_raw_df_eng_dist

In [None]:
# Compare how many distinct product ids appear in the engagement data with
# how many the product table lists, before the two are joined.
n_products_engaged = df_engagement_data["lp_id"].nunique()
n_products_listed = products_info_df["LP ID"].nunique()
print(n_products_engaged)
print(n_products_listed)

In [None]:
# Left join from the product table: every product row is kept, and products
# with no matching `lp_id` get nulls in the engagement columns.
raw_df_eng_prod = products_info_df.merge(
    df_engagement_data,
    how="left",
    left_on="LP ID",
    right_on="lp_id",
)

In [None]:
# Percentage of missing values per column of the product/engagement merge.
raw_df_eng_prod_missing = missing_df(raw_df_eng_prod)
# Bare last expression so the notebook renders the result.
raw_df_eng_prod_missing

In [None]:
# State distribution: number of district rows per state, with the bars
# ordered from the largest total to the smallest.
fig = px.histogram(districts_info_df, x="state", width=800, height=400)
fig.update_xaxes(categoryorder="total descending")
fig.show()

In [None]:
# List the columns available in the district table.
districts_info_df.columns

In [None]:
# Keep only the district rows whose `locale` is not null.
has_locale = districts_info_df["locale"].notna()
clean_districts_info_df = districts_info_df[has_locale]

In [None]:
# Count district rows per locale. The axis and value names are set explicitly:
# left to its defaults, value_counts().reset_index() yields the columns
# "index"/"locale" on pandas 1.x but "locale"/"count" on pandas >= 2.0, which
# breaks any code that looks the columns up by name. This form produces
# "index" (locale label) and "locale" (row count) on every version.
df = (
    clean_districts_info_df["locale"]
    .value_counts()
    .rename_axis("index")
    .reset_index(name="locale")
)

In [None]:
# Share of district rows per locale as a donut chart (`px` comes from the
# import cell at the top of the notebook).
# Columns are picked by position (labels first, counts second) because the
# names produced by value_counts().reset_index() depend on the pandas version.
locale_col, count_col = df.columns[:2]
fig = px.pie(df,
             values = count_col,
             names = locale_col,
             color_discrete_sequence= px.colors.sequential.Plasma,
             hole=0.2)

fig.update_traces(hoverinfo='percent', textinfo='value+label')
fig.show()

#### Product information (products_info.csv)

In [None]:
# value_counts() sorts in descending order, so the first 20 index labels are
# the 20 providers with the most product rows.
top_providers = products_info_df['Provider/Company Name'].value_counts().index[:20]
ax = sns.countplot(y="Provider/Company Name", data=products_info_df, order=top_providers)
ax.set_title("Top 20 Provider/Company")

In [None]:
# Number of product rows per distinct `Sector(s)` value, flattened into a
# two-column frame ("Sector(s)", "counts").
sector_sizes = products_info_df.groupby("Sector(s)").size()
agg_prod = sector_sizes.reset_index(name="counts")

In [None]:
# 1x1 subplot grid whose single cell is of type "domain".
pie_cell_spec = {"type": "domain"}
fig = make_subplots(rows=1, cols=1, specs=[[pie_cell_spec]])

In [None]:
# Slice colours for the sector pie.
cafe_colors =  ['rgb(146, 123, 21)', 'rgb(177, 180, 34)', 'rgb(206, 206, 40)',
                'rgb(175, 51, 21)', 'rgb(35, 36, 21)']

# Build the figure in the same cell that adds the trace: add_trace appends to
# whatever `fig` currently holds, so re-running this cell against an existing
# figure would stack a second, duplicate pie on top of the first.
fig = make_subplots(rows=1, cols=1, specs=[[{'type': 'domain'}]])
fig.add_trace(
    go.Pie(
        values=agg_prod["counts"],
        labels=agg_prod["Sector(s)"],
        marker_colors=cafe_colors,
    ))
# Last expression: update_traces returns the figure, so the notebook renders it.
fig.update_traces(hoverinfo='percent', textinfo='label+value')

In [None]:
# value_counts() sorts in descending order, so the first 20 index labels are
# the 20 most frequent primary essential functions.
top_functions = products_info_df['Primary Essential Function'].value_counts().index[:20]
ax = sns.countplot(
    y="Primary Essential Function",
    data=products_info_df,
    order=top_functions,
)
# Titled like the provider chart so the figure stands on its own.
ax.set_title("Top 20 Primary Essential Functions")
