In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Flatten the directory walk into one stream of full file paths, then print each path.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Exploring the Categorical Variables

In [None]:
# Read the district and product information tables.
# The dataset folder is named once so both reads stay in sync if the input location changes.
DATA_DIR = '/kaggle/input/learnplatform-covid19-impact-on-digital-learning'
district = pd.read_csv(f'{DATA_DIR}/districts_info.csv')
products = pd.read_csv(f'{DATA_DIR}/products_info.csv')

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called on its own;
# passing it to display() would render an extra "None" line.
district.info()
# Per-column flag: does the column contain at least one missing value?
display(district.isna().any())

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called on its own;
# passing it to display() would render an extra "None" line.
products.info()
# Per-column flag: does the column contain at least one missing value?
display(products.isna().any())

In [None]:
# Show the row count followed by the first rows of each table, districts first.
for frame in (district, products):
    display(len(frame), frame.head())

There is no reason to drop the rows with NaN values from the `district` DataFrame, because every district id still has a CSV file related to it.

### Graphic EDA

#### Graphics for district DataFrame

In [None]:
# Distinct values of the state column (computed once) and how many there are.
unique_states = district['state'].unique()
print(unique_states, len(unique_states))

In [None]:
# Number of districts recorded for each state, most frequent first.
state_counts = district['state'].value_counts()
state_counts

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme and a colour-blind-friendly palette.
sns.set()
sns.set_palette('colorblind')

# Count plot of districts per state, with one bar colour per locale.
fig, ax = plt.subplots(figsize=(14,8))
g = sns.countplot(x='state', hue='locale', data=district, ax=ax)
g.set_xlabel('State')
g.set_ylabel('Count')
g.legend(loc='center right')
# State names are long, so the tick labels are turned vertical.
plt.xticks(rotation=90)
plt.show()

In [None]:
# One horizontal count plot per locale (two panels per row), each with its own axes,
# showing how many districts every state has in that locale.
g2 = sns.catplot(y='state', data=district, kind='count', col='locale', col_wrap=2, sharey=False, sharex=False,
                 height=5, aspect=1.5, color='orange')
g2.set_xticklabels(rotation=0)
g2.set(xlabel='Count', ylabel='State')
# plt.show() is the last call on purpose: a plt.clf() after it would create a brand-new
# empty figure, which the notebook then renders as a stray "<Figure ... with 0 Axes>".
plt.show()

In [None]:
# Distinct values of the primary-function column (computed once) and how many there are.
unique_functions = products['Primary Essential Function'].unique()
print(unique_functions, len(unique_functions))

In [None]:
# Number of products recorded for each primary essential function, most frequent first.
function_counts = products['Primary Essential Function'].value_counts()
function_counts

In [None]:
# Count plot of products per primary essential function, with one bar colour per sector.
plt.figure(figsize=(18,10))
g3 = sns.countplot(x='Primary Essential Function', data=products, hue='Sector(s)')
g3.set(xlabel='Primary Essential Function', ylabel='Count')
# Category names are long, so the tick labels are turned vertical.
plt.xticks(rotation=90)
plt.legend(title='Sectors', fontsize='large', loc='center right')
# plt.show() is the last call on purpose: a plt.clf() after it would create a brand-new
# empty figure, which the notebook then renders as a stray "<Figure ... with 0 Axes>".
plt.show()

In [None]:
# Keep only the products whose provider appears more than once, i.e. providers that
# offer at least MIN_PRODUCTS_PER_PROVIDER products.
# NOTE(review): the earlier comment said "more than 3 products", but the filter has always
# kept providers with more than one product. Behaviour is unchanged here; raise the
# constant if a higher threshold was intended.
MIN_PRODUCTS_PER_PROVIDER = 2
providers = products.groupby('Provider/Company Name').filter(
    lambda group: len(group) >= MIN_PRODUCTS_PER_PROVIDER
)
display(providers, len(providers))

In [None]:
# Count plot of products per provider, with one bar colour per sector.
# Providers are ordered by their number of products, largest first.
provider_order = providers['Provider/Company Name'].value_counts().index
plt.figure(figsize=(18,10))
g4 = sns.countplot(x='Provider/Company Name', data=providers, hue='Sector(s)', order=provider_order)
# The x axis shows providers, so it is labelled with the plotted column
# (it was previously labelled 'Primary Essential Function').
g4.set(xlabel='Provider/Company Name', ylabel='Count')
plt.xticks(rotation=90)
plt.legend(title='Sectors', fontsize='large', loc='center right')
# plt.show() is the last call on purpose: a plt.clf() after it would create a brand-new
# empty figure, which the notebook then renders as a stray "<Figure ... with 0 Axes>".
plt.show()

In [None]:
# Replace the trailing "[" of the range labels in the districts columns with "]".
# The replacement is vectorised and assigned back to the whole column, which avoids
# chained assignment (district[col].loc[i] = ...) and the per-row Python loop.
# It is also idempotent: values that no longer end in "[" are left untouched, so
# re-running the cell does not keep appending "]". Missing values stay NaN.
cols_to_clean = ['pct_black/hispanic', 'pct_free/reduced', 'county_connections_ratio', 'pp_total_raw']
for col in cols_to_clean:
    district[col] = district[col].str.replace(r'\[+$', ']', regex=True)
district.head()

In [None]:
# Count plot of districts per state, coloured by their 'pct_black/hispanic' bracket.
# States are ordered by the number of districts with a non-missing value in that column.
state_order = (
    district.groupby('state')
    .agg({'pct_black/hispanic': 'count'})
    .sort_values(by='pct_black/hispanic', ascending=False)
    .index
)
plt.figure(figsize=(18,8))
g5 = sns.countplot(x='state', data=district, hue='pct_black/hispanic', order=state_order)
g5.set(xlabel='State', ylabel='Number of districts')
plt.xticks(rotation=90)
# A real title replaces the empty placeholder so the figure stands on its own.
plt.title('Districts per state by Black/Hispanic percentage bracket')
plt.legend(title='Black/Hispanic (percent)', fontsize='large', loc='center right')
# plt.show() is the last call on purpose: a plt.clf() after it would create a brand-new
# empty figure, which the notebook then renders as a stray "<Figure ... with 0 Axes>".
plt.show()

In [None]:
# Count plot of districts per state, coloured by their 'pct_free/reduced' bracket.
# States are ordered by the number of districts with a non-missing value in that column.
state_order = (
    district.groupby('state')
    .agg({'pct_free/reduced': 'count'})
    .sort_values(by='pct_free/reduced', ascending=False)
    .index
)
plt.figure(figsize=(18,8))
g6 = sns.countplot(x='state', data=district, hue='pct_free/reduced', order=state_order)
g6.set(xlabel='State', ylabel='Number of districts')
plt.xticks(rotation=90)
# A real title replaces the empty placeholder so the figure stands on its own.
plt.title('Districts per state by free/reduced-price eligibility bracket')
plt.legend(title='Students in the districts eligible for \nfree or reduced-price (percent)', fontsize='large', loc='center right')
# plt.show() is the last call on purpose: a plt.clf() after it would create a brand-new
# empty figure, which the notebook then renders as a stray "<Figure ... with 0 Axes>".
plt.show()

In [None]:
# Count plot of districts per state, coloured by their 'county_connections_ratio' bracket.
# States are ordered by the number of districts with a non-missing value in that column.
state_order = (
    district.groupby('state')
    .agg({'county_connections_ratio': 'count'})
    .sort_values(by='county_connections_ratio', ascending=False)
    .index
)
plt.figure(figsize=(18,8))
g7 = sns.countplot(x='state', data=district, hue='county_connections_ratio', order=state_order)
g7.set(xlabel='State', ylabel='Number of districts')
plt.xticks(rotation=90)
# A real title replaces the empty placeholder so the figure stands on its own.
plt.title('Districts per state by county connections ratio bracket')
plt.legend(title='Residential fixed high-speed connections over \n200 kbps in at least one direction/households', fontsize='large', loc='center right')
# plt.show() is the last call on purpose: a plt.clf() after it would create a brand-new
# empty figure, which the notebook then renders as a stray "<Figure ... with 0 Axes>".
plt.show()

In [None]:
# Count plot of districts per state, coloured by their 'pp_total_raw' bracket.
# States are ordered by the number of districts with a non-missing value in that column.
state_order = (
    district.groupby('state')
    .agg({'pp_total_raw': 'count'})
    .sort_values(by='pp_total_raw', ascending=False)
    .index
)
plt.figure(figsize=(18,8))
g8 = sns.countplot(x='state', data=district, hue='pp_total_raw', order=state_order)
g8.set(xlabel='State', ylabel='Number of districts')
plt.xticks(rotation=90)
# A real title replaces the empty placeholder so the figure stands on its own.
plt.title('Districts per state by local and federal expenditure bracket')
plt.legend(title='Sum of local and federal \nexpenditure', fontsize='large', loc='center right')
# plt.show() is the last call on purpose: a plt.clf() after it would create a brand-new
# empty figure, which the notebook then renders as a stray "<Figure ... with 0 Axes>".
plt.show()

In [None]:
# Pie chart of how many districts fall into each 'pp_total_raw' bracket.
expenditure_counts = district['pp_total_raw'].value_counts()
g9 = expenditure_counts.plot.pie(figsize=(10,10))