In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
from datetime import datetime, timezone, timedelta


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and collect the path of every file in it.
# The joined path used to be computed and thrown away, so the loop had no
# visible effect; keep the paths and report how many files are available
# (a count rather than a full listing, to keep the cell output short).
input_files = sorted(
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
print(f"{len(input_files)} input files found under /kaggle/input")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Import :: *engagement_data*

The engagement data are aggregated at school district level, and each file in the folder engagement_data represents data from one school district. The 4-digit file name represents district_id which can be used to link to district information in district_info.csv. The lp_id can be used to link to product information in product_info.csv.

In [None]:
# Glob pattern matching one engagement CSV per school district.
path = "../input/learnplatform-covid19-impact-on-digital-learning/engagement_data/*.csv"
# Alternative pattern for a local copy of the data:
#path = "engagement_data/*.csv"

# glob returns files in arbitrary order; sort so the row order of the
# combined frame is the same on every run.
all_files = sorted(glob.glob(path))
if not all_files:
    # Fail with a clear message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError(f"No engagement CSV files match {path!r}")

all_df = []
for f in all_files:
    df = pd.read_csv(f)
    # The file name (without extension) is the district_id. Take it with
    # os.path so this works with any path separator and does not depend on
    # how Series.str.replace interprets the '.' in '.csv'.
    df["district_id"] = os.path.splitext(os.path.basename(f))[0]
    all_df.append(df)

# Stack all districts into one frame; sort=True orders the columns alphabetically.
df = pd.concat(all_df, ignore_index=True, sort=True)

# Releasing memory
del all_df

In [None]:
# Summarise the combined engagement frame: column names, dtypes and memory usage

df.info()

In [None]:
# Row/column count of the combined frame, followed by a preview of its first rows
print(df.shape)
df.head()

## Data Cleaning :: *engagement_data*

### Handling missing values

In [None]:
# Total number of missing cells, summed over all columns
n_missing = df.isnull().sum().sum()
print(f"The data frame has {n_missing} missing values.")

In [None]:
# Number of missing values in each column
df.isna().sum() 

In [None]:
# Smallest engagement_index that is present (Series.min skips NaN by default)
print(df['engagement_index'].min())

0.01 is the lowest value in the column engagement_index. Where "pct_access" is 0, we can therefore replace a missing engagement_index with 0 without distorting the information in our data frame.

In [None]:
# Set engagement_index to 0 only where it is missing AND pct_access is 0.
# The previous np.where(...) assigned 0 to every row with pct_access == 0,
# which would also overwrite any real (non-missing) engagement_index there.
no_access_and_missing = df["pct_access"].eq(0) & df["engagement_index"].isna()
df.loc[no_access_and_missing, "engagement_index"] = 0

In [None]:
# Re-count the missing values per column after the zero fill
df.isna().sum()  

Both "engagement_index" and "pct_access" now have the same number of NaNs. Let's take a closer look at these rows...

In [None]:
# Rows in which both engagement measures are missing at the same time
both_missing = df["pct_access"].isna() & df["engagement_index"].isna()
df_nan_i = df.loc[both_missing]
df_nan_i.head()

In [None]:
# Releasing memory: drop the reference to the temporary inspection frame
del df_nan_i

In [None]:
# Smallest value of each of the three numeric columns, one per line
for column in ("lp_id", "pct_access", "engagement_index"):
    print(df[column].min())


We will assign "-1" to the remaining NaNs in pct_access and engagement_index (the rows where both are missing). Because "0" is the lowest real value in both columns, "-1" marks these entries unambiguously as missing.

In [None]:
# Mark the remaining gaps in both engagement measures with the sentinel -1
engagement_cols = ['engagement_index', 'pct_access']
df[engagement_cols] = df[engagement_cols].fillna(-1)


In [None]:
# Missing values per column after filling the engagement measures
df.isna().sum()

In [None]:
# Rows that carry no product id
missing_lp_id = df["lp_id"].isna()
df_nan_ii = df.loc[missing_lp_id]
df_nan_ii.head()

In [None]:
# Releasing memory: drop the temporary inspection frame,
# then show the missing-value count per column again
del df_nan_ii
df.isna().sum()

We will replace the remaining lp_id NaNs with "-1" (the lowest lp_id is 10003).

In [None]:
# Replace missing product ids with the sentinel -1
df['lp_id'] = df["lp_id"].fillna(value=-1)

In [None]:
# Missing values per column after filling lp_id
df.isna().sum()

lp_id is stored as a float. We will check whether we can convert it to an integer type without losing information.

In [None]:
# Integer version of lp_id (astype(int) truncates any fractional part)
df["lp_id_2"] = df["lp_id"].astype(int)
# Per-row difference between the float id and its integer version
df["temp"] = df["lp_id"] - df["lp_id_2"]
# A total of 0 indicates that no id has a fractional part
print(df["temp"].sum())

The summed difference between the float lp_id and its integer version (column lp_id_2) is 0, so we can convert the id to an integer without losing information. Afterwards we convert it to a string, because the id is a label rather than a quantity.

In [None]:
# Remove the two helper columns that were only needed for the integer check
df.drop(columns=["lp_id_2", "temp"], inplace=True)
# float -> int -> str, so that an id reads "10003" instead of "10003.0"
df["lp_id"] = df["lp_id"].astype(int).astype(str)
df.head()

## Feature Extraction :: *engagement_index*

We want to extract the weekday, month and week from the given date of this data frame

In [None]:
# Parse the time column into pandas datetime values (required for the .dt accessor)
df["time"] = pd.to_datetime(df["time"])

In [None]:
# Derive calendar features from the parsed timestamp column
timestamps = df["time"]
df["weekday"] = timestamps.dt.dayofweek  # Monday=0 ... Sunday=6
df["month"] = timestamps.dt.month
df["week"] = timestamps.dt.isocalendar().week  # ISO week number

df.head()

In [None]:
# Keep the cleaned engagement data under its own name so that `df` can be
# reused for the next table. Rebinding the name is sufficient: `df` is deleted
# immediately afterwards, so a deep copy would only hold the whole frame in
# memory twice for a moment without protecting anything.
df_eng = df
del df

# Data Import :: *products_info.csv*

In [None]:
# Load the product table and show its dimensions and first rows
df = pd.read_csv("../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv")
print(df.shape)
df.head()

In [None]:
# Number of distinct values in each descriptive product column, one per line
product_columns = (
    "LP ID",
    "Product Name",
    "Provider/Company Name",
    "Sector(s)",
    "Primary Essential Function",
)
for column in product_columns:
    print(df[column].nunique())

In [None]:
# Split "Primary Essential Function" at its first " - ":
# the left part is the main category, the right part everything after it.
function_parts = df["Primary Essential Function"].str.split(" - ", n=1, expand=True)
df["cat"] = function_parts[0]
df["sub_cat"] = function_parts[1]
# The combined column is no longer needed
df.drop(columns=["Primary Essential Function"], inplace=True)

In [None]:
# Split the remainder once more at its first " - ":
# the left part stays in sub_cat, the right part becomes sub_cat_2.
sub_cat_parts = df["sub_cat"].str.split(" - ", n=1, expand=True)
df["sub_cat"] = sub_cat_parts[0]
df["sub_cat_2"] = sub_cat_parts[1]

In [None]:
# Missing values per column of the product table
df.isna().sum()

In [None]:
# Product rows that have at least one missing value
has_missing = df.isna().any(axis=1)
df1 = df.loc[has_missing]
df1

In [None]:
# Releasing memory: the inspection frame is no longer needed
del df1

We replace NaN's of the data frame with "undefined".

In [None]:
# Text columns in which a missing entry is replaced by the label "undefined"
text_cols = ['Sector(s)', 'cat', "sub_cat", "sub_cat_2", "Provider/Company Name"]
df[text_cols] = df[text_cols].fillna("undefined")
# Confirm what is still missing
df.isna().sum()

Because of information redundancy, we drop the "URL" column. We rename the "LP ID" column to "lp_id", as it is labelled in the engagement data frame.

In [None]:
# Drop the URL column, align the id column name with the engagement data
# and store the id as a string so both tables share the same key type.
df = (
    df
    .drop(columns=['URL'])
    .rename(columns={"LP ID": "lp_id"})
    .astype({"lp_id": str})
)
df.head()

In [None]:
# Number of distinct product ids
df.lp_id.nunique()

In [None]:
# Number of distinct sub categories (the "undefined" fill label counts as one value)
df.sub_cat.nunique()

In [None]:
# Distinct values of the second-level sub category
df.sub_cat_2.unique()

In [None]:
# Keep the cleaned product table under its own name so that `df` can be reused.
# `df` is deleted right away, so rebinding the name is enough and the
# redundant deep copy is avoided.
df_prod = df
del df

# Data Import :: *districts_info*

In [None]:
# Load the district table and show its dimensions and column summary
df = pd.read_csv("../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv")
print("Shape of dataframe", df.shape)
df.info()

## Data cleaning and handling missing values of *districts_info*

The *district_id* column holds identifiers rather than quantities. We will convert it from an int to a string type.

In [None]:
# Store the district id as a string (it is a key, not a number to calculate with)
df["district_id"] = df["district_id"].astype(str)

We want to split each two-part percentage column into two separate columns.

In [None]:
# Cast the bracketed columns to str so the .str accessor can be used on them.
# Note that a missing value becomes the literal string "nan" in this step.
bracket_cols = ["pct_black/hispanic", "pct_free/reduced", "county_connections_ratio", "pp_total_raw"]
df[bracket_cols] = df[bracket_cols].astype(str)

In [None]:
# Remove every '[' from the bracketed string columns.
# regex=False makes pandas treat '[' as a literal character on every pandas
# version; as a regular expression a lone '[' is invalid, and older versions
# only accepted it through a deprecated single-character special case.
for column in ("pct_black/hispanic", "pct_free/reduced", "county_connections_ratio", "pp_total_raw"):
    df[column] = df[column].str.replace('[', '', regex=False)


In [None]:
# Split the "a, b" string at its first ", " and store both parts as floats.
# NOTE(review): the source column appears to hold one interval per district; if so,
# pct_black / pct_hispanic are its lower / upper bound rather than two separate
# shares - confirm against the data dictionary.
black_hispanic_parts = df["pct_black/hispanic"].str.split(", ", n=1, expand=True)
df["pct_black"] = black_hispanic_parts[0].astype(float)
df["pct_hispanic"] = black_hispanic_parts[1].astype(float)
# The combined column is no longer needed
df.drop(columns=["pct_black/hispanic"], inplace=True)

In [None]:
# Split the "a, b" string at its first ", " and store both parts as floats.
# NOTE(review): the source column appears to hold one interval per district; if so,
# pct_free / pct_reduced are its lower / upper bound rather than two separate
# shares - confirm against the data dictionary.
free_reduced_parts = df["pct_free/reduced"].str.split(", ", n=1, expand=True)
df["pct_free"] = free_reduced_parts[0].astype(float)
df["pct_reduced"] = free_reduced_parts[1].astype(float)
# The combined column is no longer needed
df.drop(columns=["pct_free/reduced"], inplace=True)
df.head()

The data frame now contains both the string "nan" and real NaN values. First, we unify them by replacing the string "nan" with NaN.

In [None]:
# Fill missing entries with NaN
df = df.fillna(value=np.nan)

nan_value = float("NaN")
# Replace the literal string "nan" (left behind by the earlier cast to str) with a real NaN
df.replace("nan", nan_value, inplace=True)

df.head()

We drop all rows where the "state" or "locale" value is NaN.

In [None]:
# Remove every district that lacks a state or a locale
# (a row is dropped as soon as one of the two is missing)
df.dropna(subset=["state", "locale"], inplace=True)
df.shape


Missing values will be replaced by "-1"

In [None]:
# Columns in which a missing entry is replaced by the sentinel -1
sentinel_cols = [
    "county_connections_ratio",
    "pp_total_raw",
    "pct_black",
    "pct_hispanic",
    "pct_free",
    "pct_reduced",
]
df[sentinel_cols] = df[sentinel_cols].fillna(-1)

In [None]:
# Distinct connection-ratio values, number of distinct pp_total_raw values,
# and number of fully duplicated rows
print(df.county_connections_ratio.unique())
print(df.pp_total_raw.nunique())
print(df.duplicated().sum())

In [None]:
# Remove fully duplicated district rows
df.drop_duplicates(inplace=True)

# A bare `df.shape` in the middle of a cell is evaluated but never displayed,
# so print it to actually show the size after de-duplication.
print(df.shape)
# Keep the cleaned district table under its own name. `df` is deleted right
# away, so rebinding the name is enough and no deep copy is needed.
df_dist = df
# Releasing memory
del df

# Merging All Data Frames

We merge the three data frames into a single data frame. Finally, we save it as a "pickle".

In [None]:
# Attach the district attributes to every engagement row.
# The outer join keeps keys that occur in only one of the two tables;
# their columns from the other table are left empty.
df_merge = pd.merge(df_eng, df_dist, how='outer', on='district_id')
# Releasing memory
del df_eng, df_dist

In [None]:
# Attach the product attributes via lp_id, again as an outer join so that
# ids present in only one of the two tables are kept.
df = pd.merge(df_merge, df_prod, how='outer', on='lp_id')
# Releasing memory
del df_prod, df_merge

In [None]:
# Dimensions and first rows of the fully merged frame
print(df.shape)
df.head()

In [None]:
# Column summary of the fully merged frame
df.info()

In [None]:
# Persist the merged frame as df.pickle in the current working directory
df.to_pickle('df.pickle')