In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob 
import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px

# Table of Contents
* [Read Data](#Read-data)
* [Inspect and Clean Data](#Inspect-and-Clean-Data)
* [Univariate Analysis](#univ-analysis)
* [Numeric Columns](#numeric_columns)
* [Numeric_Column_Engagement_Index](#Numeric_Column_Engagement_Index)
* [Numeric_Column_pct_access](#Numeric_Column_pct_access)
* [Categorical Columns](#cat_cols)
* [Data Cleaning for categorical columns](#cat_col_clean)
* [pct_free/reduced](#pct_free/reduced)
* [Sector](#Sector)
* [pp_total_raw](#pp_total_raw)
* [Primary Essential Function](#pef)
* [Product Name](#pn)
* [Column Time](#time)
* [Bivariate  and Multivariate Analysis](#bivar)
* [pct_access vs Engagement index and locale](#p_vs_e)
* [Demographic Categorical values vs pct_access](#dem_vs_eng)
* [Questions](#Q)
* Q1 [For which Primary Essential function is the online learning facility widely used? ](#Q1)
* Q2 [How many products that have Primary Essential function as Digital Learning Platforms? ](#Q2)
* Q3 [What are the top 10 products in terms of engagement index that have Primary Essential function as Digital Learning Platforms? ](#Q3)
* Q4 [What are the top 10 products in terms of pct_access that have Primary Essential function as Digital Learning Platforms? ](#Q4)
* Q5 [Did the locale influence the pct_access or engagement_index?](#Q5)
* Q6 [What products are popular in different locales?](#Q6)
* Q7 [Does the popularity of products depend on pct_black/hispanic](#Q7)
* Q8 [Which Products are popular during weekdays? Are they different from the ones popular on weekends?](#Q8)
* Q9 [Which Products are popular based on the Sector?](#Q9)
* Q10 [Which are the popular products across different states?](#Q10)
* Q11 [Which Primary Essential Function does the product which is popular across different state belong to?](#Q11)
*  [External Data](#ED)
* Q12a [What are the products popular in States with high kids population(Utah and Texas)?](#Q12a)
* Q12b [What are the products popular in States with low kids population(Florida and New Hampshire)?](#12b)
* Q13a [What are the primary Essential Functions of the products popular in States with high kids population(Utah and Texas)?](#Q13a)
* Q13b [What are the primary Essential Functions of the products popular in States with low kids population(New Hampshire and Florida)?](#Q13b)
* Q14 [Is there any correlation between the percentage of black/hispanic to the funds invested by NERDS?](#Q14)
* Q15 [Is there any correlation between the percentage of black/hispanic to Percentage of students in the districts eligible for free or reduced-price lunch?](#Q15)
* [Conclusion and Summary](#CnS)



<a id="Read-data"></a>
# Read Data

In [None]:
# Load every per-district engagement CSV, tag each row with its district id,
# and combine the pieces with a single concat (re-concatenating the growing
# frame on every iteration copies all previously loaded rows each time).
engagement_frames = []
for filename in glob.glob("../input/learnplatform-covid19-impact-on-digital-learning/engagement_data/*.csv"):
    dist_id = filename.split("/")[-1].split(".")[0] # Extracting the district id from the filename
    eng_df = (
        pd.read_csv(filename)
        .dropna(subset=["engagement_index"])  # keep only rows that have an engagement index
        .assign(dist_id=dist_id)              # returns a new frame, so no write into a filtered slice
    )
    engagement_frames.append(eng_df)

# pd.concat raises on an empty list, so fall back to an empty frame when no file matched
eng_df_tot = pd.concat(engagement_frames) if engagement_frames else pd.DataFrame()

# District and product lookup tables
dist_df = pd.read_csv("../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv")
prod_df = pd.read_csv("../input/learnplatform-covid19-impact-on-digital-learning/products_info.csv")

<a id="Inspect-and-Clean-Data"></a>
# Inspect and Clean Data

In [None]:
# Print the first rows of each raw table under a banner line
banner = '*'*50
for label, frame in (("Engagement Info", eng_df_tot), ("Product Info", prod_df), ("District Info", dist_df)):
    print(banner, label, banner)
    print(frame.head())

In [None]:
# Attach the product metadata to every engagement record (inner join on the product id).
# The district table is joined in a later cell.
prod_eng_merged = eng_df_tot.merge(prod_df, left_on="lp_id", right_on="LP ID")
prod_eng_merged.head()

In [None]:
prod_eng_merged.isnull().sum()/len(prod_eng_merged)

In [None]:
prod_eng_merged[prod_eng_merged["Sector(s)"].isnull()].head()

### Observation
Since the rows with null values make up only around 4 % of the data, we can drop those rows

In [None]:
prod_eng_merged = prod_eng_merged.dropna()

In [None]:
dist_df.info()

In [None]:
#District dataframe has a lot of null values
dist_df.isnull().sum()/len(dist_df)

In [None]:
#Keep only records where state info is present
dist_df = dist_df[~dist_df.state.isnull()]

In [None]:
dist_df.info()

In [None]:
dist_df[(dist_df[["pct_black/hispanic","pct_free/reduced","county_connections_ratio","pp_total_raw"]].isnull().sum(axis=1) >=2)]

In [None]:
dist_df = dist_df.fillna("Data not Available")

In [None]:
# Cast the district id to pandas' string dtype
dist_df = dist_df.assign(district_id=dist_df["district_id"].astype("string"))

In [None]:
# Join the district attributes onto the product/engagement records (inner join on district id)
prod_eng_dist_merged = prod_eng_merged.merge(dist_df, left_on="dist_id", right_on="district_id")
prod_eng_dist_merged.head()

In [None]:
#There are two columns for product id lets check if there are any mismatches
prod_eng_dist_merged[prod_eng_dist_merged["lp_id"] != prod_eng_dist_merged["LP ID"]].shape

In [None]:
# "lp_id" duplicates "LP ID": drop it and store the remaining product id column as string
prod_eng_dist_merged = prod_eng_dist_merged.drop(columns="lp_id")
prod_eng_dist_merged["LP ID"] = prod_eng_dist_merged["LP ID"].astype("string")

In [None]:
#There are two columns for district id lets check if there are any mismatches
prod_eng_dist_merged[prod_eng_dist_merged["dist_id"] != prod_eng_dist_merged["district_id"]].shape

In [None]:
# "dist_id" duplicates "district_id", so only the latter district id column is kept
prod_eng_dist_merged = prod_eng_dist_merged.drop(columns="dist_id")


In [None]:
#Changing time column to data type time
# pd.to_datetime parses the values into datetime64[ns]; the unit-less
# astype("datetime64") spelling is rejected by pandas 2.x.
prod_eng_dist_merged["time"] = pd.to_datetime(prod_eng_dist_merged["time"])

In [None]:
prod_eng_dist_merged.isnull().sum()

<a id =univ-analysis></a>
# Univariate Analysis

In [None]:
# Split the column names into text-like (object/string) columns and the rest,
# leaving the datetime column out of the numeric list
text_dtypes = ["object","string"]
num_cols = prod_eng_dist_merged.select_dtypes(exclude = text_dtypes + ["datetime"]).columns.tolist()
cat_cols = prod_eng_dist_merged.select_dtypes(include = text_dtypes).columns.tolist()
print("Numeric cols:",num_cols)
print("Categoric cols:",cat_cols)

<a id =numeric_columns></a>
# Numeric Columns

<a id = Numeric_Column_Engagement_Index></a>
### Numeric Column Engagement Index
According to the data dictionary the definition is :
"*Total page-load events per one thousand students of a given product and on a given day*"

How is the data distributed in the column Engagement Index?

In [None]:
sns.displot(data=prod_eng_dist_merged, x="engagement_index", bins=20, stat="probability")
plt.title("Distribution of data in the column Engagement Index");

Most of the records seem to have an engagement index of less than 500. Let's look at a narrower range.

In [None]:
# Fraction of records on each side of an engagement index of 50
total_rows = prod_eng_dist_merged.shape[0]
rows_up_to_50 = prod_eng_dist_merged[prod_eng_dist_merged["engagement_index"] <= 50].shape[0]
rows_above_50 = prod_eng_dist_merged[prod_eng_dist_merged["engagement_index"] > 50].shape[0]
print("Records with engagement index less than 50 make up to : ", round(rows_up_to_50/total_rows,2))
print("Records with engagement index more than 50 make up to : ", round(rows_above_50/total_rows,2))

Another way to see it visually is by plotting a distribution plot with the cumulative option.

In [None]:
sns.displot(data=prod_eng_dist_merged[prod_eng_dist_merged["engagement_index"] <= 100], x="engagement_index", bins=20, stat="probability",cumulative =True)
plt.title("Distribution of data in the column Engagement Index");

##### Observation
Around 80% of records have engagement Index of less than 20 so let us filter out and see the distribution again

<a id = numcl1></a>
#### Numerical Column Engagement Index Cleaning

In [None]:
df_eng_lt_20 = prod_eng_dist_merged[prod_eng_dist_merged["engagement_index"] <= 20]

In [None]:
sns.displot(data=df_eng_lt_20, x="engagement_index", bins=20, stat="probability")
plt.title("Distribution of data in the column Enagement Index");

##### Observation
The Engagement Index data is skewed; the majority of the records have a value of less than 10.

In [None]:
df_eng_lt_10 = prod_eng_dist_merged[prod_eng_dist_merged["engagement_index"] <= 10]

<a id = Numeric_Column_pct_access></a>
### Numeric Column pct_access

In [None]:
sns.set_context("poster")
# displot is a figure-level function that builds its own figure, so the size is
# set through `height` (aspect defaults to 1) instead of opening a separate
# plt.figure, which would be left behind as an empty figure.
ax=sns.displot(data = df_eng_lt_10,x ="pct_access",bins=20,stat="probability",height=10)
ax.set(title="Inspecting distribution of data in numerical column "+"pct_access")
# The displot figure is now the current figure, so the heading lands on it
plt.suptitle("Inspecting numerical columns", y=1.02);

##### Observation
Most of the pct_access values are between 0 and 30 percent. According to the data dictionary, pct_access means "Percentage of students in the district have at least one page-load event of a given product and on a given day".

<a id = numcl2></a>
#### Numerical Column Percent Access Cleaning

Let us see how the data is divided for the pct_access less than 30 percent and those with above 30 percent access.

In [None]:
# Split df_eng_lt_10 at pct_access = 0.30 into two non-overlapping shares.
# The upper share uses a strict ">" so rows exactly on the boundary are not
# counted in both groups, and both shares are scaled by 100 to match the "%" suffix.
low_access_share = (df_eng_lt_10.pct_access <= .30).mean()
high_access_share = (df_eng_lt_10.pct_access > .30).mean()
print("Data set size for pct_acces < = 30 % is : ", round(low_access_share*100,2),"%")
print("Data set size for pct_acces >30 % is :", round(high_access_share*100,2),"%")

##### Observation
About 90% of the records have pct_access <= 30 percent, hence we can filter the data to keep only the records with at most 30 percent access

Another way to visualise it with cumulative distribution plots

In [None]:
sns.set_style("darkgrid")
sns.set_context("poster")

# Cumulative distribution of pct_access for records at or below 0.30.
# displot creates its own figure, so it is sized with `height` rather than a
# preceding plt.figure call (which would only leave an empty figure behind).
ax=sns.displot(data = df_eng_lt_10[df_eng_lt_10.pct_access <=.30],x ="pct_access",bins=20,stat="probability", cumulative=True,height=10)
ax.set(title="Inspecting distribution of data in numerical column "+"pct_access");

Let us filter the data and look further at the distribution in this filtered data set

In [None]:
df_pct_acc_lt_30 = df_eng_lt_10[df_eng_lt_10.pct_access <=.30]

In [None]:
sns.set_style("darkgrid")
sns.set_context("poster")

# Distribution of pct_access within the filtered frame.
# displot creates its own figure, so it is sized with `height` rather than a
# preceding plt.figure call (which would only leave an empty figure behind).
ax=sns.displot(data = df_pct_acc_lt_30,x ="pct_access",bins=20,stat="probability",height=10)
ax.set(title="Inspecting distribution of data in numerical column "+"pct_access");


##### Observation
Almost half of the records are in the range of 0 to 0.05  pct_access, and 80% are below 0.15 pct_access. The data is skewed towards the lower range of 0 to 10 percent. This is further supported by the cumulative graph seen above.

In [None]:
# Let us retain records with pct_access less than equal to .15
df_pct_acc_lt_15 = df_pct_acc_lt_30[df_pct_acc_lt_30.pct_access <=.15]

<a id = cat_cols></a>
# Categorical Columns

<a id = cat_col_clean></a>
### Data Cleaning for categorical columns

##### county_connections_ratio
According to the data dictionary: ratio (residential fixed high-speed connections over 200 kbps in at least one direction/households) based on the county level data from FCC Form 477 (December 2018 version). See FCC data for more information.

In [None]:
# Show up to 10 rows with no county_connections_ratio. The fixed seed makes the
# displayed rows repeatable, and the min() avoids an error when fewer than 10 rows match.
missing_ccr_rows = df_pct_acc_lt_15[df_pct_acc_lt_15["county_connections_ratio"] == "Data not Available"]
missing_ccr_rows.sample(min(10, len(missing_ccr_rows)), random_state=42)

In [None]:
# Share of rows without a county_connections_ratio, as a whole-number percentage.
# Scaling before rounding avoids float artefacts such as 7.000000000000001.
missing_ccr_share = (df_pct_acc_lt_15["county_connections_ratio"] == "Data not Available").mean()
print("The percentage of rows that have no data available for county_connections_ratio is: "+str(round(float(missing_ccr_share)*100)))

The percentage of rows with "Data not Available" for county_connections_ratio is 8 %, hence we will drop these rows.

In [None]:
df_pct_acc_lt_15 = df_pct_acc_lt_15[df_pct_acc_lt_15["county_connections_ratio"] != "Data not Available"]

<a id = pct_free/reduced></a>
#### pct_free/reduced
According to data dictionary percentage_free/reduced is *Percentage of students in the districts eligible for free or reduced-price lunch based on 2018-19 NCES data*

In [None]:
# Show up to 10 rows with no pct_free/reduced. The fixed seed makes the
# displayed rows repeatable, and the min() avoids an error when fewer than 10 rows match.
missing_pfr_rows = df_pct_acc_lt_15[df_pct_acc_lt_15["pct_free/reduced"] == "Data not Available"]
missing_pfr_rows.sample(min(10, len(missing_pfr_rows)), random_state=42)

In [None]:
# Share of rows without a pct_free/reduced value, as a whole-number percentage.
# Scaling before rounding avoids float artefacts such as 7.000000000000001.
missing_pfr_share = (df_pct_acc_lt_15["pct_free/reduced"] == "Data not Available").mean()
print("The percentage of rows that have no data available for pct_free/reduced is: "+str(round(float(missing_pfr_share)*100)))

The percentage of rows with "Data not Available" for pct_free/reduced is 12 %, hence we will drop these rows.

In [None]:
df_pct_acc_lt_15 = df_pct_acc_lt_15[df_pct_acc_lt_15["pct_free/reduced"]!="Data not Available"]

<a id = Sector></a>
#### Sector(s)
According to the data dictionary, Sector(s) is the *Sector of education where the product is used*

In [None]:
# Share of rows without a Sector(s) value, as a whole-number percentage.
# Scaling before rounding avoids float artefacts such as 7.000000000000001.
missing_sector_share = (df_pct_acc_lt_15["Sector(s)"] == "Data not Available").mean()
print("The percentage of rows that have no data available for Sector(s) is: "+str(round(float(missing_sector_share)*100)))

The percentage of rows with "Data not Available" for Sector(s) is 0 %.

<a id = pp_total_raw></a>
#### pp_total_raw 
According to the data dictionary the *Per-pupil total expenditure (sum of local and federal expenditure) from Edunomics Lab's National Education Resource Database on Schools (NERD$) project. The expenditure data are school-by-school, and we use the median value to represent the expenditure of a given school district.*
The column "pp_total_raw" has a large number of data not available. Let us look at that.

In [None]:
# Share of rows without a pp_total_raw value, as a whole-number percentage.
# Scaling before rounding avoids float artefacts such as 7.000000000000001.
# The message now spells the column name correctly.
missing_pp_share = (df_pct_acc_lt_15["pp_total_raw"] == "Data not Available").mean()
print("The percentage of rows that have no data available for pp_total_raw is: "+str(round(float(missing_pp_share)*100)))

In [None]:
# Show up to 10 rows with no pp_total_raw. The fixed seed makes the
# displayed rows repeatable, and the min() avoids an error when fewer than 10 rows match.
missing_pp_rows = df_pct_acc_lt_15[df_pct_acc_lt_15["pp_total_raw"] == "Data not Available"]
missing_pp_rows.sample(min(10, len(missing_pp_rows)), random_state=42)

The data for this column can be analysed by omitting this value. It can be either considered as the data was not supplied for confidentiality purpose or no grants were given.

In [None]:
cat_cols = df_pct_acc_lt_15.select_dtypes(include=["object","string"]).columns
cat_cols

We will make a list of the categorical columns that have few enough unique values to be visualized, separate from those that have too many.


In [None]:
# Report how many distinct values (missing values included) each categorical column holds
for col in cat_cols:
    n_unique = df_pct_acc_lt_15[col].nunique(dropna=False)
    print("For columns",col,"the number of unique values are",n_unique)

In [None]:
cat_cols_less = ['Sector(s)', 'state', 'locale', 'pct_black/hispanic', 'pct_free/reduced',  'pp_total_raw']

Column 'county_connections_ratio' has only one value, hence it can be eliminated.

Now we will draw count plots to see how many records of each category is present

In [None]:
sns.set_context("notebook")
# One row per categorical column. The grid is sized from the list itself, and
# each panel is titled through its own Axes; the earlier version titled an
# `ax` left over from a previous cell, not the panel being drawn.
f, axes = plt.subplots(len(cat_cols_less), 1, figsize=(20,20), squeeze=False)
for ax, col in zip(axes.ravel(), cat_cols_less):
    sns.countplot(data=df_pct_acc_lt_15, x=col, ax=ax)
    ax.set(title="Inspecting the distribution of categories in the categorical column "+col)

# suptitle documents `fontsize`, not `fontdict`
f.suptitle("Inspecting categorical columns", fontsize=25)
f.tight_layout();

<a id = pef></a>
#### Primary Essential Function 
According to the data dictionary "Primary Essential Function", the basic function of the product. There are two layers of labels here. Products are first labeled as one of these three categories: LC = Learning & Curriculum, CM = Classroom Management, and SDO = School & District Operations. Each of these categories have multiple sub-categories with which the products were labeled

In [None]:
#Checking the top 10 unique values
df_pct_acc_lt_15["Primary Essential Function"].value_counts(normalize=True).nlargest(10)

There are 36 unique Primary Essential Functions. 

<a id = pn></a>
#### Product Name 


In [None]:
#Checking the top 10 number of unique values
df_pct_acc_lt_15["Product Name"].value_counts(normalize=True).nlargest(10)

<a id =time></a>
### Column Time

In [None]:
print("The data set contains records from date",df_pct_acc_lt_15["time"].min(),"to",df_pct_acc_lt_15["time"].max())

In [None]:
# Histogram of record dates split into 12 bins
sns.set_context("poster")
f, month_ax = plt.subplots(figsize=(15,8))
sns.histplot(df_pct_acc_lt_15["time"],bins=12,color="g",ax=month_ax)
month_ax.set_title("Number of records for the different months")
f.tight_layout()

### Observation
The number of records is high from September to November, which may be associated with the start of a new academic year. June, July and August have the lowest number of records, possibly because of the end-of-academic-year holidays.

Let us see if the weekday plays a role in the number of records.

In [None]:
# let us see if there is a pattern for week days
sns.set_context("poster")
f = plt.figure(figsize=(20,8))
# An ordered categorical makes the bars run Monday -> Sunday instead of in the
# order the day names happen to appear in the data.
weekday_order = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
day_names = df_pct_acc_lt_15["time"].dt.day_name().astype(pd.CategoricalDtype(weekday_order, ordered=True))
sns.histplot(day_names,bins=7,color="coral")
plt.title("Number of records for the different days of the week");

<a id = bivar></a>
# Bivariate  and Multivariate Analysis

<a id = p_vs_e></a>
#### pct_access vs Engagement index and locale

Let us check if there is a relationship between the two numerical columns.

In [None]:
# Scatter of the two numeric columns against each other, coloured by locale
sns.set_context("notebook")
f, scatter_ax = plt.subplots(figsize=(20,8))
sns.scatterplot(data = df_pct_acc_lt_15, x="engagement_index", y="pct_access",hue="locale",ax=scatter_ax)
scatter_ax.set_title("Realtionship between pct_access and engagement index");

### Observation

Products with low pct_access but a high engagement index are seen mostly in cities. These may be products used by the corporate and higher-education sectors.

<a id = dem_vs_eng></a>
### Demographic Categorical values vs pct_access 

Now let us look at the relationship between the categorical columns that hold demographic details and pct_access

In [None]:
sns.set_context("notebook")
# One boxplot row per categorical column, with the grid sized from the list itself
f, axes = plt.subplots(len(cat_cols_less), 1, figsize=(20,30), squeeze=False)
for ax, col in zip(axes.ravel(), cat_cols_less):
    # Order the categories by their median pct_access, highest first
    plt_order = list(df_pct_acc_lt_15.groupby(col)["pct_access"].median().sort_values(ascending=False).index)
    sns.boxplot(data=df_pct_acc_lt_15,x=col,y="pct_access",order=plt_order,ax=ax)

# suptitle documents `fontsize`, not `fontdict`; the layout is computed once, after all panels exist
f.suptitle("Inspecting the spread of values for pct_access for different demographic categorical columns", fontsize=25)
f.tight_layout();
        

### Observation 

1. The Corporate sector dedicated products though has the lowest count in the records has a high pct_access compared to the other categories.

2. The higher education products have a very low count of records and low pct_access as compared to other sectors. 

3. We can further discover the products that are popular in the corporate sector and educational sectors which are having the higher pct_access

4. The pct_access for New Hampshire and New York is seen to be higher than for the other states. Florida and Texas seem to be the lowest.

5. The pct_access of the rural is seen to be higher than the rest of the locales.

6. The race has no impact on the engagement index.

8. The highest range in pp_total_raw of 22000 to 24000 and 32000 and 34000 have a higher pct_access compared to others.

# Questions

<a id =Q1></a>
#### Question 1
# For which Primary Essential function is the online learning facility widely used? 

In [None]:
# Each Primary Essential Function's share (in %) of the total engagement index, top 10
engagement_by_pef = df_pct_acc_lt_15.groupby("Primary Essential Function")["engagement_index"].sum()
pef_share_pct = engagement_by_pef/df_pct_acc_lt_15["engagement_index"].sum()*100
top_10_pef = pef_share_pct.round(2).nlargest(10)
top_10_pef

### Observation
About 20% of the total engagement index comes from Digital Learning Platforms, 15% from Sites, Resources and Reference, 10% from content creation tools and 9% from study tools

<a id =Q2></a>
#### Question 2
# How many products that have Primary Essential function as Digital Learning Platforms? 

In [None]:
df_pct_acc_lt_15[df_pct_acc_lt_15["Primary Essential Function"] == "LC - Digital Learning Platforms"]["Product Name"].nunique()

##### Observation 
There are 74 unique products. Let us see which are the top 10 in terms of engagement index.


<a id =Q3></a>
#### Question 3
# What are the top 10 products in terms of engagement index that have Primary Essential function as Digital Learning Platforms? 

In [None]:
# Digital Learning Platform rows only, then the 10 products with the largest summed engagement index
is_dlp = df_pct_acc_lt_15["Primary Essential Function"] == "LC - Digital Learning Platforms"
dlp_df = df_pct_acc_lt_15[is_dlp]
engagement_per_product = dlp_df.groupby("Product Name")["engagement_index"].sum()
top10 = engagement_per_product.nlargest(10)
top10

In [None]:
# Horizontal bar chart of the top-10 series computed in the previous cell
sns.set_palette("colorblind")
sns.set_context("notebook")
f, bar_ax = plt.subplots(figsize=(10,8))
df = top10.reset_index()
sns.barplot(data = df,y="Product Name", x="engagement_index",orient='h',ax=bar_ax)
bar_ax.set_title("Top engagement index products ")
f.tight_layout()


The top 10 products in terms of engagement index with Primary Essential Function LC - Digital Learning Platforms are:
Duolingo, Typing.com, TypingClub, Code.org, Study.com, IXL Math, BrainPOP, BrainPOP Jr., ABCmouse.com, Tynker

<a id =Q4></a>
#### Question 4
# What are the top 10 products in terms of pct_access that have Primary Essential function as Digital Learning Platforms? 

In [None]:
# Digital Learning Platform rows only, then the 10 (state, product) pairs with the largest summed pct_access
is_dlp = df_pct_acc_lt_15["Primary Essential Function"] == "LC - Digital Learning Platforms"
dlp_df = df_pct_acc_lt_15[is_dlp]
access_per_state_product = dlp_df.groupby(["state","Product Name"])["pct_access"].sum()
top10 = access_per_state_product.nlargest(10)
top10

In [None]:
# Horizontal bar chart of the top (state, product) pairs, coloured by state
sns.set_context("notebook")
f, bar_ax = plt.subplots(figsize=(20,8))

df = top10.reset_index()
sns.barplot(data = df,y="Product Name", x="pct_access",orient='h',hue='state',ax=bar_ax)
bar_ax.set_title("Top 10 pct_access products with their state names");

f.tight_layout();

The top 10 (state, product) combinations by total pct_access among the LC - Digital Learning Platforms products are listed below in descending order of pct_access:
in Utah it is Study.com,
in Connecticut it is Study.com, Kids A-Z, Raz-Kids, AdaptedMind,
in Illinois it is AdaptedMind, Study.com, Kids A-Z,
in Utah it is Kids A-Z,
in Connecticut it is Duolingo

<a id =Q5></a>
#### Question 5 
# Did the locale influence the pct_access or engagement_index?

In [None]:
sns.set_context("notebook")
# displot is figure-level and creates its own figure, so it is sized with
# `height` instead of a preceding plt.figure call (which would stay empty).
ax=sns.displot(data = df_pct_acc_lt_15,x ="pct_access",hue ="locale", bins=5,stat="probability",multiple="stack",height=10)
ax.set(title="Did the locale influence the pct_access");

The locale did not seem to influence the pct_access

In [None]:
sns.set_context("notebook")
# displot is figure-level and creates its own figure, so it is sized with
# `height` instead of a preceding plt.figure call (which would stay empty).
# The title now names the column that is actually plotted.
ax=sns.displot(data = df_pct_acc_lt_15,x ="engagement_index",hue ="locale", bins=5,stat="probability",multiple="stack",height=10)
ax.set(title="Did the locale influence the engagement_index");

<a id =Q6></a>
#### Question 6 
# What products are popular in different locales?

In [None]:
df_pct_acc_lt_15["locale"].unique()

In [None]:
locale= ['Suburb', 'Rural', 'Town', 'City']

# For each locale, list the 10 products with the largest summed engagement index
for loc in locale:
    dlp_df = df_pct_acc_lt_15[df_pct_acc_lt_15["locale"] == loc]
    engagement_per_product = dlp_df.groupby("Product Name")["engagement_index"].sum()
    top10 = engagement_per_product.nlargest(10)
    print("Top 10 prods in locale",loc)
    print(top10.index.tolist())

In [None]:
locale= ['Suburb', 'Rural', 'Town', 'City']
sns.set_context("notebook")
f = plt.figure(figsize=(20,8))
for i, loc in enumerate(locale):
    dlp_df = df_pct_acc_lt_15[df_pct_acc_lt_15["locale"] == loc]
    # Top 10 (state, product) pairs by summed pct_access within this locale
    top10 = dlp_df.groupby(["state","Product Name"])["pct_access"].sum().nlargest(10)
    ax = plt.subplot(1,len(locale),i+1)
    df = top10.reset_index()
    # Colour by state: the rows are (state, product) pairs, so without the hue a
    # product that appears for several states is averaged into a single bar and
    # the state named in the title is never shown.
    sns.barplot(data = df,y="Product Name", x="pct_access",hue="state",orient='h',dodge=False)
    ax.set(title = "Locale:"+loc)
    plt.xticks(rotation=90)
# The figure heading only needs to be set once
plt.suptitle("Top 10 pct_access products in the a particular locale with their state names");
f.tight_layout()


In [None]:
# Repeats the per-locale listing printed earlier: for each locale, the 10 products
# with the largest summed engagement index.
# NOTE(review): relies on `locale` defined in a previous cell.
for i, loc in enumerate(locale):
    # Rows for this locale only
    dlp_df = df_pct_acc_lt_15[df_pct_acc_lt_15["locale"] == loc]
    top10 = dlp_df.groupby("Product Name")["engagement_index"].sum().nlargest(10)
    print("Top 10 prods in locale",loc)
    print(list(top10.index))

In [None]:
locale= ['Suburb', 'Rural', 'Town', 'City']
sns.set_context("notebook")
f = plt.figure(figsize=(20,8))
# One panel per locale showing its 10 products with the largest summed engagement index
for i, loc in enumerate(locale):
    in_locale = df_pct_acc_lt_15["locale"] == loc
    dlp_df = df_pct_acc_lt_15[in_locale]
    top10 = dlp_df.groupby("Product Name")["engagement_index"].sum().nlargest(10)
    ax = plt.subplot(1,4,i+1)
    df = top10.reset_index()
    sns.barplot(data = df,y="Product Name", x="engagement_index",orient='h',dodge=False)
    ax.set(title = "Locale:"+loc)
    plt.xticks(rotation=90)
plt.suptitle("Top 10 engagement index products across all states in a particular locale");
f.tight_layout()


### Observation 
The products are almost similar, except that their order changes across the different locales.

<a id =Q7></a>
#### Question 7 
# Does the popularity of products depend on pct_black/hispanic ?

In [None]:
lis = list(df_pct_acc_lt_15["pct_black/hispanic"].unique())

In [None]:

sns.set_context("notebook")
f = plt.figure(figsize=(20,8))
# Size the grid from the number of pct_black/hispanic groups actually present,
# so a different number of groups does not overflow a hard-coded 1x5 grid.
n_groups = len(lis)
for i, loc in enumerate(lis):
    dlp_df = df_pct_acc_lt_15[df_pct_acc_lt_15["pct_black/hispanic"] == loc]
    # Top 10 (state, product) pairs by summed pct_access within this group
    top10 = dlp_df.groupby(["state","Product Name"])["pct_access"].sum().nlargest(10)
    ax = plt.subplot(1,n_groups,i+1)
    df = top10.reset_index()
    # Colour by state so a product listed for several states is not averaged into one bar
    sns.barplot(data = df,y="Product Name", x="pct_access",hue="state",orient='h',dodge=False)
    ax.set(title = "pct_black/hispanic:"+loc)
    plt.xticks(rotation=90)
# The figure heading only needs to be set once
plt.suptitle("Top 10 pct_access products in places with particular pct_black/hispanic");
f.tight_layout()


In [None]:

sns.set_context("notebook")
f = plt.figure(figsize=(20,8))
# Size the grid from the number of pct_black/hispanic groups actually present,
# so a different number of groups does not overflow a hard-coded 1x5 grid.
n_groups = len(lis)
for i, loc in enumerate(lis):
    dlp_df = df_pct_acc_lt_15[df_pct_acc_lt_15["pct_black/hispanic"] == loc]
    # 10 products with the largest summed engagement index within this group
    top10 = dlp_df.groupby("Product Name")["engagement_index"].sum().nlargest(10)
    ax = plt.subplot(1,n_groups,i+1)
    df = top10.reset_index()
    sns.barplot(data = df,y="Product Name", x="engagement_index",orient='h',dodge=False)
    # Panels are split by pct_black/hispanic group, so the titles say so (not "Locale")
    ax.set(title = "pct_black/hispanic:"+loc)
    plt.xticks(rotation=90)
plt.suptitle("Top 10 engagement index products across all states for a particular pct_black/hispanic group");
f.tight_layout()


### Observation 
The products are almost similar, except that their order changes across the different pct_black/hispanic groups.

<a id = Q8></a>
#### Question 8
# Which Products are popular during weekdays? Are they different from the ones popular on weekends?


Let us separate the weekend data and weekday data

In [None]:
# Records dated on a Saturday or Sunday (day-of-week numbers 5 and 6)
on_weekend = df_pct_acc_lt_15["time"].dt.dayofweek >= 5
weekend_df = df_pct_acc_lt_15[on_weekend]
weekend_df.sample(5)

In [None]:
# Every record that is not dated on a Saturday or Sunday (day-of-week numbers 5 and 6)
on_weekend = df_pct_acc_lt_15["time"].dt.dayofweek >= 5
weekday_df = df_pct_acc_lt_15[~on_weekend]
weekday_df.sample(5)

In [None]:
weekdaydf = weekday_df.groupby(["state","Product Name"])["pct_access"].sum().nlargest(10)
weekenddf = weekend_df.groupby(["state","Product Name"])["pct_access"].sum().nlargest(10)

In [None]:
# Side-by-side comparison of the top (state, product) pairs by summed pct_access
f = plt.figure(figsize=(20,8))
panels = [
    (weekdaydf, "Top 10 Products with highest pct_access in a state on week days"),
    (weekenddf, "Top 10 Products with highest pct_access in a state on weekends"),
]
for i, (top_pairs, panel_title) in enumerate(panels):
    panel_ax = plt.subplot(1,len(panels),i+1)
    df = top_pairs.reset_index()
    # Colour by state: the rows are (state, product) pairs, so without the hue a
    # product listed for several states is averaged into one bar and the state is lost.
    sns.barplot(data = df,y="Product Name", x="pct_access",hue="state",orient='h',dodge=False)
    panel_ax.set_title(panel_title, fontsize=16)

f.tight_layout()


In [None]:
weekdaydf = weekday_df.groupby("Product Name")["engagement_index"].sum().nlargest(10)
weekenddf = weekend_df.groupby("Product Name")["engagement_index"].sum().nlargest(10)

In [None]:
# Side-by-side comparison of the top products by summed engagement index
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(20,8))
panels = (
    (ax1, weekdaydf, "Top 10 Products with highest engagement index across all states on week days"),
    (ax2, weekenddf, "Top 10 Products with highest engagement index across all states on weekends"),
)
for panel_ax, top_products, panel_title in panels:
    df = top_products.reset_index()
    sns.barplot(data = df,y="Product Name", x="engagement_index",orient='h',dodge=False,ax=panel_ax)
    panel_ax.set_title(panel_title, fontsize=16)

f.tight_layout();

In [None]:
weekday_df[weekday_df["Product Name"].isin(weekdaydf.index)]["Primary Essential Function"].unique()

In [None]:
weekend_df[weekend_df["Product Name"].isin(weekenddf.index)]["Primary Essential Function"].unique()

### Observation
The products popular on weekdays belong to Learning platforms. While the weekend apps are more of self learning and recreation products.

<a id = Q9></a>
#### Question 9
# Which Products are popular based on the Sector?

In [None]:
df_pct_acc_lt_15["Sector(s)"].unique()

In [None]:
sector= ['PreK-12; Higher Ed; Corporate', 'PreK-12', 'PreK-12; Higher Ed','Corporate', 'Higher Ed; Corporate']
sns.set_context("notebook")
f = plt.figure(figsize=(30,20))
# One panel per sector: its top 10 (state, product) pairs by summed pct_access, coloured by state
for i, sec in enumerate(sector):
    ax = plt.subplot(3,2,i+1)
    in_sector = df_pct_acc_lt_15["Sector(s)"] == sec
    dlp_df = df_pct_acc_lt_15[in_sector]
    top10 = dlp_df.groupby(["state","Product Name"])["pct_access"].sum().nlargest(10)
    df = top10.reset_index()
    sns.barplot(data = df,y="Product Name", x="pct_access",hue="state",orient='h',dodge=False)
    plt.legend(loc="best")
    ax.set(title = "Sector(s):"+sec)
plt.suptitle("Top 10 pct_access products in the a particular sector with their state names");
f.tight_layout()

In [None]:
sector= ['PreK-12; Higher Ed; Corporate', 'PreK-12', 'PreK-12; Higher Ed','Corporate', 'Higher Ed; Corporate']
sns.set_context("notebook")
f = plt.figure(figsize=(20,8))
# One panel per sector: its 10 products with the largest summed engagement index
for i, sec in enumerate(sector):
    ax = plt.subplot(3,2,i+1)
    in_sector = df_pct_acc_lt_15["Sector(s)"] == sec
    dlp_df = df_pct_acc_lt_15[in_sector]
    top10 = dlp_df.groupby("Product Name")["engagement_index"].sum().nlargest(10)
    df = top10.reset_index()
    sns.barplot(data = df,y="Product Name", x="engagement_index",orient='h')
    ax.set(title = "Sector(s):"+sec)
    plt.xticks(rotation=90)
plt.suptitle("Top 10 engagement index products in the a particular sector across all states");
f.tight_layout()

### Observation 
The products are almost similar, except that their order changes across the different sectors.

<a id =Q10></a>
#### Question 10
# Which are the popular products across different states?

In [None]:
states_list = list(df_pct_acc_lt_15["state"].unique())
states_list

In [None]:
sns.set_context("notebook")
f = plt.figure(figsize=(25,30))

# Two panels per row. Rounding the row count up means an odd number of states
# still gets a grid cell for the last state.
n_rows = (len(states_list) + 1)//2
for i,state in enumerate(states_list):
    plt.subplot(n_rows,2,i+1)
    state_df = df_pct_acc_lt_15[df_pct_acc_lt_15["state"]==state]
    # 10 products with the largest summed engagement index in this state
    top10 = state_df.groupby("Product Name")["engagement_index"].sum().nlargest(10)
    df = top10.reset_index()
    ax = sns.barplot(data = df,y="Product Name", x="engagement_index",orient='h')
    ax.set_title("Engagement index for products in "+state,fontdict={"fontsize":15})
    
f.tight_layout()

In [None]:
sns.set_context("notebook")
f = plt.figure(figsize=(25,30))

# Two panels per row. Rounding the row count up means an odd number of states
# still gets a grid cell for the last state.
n_rows = (len(states_list) + 1)//2
for i,state in enumerate(states_list):
    plt.subplot(n_rows,2,i+1)
    state_df = df_pct_acc_lt_15[df_pct_acc_lt_15["state"]==state]
    # 10 products with the largest summed pct_access in this state
    top10 = state_df.groupby("Product Name")["pct_access"].sum().nlargest(10)
    df = top10.reset_index()
    ax = sns.barplot(data = df,y="Product Name", x="pct_access",orient='h')
    ax.set_title("pct_access for products in "+state,fontdict={"fontsize":15})
    
f.tight_layout()

<a id =Q11></a>
#### Question 11
# Which Primary Essential Function does the product which is popular across different state belong to?

In [None]:
sns.set_context("notebook")
f = plt.figure(figsize=(25,30))

# Two panels per row. Rounding the row count up means an odd number of states
# still gets a grid cell for the last state.
n_rows = (len(states_list) + 1)//2
for i,state in enumerate(states_list):
    plt.subplot(n_rows,2,i+1)
    state_df = df_pct_acc_lt_15[df_pct_acc_lt_15["state"]==state]
    # 10 products with the largest summed pct_access in this state
    top10 = state_df.groupby("Product Name")["pct_access"].sum().nlargest(10)
    top_prods_list = list(top10.index)
    # Keep this state's rows for those products (state_df is reused; no second filter needed)
    top_prods_df = state_df[state_df["Product Name"].isin(top_prods_list)]
    
    ax = sns.barplot(data = top_prods_df,y="Primary Essential Function", x="pct_access",orient='h')
    ax.set_title("Primary Essential Function of popular products in "+state,fontdict={"fontsize":15})
    
f.tight_layout()

### Observation
The most popular primary essential function is "Sites,Resources and Reference" followed by "LC/CM/SDO-Other"

Now to correlate if there is a reason for the products to be popular lets make use of an external data source.

<a id =ED></a>
## External Data

Data was taken from the website "https://www.census.gov/data.html". The data considered gives the population of children under 19 years of age for different states. The data is provided for the age groups "Under 5 years", "5 to 9 years", "10 to 14 years" and "15 to 19 years".

The idea behind this step is that, in case the child population of a particular age group is dominant in a state, the choice of the product might depend on the age factor.

In [None]:
pop_df = pd.read_csv("../input/children-population/Children_population_5_to_19.csv")
pop_df.head()


In [None]:
pop_df.info()

In [None]:
# Express each age group (and their total, "all_kids") as a percentage of the
# state's total population so that states can be compared.
age_cols = ["Under 5 years","5 to 9 years","10 to 14 years","15 to 19 years"]
total_pop = pop_df["Total population"]
# The combined share is computed first, while the age columns still hold raw counts
pop_df["all_kids"] = (pop_df[age_cols].sum(axis=1)/total_pop*100).round(2)
for age_col in age_cols:
    pop_df[age_col] = (pop_df[age_col]/total_pop*100).round(2)


In [None]:
pop_df.head()

In [None]:
sns.set_context("poster")
sns.set_palette("gist_rainbow")

# Each panel is (column to plot, panel title). States are ordered by the plotted
# column, largest first. Driving the panels from these lists replaces six
# copy-pasted plotting stanzas.
top_panels = [
    ("Total population", "Total Population of all people in different States "),
    ("all_kids", "Population Percentage of kids of all age  of different States "),
]
age_panels = [
    ("Under 5 years", "Population Percentage of children under 5"),
    ("5 to 9 years", "Population Percentage of children 5 to 9 years"),
    ("10 to 14 years", "Population Percentage of children 10 to 14 years"),
    ("15 to 19 years", "Population Percentage of children 15 to 19 years"),
]

#First row of plots
f = plt.figure(figsize=(25,8))
for i, (col, panel_title) in enumerate(top_panels):
    ax = plt.subplot(1,len(top_panels),i+1)
    sns.barplot(data = pop_df, x= "Geographic Area Name",y=col,order=pop_df.sort_values(col,ascending=False)["Geographic Area Name"])
    plt.xticks(rotation =90);
    ax.set_title(panel_title,fontdict={"fontsize":20})
# Lay out the first figure before `f` is rebound to the second one
f.tight_layout()

#second row of plots
f = plt.figure(figsize=(25,8))
for i, (col, panel_title) in enumerate(age_panels):
    ax = plt.subplot(1,len(age_panels),i+1)
    sns.barplot(data = pop_df, x= "Geographic Area Name",y=col,order=pop_df.sort_values(col,ascending=False)["Geographic Area Name"])
    ax.set_title(panel_title,fontdict={"fontsize":15})
    plt.xticks(rotation =90);
f.tight_layout()

### Observation
The percentage population of all age group children is around 30% in Utah, followed by around 27% in Texas and around 25 % in Minnesota. New Hampshire and Florida are the lowest around 21 % to 22 %

Looking at the different age groups, Utah still has the highest percentage in all groups, followed by Texas. New Hampshire is the lowest except for kids between 15 and 19; Florida is the lowest in this category.


Now looking at this [question9](#Q9) reveals the Primary essential functions of these states.



<a id =Q12a></a>
#### Question 12 a
# What are the products popular in States with high kids population(Utah and Texas)?
<a id =Q12b></a>
#### Question 12b
# What are the products popular in States with low kids population(Florida and New Hampshire)?

In [None]:
def _plot_top_products(states, figure_title, usage_df, top_n=10):
    """Draw one horizontal bar chart per state showing its most accessed products.

    For every state the rows of `usage_df` are filtered on the "state" column,
    "pct_access" is summed per "Product Name", and the `top_n` largest sums are
    plotted.

    Parameters
    ----------
    states : list of str
        Values of the "state" column to plot, one subplot each.
    figure_title : str
        Text of the figure-level title.
    usage_df : pandas.DataFrame
        Frame providing the "state", "Product Name" and "pct_access" columns.
    top_n : int, default 10
        Number of products shown per state.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding all subplots.
    """
    n_cols = 2
    # Ceiling division so that one state or an odd number of states still gets a grid slot.
    n_rows = max(1, -(-len(states) // n_cols))

    fig = plt.figure(figsize=(25, 10))
    # `suptitle` has no `fontdict` parameter; the title size is set through `fontsize`.
    fig.suptitle(figure_title, fontsize=25)

    for slot, state in enumerate(states, start=1):
        ax = fig.add_subplot(n_rows, n_cols, slot)
        # Kept in a local name so no notebook-level dataframe (e.g. `df`) is rebound.
        top_products = (
            usage_df.loc[usage_df["state"] == state]
            .groupby("Product Name")["pct_access"]
            .sum()
            .nlargest(top_n)
            .reset_index()
        )
        sns.barplot(data=top_products, y="Product Name", x="pct_access", orient="h", ax=ax)
        ax.set_title("pct_access for products in " + state, fontdict={"fontsize": 25})

    fig.tight_layout()
    return fig


sns.set_context("poster")
f = _plot_top_products(
    ["Utah", "Texas"],
    "Popular products in high child population",
    df_pct_acc_lt_15,
)
f = _plot_top_products(
    ["New Hampshire", "Florida"],
    "Popular products in low child population",
    df_pct_acc_lt_15,
)

### Observation 
1. Although the products popular in high kids population states differ from those in low kids population states, there is no striking difference.


<a id =Q13a></a>
#### Question 13a
# What are the primary Essential Functions of the products popular in States with high kids population(Utah and Texas)?
<a id =Q13b></a>
#### Question 13b
# What are the primary Essential Functions of the products popular in States with low kids population(New Hampshire and Florida)?

In [None]:

def _plot_top_product_functions(states, figure_title, usage_df, top_n=10):
    """Plot, per state, the Primary Essential Function mix of its most frequent products.

    For every state the `top_n` products with the largest share of rows are
    selected; the rows of those products are then broken down by
    "Primary Essential Function" (percentage of rows, rounded to 2 decimals,
    at most 10 categories) and drawn as a horizontal bar chart.

    Parameters
    ----------
    states : list of str
        Values of the "state" column to plot, one subplot (row) each.
    figure_title : str
        Text of the figure-level title.
    usage_df : pandas.DataFrame
        Frame providing the "state", "Product Name" and
        "Primary Essential Function" columns.
    top_n : int, default 10
        Number of products considered "popular" per state.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding all subplots.
    """
    fig = plt.figure(figsize=(25, 10))
    # `suptitle` has no `fontdict` parameter; the title size is set through `fontsize`.
    fig.suptitle(figure_title, fontsize=25)

    for slot, state in enumerate(states, start=1):
        # Filter once and reuse the result for both the ranking and the breakdown.
        state_df = usage_df.loc[usage_df["state"] == state]
        top_products = (
            state_df["Product Name"]
            .value_counts(normalize=True)
            .mul(100)
            .round(2)
            .nlargest(top_n)
            .index
        )
        top_products_df = state_df.loc[state_df["Product Name"].isin(top_products)]
        function_share = (
            top_products_df["Primary Essential Function"]
            .value_counts(normalize=True)
            .mul(100)
            .round(2)
            .nlargest(10)
        )

        ax = fig.add_subplot(len(states), 1, slot)
        # The shares are already aggregated, so they are passed as plain vectors.
        # Supplying the row-level frame as `data=` next to vectors of a different
        # length is an error in recent seaborn releases.
        sns.barplot(x=function_share.values, y=function_share.index, orient="h", ax=ax)
        ax.set_title("Primary Essential Function of products popular in " + state)

    fig.tight_layout()
    return fig


sns.set_context("poster")
f = _plot_top_product_functions(
    ["Utah", "Texas"],
    "Primary Essential function of popular products in high child population",
    df_pct_acc_lt_15,
)
f = _plot_top_product_functions(
    ["New Hampshire", "Florida"],
    "Primary Essential function of popular products in low child population",
    df_pct_acc_lt_15,
)

### Observation 
1. Among the products popular in high kids population states, the primary essential function "content creation" features with slightly more than 20% and "sites resources and references" with slightly less than 20% in Texas, while "games and simulation" and SDO each feature with slightly more than 20% in Utah.
2. Among the products popular in low kids population states, the primary essential function "study tools" features with around 30% in New Hampshire and "sites resources and references" with a little less than 30% in Florida.

<a id =Q14></a>
#### Question 14
# Is there any correlation between the percentage of black/hispanic to the funds invested by NERDS?

In [None]:
# pp_total_raw brackets in ascending order; selecting with this list fixes the
# left-to-right column order of the contingency table.
pp_total_order = [
    "[4000, 6000[",
    "[6000, 8000[",
    "[8000, 10000[",
    "[10000, 12000[",
    "[12000, 14000[",
    "[14000, 16000[",
    "[16000, 18000[",
    "[18000, 20000[",
    "[22000, 24000[",
    "[32000, 34000[",
]

# Record counts for every (pct_black/hispanic, pp_total_raw) combination.
data_cross1 = pd.crosstab(
    index=df_pct_acc_lt_15["pct_black/hispanic"],
    columns=df_pct_acc_lt_15["pp_total_raw"],
)[pp_total_order]
data_cross1.head()

In [None]:
f = plt.figure(figsize=(30, 8))
sns.set_context("poster")

# The heatmap occupies the first slot of a 1x2 grid on the wide canvas.
ax = f.add_subplot(1, 2, 1)
sns.heatmap(data_cross1, cmap="YlGnBu", ax=ax);
ax.set(title="percentage of black/hispanic to the funds invested by NERDS")


### Observation
The combination pct_black/hispanic = "[0, 0.2[" and pp_total_raw = "[8000, 10000[" has the maximum number of records.

<a id =Q15></a>
#### Question 15
# Is there any correlation between the percentage of black/hispanic to Percentage of students in the districts eligible for free or reduced-price lunch?

In [None]:
# pct_free/reduced brackets in ascending order; selecting with this list fixes
# the left-to-right column order of the contingency table.
free_reduced_order = ["[0, 0.2[", "[0.2, 0.4[", "[0.4, 0.6[", "[0.6, 0.8[", "[0.8, 1["]

# Record counts for every (pct_black/hispanic, pct_free/reduced) combination.
data_cross1 = pd.crosstab(
    index=df_pct_acc_lt_15["pct_black/hispanic"],
    columns=df_pct_acc_lt_15["pct_free/reduced"],
)[free_reduced_order]
data_cross1.head()

In [None]:
f = plt.figure(figsize=(30, 8))
sns.set_context("poster")

# The heatmap occupies the first slot of a 1x2 grid on the wide canvas.
ax = f.add_subplot(1, 2, 1)
sns.heatmap(data_cross1, cmap="YlGnBu", ax=ax);
ax.set(title="percentage of black/hispanic to Percentage of students in the districts eligible for free or reduced-price lunch")

The combination pct_free/reduced = "[0, 0.2[" and pct_black/hispanic = "[0, 0.2[" has the maximum number of records.

<a id =CnS></a>
# Conclusion and Summary
1. 20% of products are for Digital learning platforms, 15% are Sites Resources and Reference, 10% are Content creation tools and 9% of study tools.
2. There are 74 unique products. Let us see which are top 10 in terms of engagement index.
3. The top 10 products with primary Essential function as LC - Digital Learning Platforms are Duolingo, Typing.com,TypingClub, Code.org, Study.com,IXL Math,BrainPOP,BrainPOP Jr., ABCmouse.com,Tynker 
4. The top 10 products in terms of engagement index are with pct_access across a state as LC - Digital Learning Platforms. They are listed in descending order of pct_access:

> in Utah it is Study.com ,

> in Connecticut it is  Study.com, Kids A-Z, Raz-Kids, AdaptedMind, 

> in Illinois it is AdaptedMind, Study.com, Kids A-Z, 

> in Utah it  is Kids A-Z , 

> in Connecticut it is Duolingo

5. The products popular in different locales are almost similar, except that the order of preference has changed.
6. The products popular on weekdays belong to Learning platforms. While the weekend apps are more of self learning and recreation products.
7. The most popular primary essential function is "Sites,Resources and Reference" followed by "LC/CM/SDO-Other".
8. The percentage population of all age group children is around 30% in Utah, followed by around 27% in Texas and around 25 % in Minnesota. New Hampshire and Florida are the lowest around 21 % to 22 %

9. Looking at the different age groups, Utah still has the highest percentage in all groups, followed by Texas. New Hampshire is the lowest except for kids between 15 and 19; Florida is the lowest in this category.
10. Among the products popular in high kids population states, the primary essential function "content creation" features with slightly more than 20% and "sites resources and references" with slightly less than 20% in Texas, while "games and simulation" and SDO each feature with slightly more than 20% in Utah.
11. Among the products popular in low kids population states, the primary essential function "study tools" features with around 30% in New Hampshire and "sites resources and references" with a little less than 30% in Florida.

