In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Overview 

This notebook provides a high-level view of the digital learning state in 2020. Specifically, the following areas are being explored:

- **Evolution.** 

  The evolution of various types of Learning Platforms (e.g., newly-launched, discontinued etc.) and their coverage over the districts with time.
  
- **Opportunity Analysis.** 

  After delving into the data, we classify the Learning Platforms into Freemium and Paid types. Later on, the Freemium subscription is categorized into several groups and their opportunities for expansion are unveiled.

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px
import plotly.graph_objs as go
from geopy.geocoders import Nominatim
import folium
from folium.plugins import HeatMap
from folium.plugins import FastMarkerCluster
from plotly import tools
import re
from plotly.offline import init_notebook_mode, plot, iplot
from wordcloud import WordCloud, STOPWORDS 
from warnings import filterwarnings
filterwarnings('ignore')
import missingno as msno
import glob

sns.set_theme(style="whitegrid")
sns.color_palette("cubehelix")

### Helpful links:

- charting: https://www.kaggle.com/niteshyadav3103/eda-e-commerce-shipping-data
- multiplot: https://dev.to/thalesbruno/subplotting-with-matplotlib-and-seaborn-5ei8
- parsing: https://www.kaggle.com/girishkumarsahu/learnplatform-covid-19-impact
- Percentiles plot: https://stackoverflow.com/questions/47503718/plot-percentiles-using-matplotlib
- closest value: https://stackoverflow.com/questions/30112202/how-do-i-find-the-closest-values-in-a-pandas-series-to-an-input-number
- fillna with groupby: https://stackoverflow.com/questions/46391128/pandas-fillna-using-groupby
- most frequent value: https://stackoverflow.com/questions/15138973/how-to-get-the-number-of-the-most-frequent-value-in-a-column
- ref previous row: https://stackoverflow.com/questions/23333786/reference-values-in-the-previous-row-with-map-or-apply
- kmeans random state: https://stats.stackexchange.com/questions/224759/how-to-avoid-k-means-assigning-different-labels-on-different-run
- dataframe from dict: https://stackoverflow.com/questions/18837262/convert-python-dict-into-a-dataframe
- others: https://online.stat.psu.edu/stat200/lesson/6/6.4

### Engagement data

This section performs the loading, cleaning and imputation of the engagement data. 

In [None]:
# Directory with one engagement CSV per school district; the file stem is the district id.
path = '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data' 
# Sorted so the row order of the combined frame does not depend on the file system.
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    # Derive the id from the file name itself instead of a fixed position in the
    # path, so the cell keeps working when `path` gains or loses a directory level.
    district_id = os.path.splitext(os.path.basename(filename))[0]
    df["district_id"] = district_id
    li.append(df)

# ignore_index=True gives the combined frame a fresh 0..n-1 index in one step.
engagement_df = pd.concat(li, ignore_index=True)
engagement_df.head()

In [None]:
engagement_df.shape

Let's see the percentage of NaN values in the data. We'll carry out the cleaning and imputation on these in the subsequent lines of code.

In [None]:
engagement_df.isna().sum() * 100.0 / engagement_df.shape[0]

In [None]:
engagement_df.nunique()

In [None]:
engagement_df.dtypes

In [None]:
engagement_df_refined = engagement_df.copy()
engagement_df_refined.head()

In [None]:
engagement_df_refined["district_id"] = engagement_df_refined["district_id"].astype("category")
engagement_df_refined["district_id"].head()

In [None]:
engagement_df_refined["time"] = pd.to_datetime(engagement_df_refined["time"])

In [None]:
## The day of the week with Monday=0, Sunday=6.
engagement_df_refined["month"] = engagement_df_refined["time"].dt.month
engagement_df_refined["dayofweek"] = engagement_df_refined["time"].dt.dayofweek
# `Series.dt.weekofyear` is deprecated and no longer exists in pandas 2.x;
# `isocalendar().week` is the supported way to get the same ISO-8601 week number.
# It comes back as nullable UInt32, so cast to plain int64 to keep the dtype the
# rest of the notebook works with (assumes "time" has no missing values).
engagement_df_refined["weekofyear"] = engagement_df_refined["time"].dt.isocalendar().week.astype("int64")
engagement_df_refined.head()

### Definition

Let's drill down on the definition and implication of values in the engagement data.

- $$ pct\_access = \frac {total\_online\_students} {total\_enrolled\_students} \times 100 $$

- $$ engagement\_index = \frac {total\_pageload\_events} {total\_enrolled\_students\_in\_thousands} $$

- pct_access is defined when any of the following holds true

    - both the $ total\_online\_students $ and $ total\_enrolled\_students $ are known.
    
    - both the $ total\_online\_students $ and $ total\_enrolled\_students $ are unknown. Mathematically this value is NaN; however, for this specific context, we can define it as 0.0. This is expected for a newly launched platform.
    
    - $ total\_enrolled\_students $ is known. The value of $ total\_online\_students $ can be inferred from the $ total\_pageload\_events $. If the latter is 0.0 or NaN, the former will assume the same value; otherwise, we can apply regression to determine this value.
    
Similar notions apply for engagement_index.

- **Inactive platform.** A platform is inactive if 
    
    -  (pct_access, engagement_index) is (0.0, 0.0): No online students; hence no page load events.
    
    -  (pct_access, engagement_index) is (0.0, NaN): In case when the pct_access is 0.0 for no online students, the total page load events may be unknown causing the engagement_index to be NaN. Such combination may appear because of some mismanagement in data collection.
    
    -  (pct_access, engagement_index) is (NaN, 0.0): This combination should not occur in practice because the total page load events is a multiple of the online students available; therefore, if the latter is NaN, the former must also be NaN. Such combination may still appear because of some mismanagement in data collection.
    
    -  (pct_access, engagement_index) is (NaN, NaN).

### Imputation

- **Basic Intuition:** Note that if pct_access > 0.0 then engagement_index > 0.0 too, because a student only counts as online after generating at least one page-load event.

- **Inactive Learning Platforms:** Let's impute all those entries with 0.0

- **engagement_index = NaN and pct_access != NaN:** Such entries can be filled with pct_access * 10.0, if pct_access is not NaN and greater than 0.0, as each student online is supposed to load a page at least. A more complex approach to determine this is regression.

- **pct_access = NaN and engagement_index != NaN:** Such entries can be filled with engagement_index / 10.0, if engagement_index is not NaN and greater than 0.0. Bound the calculated value by 100.0, if exceeded.

- **lp_id NaN and (pct_access, engagement_index) not NaN:** Fill with 0 (Assume, Misc. LP).

### Cleaning

Get rid of all entries which fail to satisfy the following:

- **engagement_index lower bound.** Note that, every student must load at least a page to contribute to the engagement_index. Therefore, we need to check whether the engagement_index satisfies its lower bound for a specific entry.

- **pct_access upper bound.** Similar logic suggests us to verify whether pct_access satisfies its upper bound or not in a specific entry.

In [None]:
engagement_df_refined.describe()

- In the following scatterplot, we can observe a good number of points with pct_access = 0.0 and engagement_index > 0.0. This is counterintuitive because how can a pageload event occur when there are no students online (note that auto pageload events can be scheduled too but those are not our targeted rows)? But it may also happen that the values are too infinitesimal to be captured within 2 decimal places. Therefore, we will focus on the relationships.

    - (pct_access, engagement_index): (0.0, > 0.0) is a valid combination.
    - (pct_access, engagement_index): (> 0.0, 0.0) is an invalid combination.

In [None]:
sns.scatterplot(data=engagement_df_refined
               ,x="pct_access"
               ,y="engagement_index")

In [None]:
((engagement_df_refined["pct_access"] == 0.0) & 
 (engagement_df_refined["engagement_index"] > 0.0)).sum()

- We have found some valid yet counter-intuitive combinations above.

In [None]:
((engagement_df_refined["pct_access"] > 0.0) & 
 (engagement_df_refined["engagement_index"] == 0.0)).sum()

- Invalid combinations are not found (see the above line of code).

In [None]:
engagement_df_refined[  (engagement_df_refined["lp_id"].isna()) 
                      | (engagement_df_refined["pct_access"].isna())
                      | (engagement_df_refined["engagement_index"].isna())
                     ].head(10)

- No platform active for this combination (pct_access == 0.0 & engagement_index == NaN)

In [None]:
((engagement_df_refined["pct_access"] == 0.0) & 
 (engagement_df_refined["engagement_index"].isna())).sum()

- Lets impute these combinations (pct_access, engagement_index): (value, NaN) and (NaN, value) by inferring from the known one.

In [None]:
# Where engagement_index is missing, infer it from pct_access: pct_access percent of
# students online means at least pct_access * 10 page loads per 1000 students.
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: the in-place form is chained assignment and does not update the frame
# when pandas copy-on-write is enabled.
engagement_df_refined["engagement_index"] = engagement_df_refined["engagement_index"].fillna(
    engagement_df_refined["pct_access"] * 10.
)

In [None]:
# Where pct_access is missing, infer it from engagement_index (engagement_index / 10).
# pct_access is a percentage, so the inferred value is capped at 100.0 as described
# in the imputation notes. Rows where both values are missing stay NaN here.
# The result is assigned back rather than filled in place, which would be chained
# assignment and a no-op under pandas copy-on-write.
engagement_df_refined["pct_access"] = engagement_df_refined["pct_access"].fillna(
    (engagement_df_refined["engagement_index"] / 10.).clip(upper=100.0)
)

In [None]:
engagement_df_refined.isna().sum()

- Offline or inactive platform for this combination (pct_access, engagement_index) = (NaN, NaN). Let's impute such entries with 0.0

In [None]:
((engagement_df_refined["pct_access"].isna()) & 
 (engagement_df_refined["engagement_index"].isna())).sum()

In [None]:
# Rows still missing pct_access are treated as inactive platforms (0.0).
# Assigned back instead of filled in place so it also works under copy-on-write.
engagement_df_refined["pct_access"] = engagement_df_refined["pct_access"].fillna(0.0)

In [None]:
# Rows still missing engagement_index are treated as inactive platforms (0.0).
# Assigned back instead of filled in place so it also works under copy-on-write.
engagement_df_refined["engagement_index"] = engagement_df_refined["engagement_index"].fillna(0.0)

In [None]:
engagement_df_refined.isna().sum() 

- Still some rows available for LP_ID None (even after removing those inactive platforms). These will be imputed after cleaning the data by checking the upper and lower bounds.

In [None]:
engagement_df_refined[engagement_df_refined["lp_id"].isna()].head(10)

- Note that every student must load at least a page to contribute to the engagement_index. Therefore, we need to check whether the engagement_index given satisfies this lower bound.

In [None]:
# Minimum engagement_index implied by pct_access: the online share of students
# (pct_access / 100) scaled to page loads per 1000 students.
engagement_df_refined["engagement_index_lower_bound"] = (
    engagement_df_refined["pct_access"].div(100.0).mul(1000.0)
)
engagement_df_refined.head(10)

- how many of the engagement_index satisfies the lower_bound?

In [None]:
((engagement_df_refined["engagement_index"] >= engagement_df_refined["engagement_index_lower_bound"])).sum()

- How many rows fail to satisfy the lower_bound of the engagement_index? We should remove those.

In [None]:
engagement_df_refined[((engagement_df_refined["engagement_index"] < engagement_df_refined["engagement_index_lower_bound"]))].index

- 1171702 rows fail to satisfy the lower bound of the engagement_index. Let's get rid of those failed ones.

In [None]:
# Flag rows whose engagement_index is below the bound implied by pct_access,
# then remove them from the frame in place.
below_lower_bound = engagement_df_refined["engagement_index"].lt(
    engagement_df_refined["engagement_index_lower_bound"]
)
engagement_df_refined.drop(index=engagement_df_refined.loc[below_lower_bound].index, inplace=True)

engagement_df_refined.shape

In [None]:
# Compute the share from the frames instead of hard-coding counts from an earlier run.
# engagement_df_refined started as a copy of engagement_df and, at this point, has
# only lost the rows removed by the lower-bound check.
rows_loaded = engagement_df.shape[0]
rows_dropped_lower_bound = rows_loaded - engagement_df_refined.shape[0]
print("Percentage of dropped rows (engagement_index_lower_bound check): ", rows_dropped_lower_bound * 100. / rows_loaded)

In [None]:
# sns.scatterplot( x="pct_access"
#                , y="engagement_index"
#                , data=engagement_df_refined)

- Lets see whether the opposite holds true. That is, pct_access satisfies its upper bound.

In [None]:
# Maximum pct_access implied by engagement_index: page loads per 1000 students
# converted back to a percentage of students.
engagement_df_refined["pct_access_upper_bound"] = (
    engagement_df_refined["engagement_index"].div(1000.0).mul(100.0)
)
engagement_df_refined.head(10)

In [None]:
print("pct_access_upper_bound check passed rows: ", (engagement_df_refined["pct_access"] <= engagement_df_refined["pct_access_upper_bound"]).sum())

In [None]:
print("pct_access_upper_bound check falied rows: ", (engagement_df_refined["pct_access"] > engagement_df_refined["pct_access_upper_bound"]).sum())

442 rows fail to satisfy the upper bound of pct_access. Let's drop such entries.

In [None]:
# Flag rows whose pct_access exceeds the bound implied by engagement_index,
# then remove them from the frame in place.
above_upper_bound = engagement_df_refined["pct_access"].gt(
    engagement_df_refined["pct_access_upper_bound"]
)
engagement_df_refined.drop(index=engagement_df_refined.loc[above_upper_bound].index, inplace=True)

engagement_df_refined.shape

In [None]:
# 442 is the number of rows that failed the pct_access upper-bound check (the count
# printed by the failed-rows check earlier in this section).
upper_bound_failed_rows = 442
# Express it as a share of every row originally loaded. The previous denominator
# added the 442 rows on top of a total that appears to already include them, which
# made it inconsistent with the lower-bound percentage.
print("Percentage of pct_access_upper_bound check failed entries: ", upper_bound_failed_rows * 100. / engagement_df.shape[0])

In [None]:
engagement_df_refined.describe()

Drop upper_bound and lower_bound columns...

In [None]:
# The two bound columns were only needed for the consistency checks; remove them.
bound_columns = ["engagement_index_lower_bound", "pct_access_upper_bound"]
engagement_df_refined.drop(columns=bound_columns, inplace=True)

In [None]:
sns.scatterplot( data=engagement_df_refined
                ,x="pct_access"
                ,y="engagement_index")

- Let's impute those undefined or NaN learning platforms.

In [None]:
engagement_df_refined[engagement_df_refined["lp_id"].isna()].head(10)

In [None]:
# Rows without a platform id are assigned the placeholder id 0 ("Misc. LP").
# Assigned back instead of filled in place so it also works under copy-on-write.
engagement_df_refined["lp_id"] = engagement_df_refined["lp_id"].fillna(0.0)

In [None]:
# Store the id and calendar columns as categoricals.
for column in ("lp_id", "month", "dayofweek", "weekofyear"):
    engagement_df_refined[column] = engagement_df_refined[column].astype("category")

- Let's see the unique values retained before and after the imputation and cleaning.

In [None]:
print("before imputation and cleaning: \n\n", engagement_df.nunique())

In [None]:
print("after imputation and cleaning: \n\n", engagement_df_refined.nunique())

In [None]:
# Platforms present before cleaning vs. after. nunique() on the raw frame ignores
# missing lp_id, but in the refined frame those rows were filled with the placeholder
# id 0, which would otherwise be counted as one extra "platform" and understate the
# difference by one.
platforms_before = engagement_df["lp_id"].nunique()
placeholder_present = int((engagement_df_refined["lp_id"] == 0).any())
platforms_after = engagement_df_refined["lp_id"].nunique() - placeholder_present

print("Learning Platforms with wrong relationships = {0} - {1} = {2}".format(
        platforms_before
      , platforms_after
      , platforms_before - platforms_after)
     )

**engagement_df_refined cleaning done.**

In [None]:
engagement_df_refined.describe()

Label the sequence of days and explore the skew of pct_access and engagement_index.

In [None]:
# Number of whole days between each record and 1 January 2020.
basedate = pd.to_datetime('2020-01-01')
elapsed = engagement_df_refined["time"].sub(basedate)
engagement_df_refined["days_since"] = elapsed.dt.days
engagement_df_refined.describe()

In [None]:
# Histogram of pct_access. sns.distplot is deprecated; histplot is its replacement
# and is already used elsewhere in this notebook. bins=50 matches the cap that
# distplot applied to its automatic bin count.
sns.histplot( data=engagement_df_refined
            , x="pct_access"
            , bins=50)

plt.show()

print("pct_access skew: ", engagement_df_refined["pct_access"].skew())

In [None]:
# Histogram of engagement_index. sns.distplot is deprecated; histplot is its
# replacement and is already used elsewhere in this notebook. bins=50 matches the
# cap that distplot applied to its automatic bin count.
sns.histplot( data=engagement_df_refined
            , x="engagement_index"
            , bins=50)

plt.show()

print("engagement_index skew: ", engagement_df_refined["engagement_index"].skew())

In [None]:
engagement_df_refined["district_id"] = engagement_df_refined["district_id"].astype("category")
engagement_df_refined["days_since"] = engagement_df_refined["days_since"].astype("category")

### Unique Learning Platforms

Now that the engagement data has been imputed and cleaned, let's explore the evolution of the Learning Platforms over the months in 2020. Let's look through the number of Unique Learning Platforms by month.

In [None]:
grouped = engagement_df_refined.groupby(by=["month"])["lp_id"].unique().reset_index(name="lp_id")
grouped

In [None]:
## https://stackoverflow.com/questions/23333786/reference-values-in-the-previous-row-with-map-or-apply
# Key under which the wrapper stores the previous row's computed value.
new_col = 'result'

def apply_func_decorator(func):
    """Give a row function access to the previously processed row.

    The decorated function is called as ``func(curr_row, prev_row)``. ``prev_row``
    is a dict holding the fields of the last row processed plus, under the key
    ``new_col``, the value ``func`` returned for it. It is empty on the first call.

    Parameters
    ----------
    func : callable
        Function taking ``(curr_row, prev_row)`` and returning the value for the
        current row.

    Returns
    -------
    callable
        ``wrapper(curr_row, **kwargs)``; keyword arguments are accepted but not
        forwarded to ``func``. The wrapper also exposes ``wrapper.reset()``, which
        clears the remembered row. The state lives as long as the decorated
        function, so call ``reset()`` before a second pass (for example when a
        notebook cell is re-run); otherwise that pass starts from the last row of
        the previous one.
    """
    import functools

    prev_row = {}

    @functools.wraps(func)  # keep the wrapped function's name and docstring
    def wrapper(curr_row, **kwargs):
        val = func(curr_row, prev_row)
        # Remember this row and its result for the next call.
        prev_row.update(curr_row)
        prev_row[new_col] = val
        return val

    wrapper.reset = prev_row.clear
    return wrapper

@apply_func_decorator
def running_total(curr_row, prev_row):
    """Return the sorted, de-duplicated ids from this row and the previous result.

    ``curr_row['lp_id']`` holds the ids of the current row; ``prev_row`` is the
    mapping supplied by the decorator, whose "result" entry (empty when absent)
    is the value returned for the preceding row.
    """
    carried_over = list(prev_row.get("result", []))
    this_row = list(curr_row['lp_id'])
    return np.unique(this_row + carried_over)

In [None]:
# Create the "result" column up front; it is overwritten by the apply below.
grouped["result"] = np.nan
# Row-wise pass in the frame's row order: for each month, running_total merges that
# month's lp_id values with the result it produced for the previous row.
# NOTE(review): the previous-row state lives in the decorator's closure and is never
# cleared, so re-running this cell without re-running the cell that defines
# running_total presumably starts from the last month's set - confirm before
# executing cells out of order.
grouped["result"] = grouped.apply( running_total
                                 , axis=1)

grouped["result"]

In [None]:
grouped["count"] = grouped["result"].apply(lambda x: len(x))

In [None]:
grouped["count"]

In [None]:
plt.figure(figsize=(8, 6))

sns.barplot(data=grouped
            ,x="month"
            ,y="count"
            ,palette="cubehelix_r"
           )

plt.ylabel("unique learning platforms")
plt.show()

- **Remark:** Unique Learning Platforms (ULP) are those which don't have any twin star.

    - January got started with 3857 learning platforms. The number of unique platforms  gradually increased linearly till August; then entered the saturation region where the growth is extremely low or not too obvious compared to the first half of the year. 
    
    - 8583 is the final count of Unique Learning Platforms at Dec, 2020.

### Newly-launched Learning Platforms

In [None]:
grouped["lp_id"] = grouped["lp_id"].apply(np.sort)
grouped

In [None]:
# Newly seen platforms per month = month-over-month change of the cumulative count.
# The first month has no predecessor, so its value is set to 0.
# Built in one assignment instead of fillna(inplace=True) on the selected column,
# which is chained assignment and has no effect under pandas copy-on-write.
grouped["new_count"] = grouped["count"].diff().fillna(0)

In [None]:
grouped

In [None]:
plt.figure(figsize=(8, 6))

sns.barplot(data=grouped
            ,x="month"
            ,y="new_count"
            ,palette="cubehelix_r"
           )

plt.ylabel("Newly-launched Learning Platforms")
plt.show()

- **Remark:** Newly-launched Learning Platforms (NLP) are those which got a fresh start in providing the digital learning service.

    - In February, 1832 learning platforms got registered; this is the peak of newly-launched ones. However, no specific growth or decline had been manifested later on. Other than a few bounces (~500 platforms), the newly-launched got bounded by ~250 platforms over various months of the year.

### Discontinued Learning Platforms

In [None]:
# Cumulative platform set of the preceding month, as a plain list. The first month
# has no predecessor: its missing value becomes "" and therefore an empty list.
previous_cumulative = grouped["result"].shift(1)
grouped["prev_result"] = previous_cumulative.fillna("").apply(list)

In [None]:
# Platforms seen in any earlier month that do not appear in the current month.
seen_before = grouped["prev_result"].map(set)
active_now = grouped["lp_id"].map(set)
grouped["discontinued"] = (seen_before - active_now).map(list)
grouped

In [None]:
grouped["closed_count"] = grouped["discontinued"].apply(lambda x: len(x))
grouped

In [None]:
plt.figure(figsize=(8, 6))

sns.barplot( x="month"
            ,y="closed_count"
            ,data=grouped
            ,palette="cubehelix_r"
           )

plt.ylabel("discontinued learning platforms")
plt.show()

- **Remark:** Discontinued Learning Platforms are those which stopped providing their service any further.

    - In February, 76 learning platforms from January stopped providing their service. The increasing trend in the discontinuation goes on till the end of year. Note that the number of discontinued platforms tends to get very high after September (the end of financial year); approximately 2X - 2.5X jump in October through December. The maximum number of Discontinued Platforms is observed in December with a value of 2982.

### Sustained Platforms

In [None]:
def intersection(a, b):
    """Return the elements present in both ``a`` and ``b`` as a list (duplicates removed)."""
    first, second = set(a), set(b)
    return list(first.intersection(second))

In [None]:
# Platforms active this month that had already been seen in an earlier month.
grouped["common"] = pd.Series(
    [intersection(current, earlier)
     for current, earlier in zip(grouped["lp_id"], grouped["prev_result"])],
    index=grouped.index,
)

grouped

In [None]:
grouped["common_count"] = grouped["common"].apply(len)
grouped

In [None]:
plt.figure(figsize=(8,6))

sns.barplot(x="month"
           ,y="common_count"
           ,data=grouped
           ,palette="cubehelix_r"
           )

plt.ylabel("sustained learning platforms")
plt.show()

- **Remark:** Sustained Learning Platforms are those which carry forward the previously seen learning platforms.

    - Example: In February, 3781 Learning Platforms from January were retained. This growth continued up until September (thus, reaching the peak 6689 LPs) and then gradually followed a declining trend in the final 3 months of the year. Note that, September is the end of financial year in the U.S. 

    - The most common number of Sustained Learning Platforms stayed close to 5500 throughout the year.
    
    - We haven't seen any data in December 2019, that is, the month before January 2020. Therefore, it can be regarded as N/A. But for the sake of simplicity, we filled it with 0.

### District Coverage

Let's explore the district-wise coverage in 2020.

In [None]:
engagement_df_refined["district_id"] = engagement_df_refined["district_id"].astype(int)

In [None]:
grouped = engagement_df_refined.groupby(by = ["month"])["district_id"].unique().reset_index(name="district_id")
grouped

In [None]:
@apply_func_decorator
def get_cumulative(curr_row, prev_row):
    """Return the sorted, de-duplicated district ids from this row and the previous result.

    ``curr_row['district_id']`` holds the ids of the current row; ``prev_row`` is
    the mapping supplied by the decorator, whose "result" entry (empty when
    absent) is the value returned for the preceding row.
    """
    carried_over = list(prev_row.get("result", []))
    this_row = list(curr_row['district_id'])
    return np.unique(this_row + carried_over)

In [None]:
# Create the "result" column up front; it is overwritten by the apply below.
grouped["result"] = np.nan
# Row-wise pass in the frame's row order: for each month, get_cumulative merges that
# month's district_id values with the result it produced for the previous row.
# NOTE(review): the previous-row state lives in the decorator's closure and is never
# cleared, so re-running this cell without re-running the cell that defines
# get_cumulative presumably starts from the last month's set - confirm before
# executing cells out of order.
grouped["result"] = grouped.apply( get_cumulative
                                  ,axis=1)

In [None]:
grouped

In [None]:
grouped["cum_districts_count"] = grouped["result"].apply(lambda x: len(x))
grouped

In [None]:
plt.figure(figsize=(8,6))

sns.barplot( data=grouped
            ,x="month"
            ,y="cum_districts_count"
            ,palette="cubehelix_r"
           )

plt.ylabel("cumulative districts count")
plt.show()

- **Remark:** The district-wise coverage remained almost same over various months of the year. It started off with 216 districts in January and then continues to grow by a few districts at each month, thus ended up with 233 districts in November through December.

Let's get ready for the next part of our analysis.

In [None]:
# Turn the calendar columns back into plain integers for the arithmetic that follows.
for column in ("weekofyear", "month", "days_since", "dayofweek"):
    engagement_df_refined[column] = engagement_df_refined[column].astype(int)

## Freemium vs. Paid subscriptions

This section groups the consumer bases and categorizes those by the duration of the offered weeks. Then it explores the returning habit of the categorized consumers.

In [None]:
# One row per (lp_id, district_id) pair that actually occurs in the data:
# first/last active week, number of records, and number of distinct active weeks.
# lp_id is categorical, so observed=True keeps the group-by from materialising
# every platform x district combination only to discard the empty ones afterwards.
grouped = (
    engagement_df_refined
    .groupby(by=["lp_id", "district_id"], observed=True)["weekofyear"]
    .agg(["min", "max", "size", "nunique"])
    .reset_index()
)

# Defensive: drop any pair left without a valid week range.
grouped = grouped.dropna()

grouped = grouped.rename( columns={ "min": "min_week"
                                  , "max": "max_week"
                                  , "size": "total_days_used"
                                  , "nunique": "total_active_weeks"
                                  }
                        )

# Inclusive number of weeks between the first and the last active week.
grouped["total_span_in_weeks"] = grouped["max_week"] - grouped["min_week"] + 1

grouped["lp_id"] = grouped["lp_id"].astype(int)
grouped["max_week"] = grouped["max_week"].astype(int)
grouped["min_week"] = grouped["min_week"].astype(int)

grouped.head(10)

In [None]:
grouped.describe()

In [None]:
plt.figure(figsize=(8, 6))

sns.histplot( x="total_span_in_weeks"
             ,data=grouped
             )

plt.xlabel("total span (in weeks)")
plt.xticks(rotation=30)
plt.show()

- **Remark:** This plot suggests two subscription categories: Freemium and Paid. The Freemium subscription is represented by the mode of the graph - those lasting for 1-week only.

In [None]:
# Count consumer bases by subscription type: a span of exactly one week is
# "freemium", anything longer is "paid".
span_in_weeks = grouped["total_span_in_weeks"]
dict_ = {"freemium": (span_in_weeks == 1).sum(), "paid": (span_in_weeks > 1).sum()}

grouped_free_or_paid = pd.Series(dict_).rename_axis("subscription").reset_index(name="count")
grouped_free_or_paid

In [None]:
((grouped["total_span_in_weeks"] > 1) &
 (grouped["total_days_used"] > 1)).sum() * 100. / (grouped["total_span_in_weeks"] > 1).sum()

In [None]:
((grouped["total_span_in_weeks"] == 1) &
 (grouped["total_days_used"] > 1)).sum() * 100. / (grouped["total_span_in_weeks"] == 1).sum()

In [None]:
fig = px.pie(
    grouped_free_or_paid, 
    names='subscription', 
    values='count',
    color_discrete_sequence=px.colors.sequential.Mint,
    title='Percentage of Subscription Types', 
    width=700,
    height=500,
    hole=0.5
)
fig.show()

**Remarks:**

- 70.4% Paid subscriptions. All of those have returning users.

- 29.6% Freemium subscriptions, lasted for 1 week only. ~6.12% of the freemium consumer bases showed returning habit by coming back to the Learning Platform after the first day.

### Freemium subscription:

- What percentage of Learning Platforms offered Freemium subscriptions?

- Did the offered weeks of Freemium exhibit any trend?

- How did the percentiles of days used evolve for the Freemium version?

In [None]:
# Share of learning platforms with at least one freemium consumer base. Freemium is
# defined throughout this notebook as total_span_in_weeks == 1, so filter on that
# column rather than on total_days_used.
grouped.loc[grouped["total_span_in_weeks"] == 1, "lp_id"].nunique() * 100. / grouped["lp_id"].nunique()

- ~98% learning platforms offered 1-week long freemium subscriptions across all 233 districts. 

In [None]:
plt.figure(figsize=(12, 6))

sns.countplot(x="max_week"
             ,data=grouped[ grouped["total_span_in_weeks"] == 1 ]
             )

plt.title("Freemium offers vs. week")
plt.xlabel("week")
plt.ylabel("freemium count")
plt.xticks(rotation=60)
plt.show()

- **Remark:** The offered-week shows a periodic trend for the 1-week long freemium subscription. It reached the first peak at about March 2020 for a small time span only (~3 weeks).

In [None]:
# Freemium consumer bases: (lp_id, district_id) pairs active within a single week.
g = grouped.loc[grouped["total_span_in_weeks"] == 1]

# Days used at selected percentiles, from the 5th up to the 99.9th.
i = g['total_days_used'].quantile([0.05, 0.25, 0.5, 0.9, 0.95, 0.99, 0.999])

# Expose the quantile levels as a regular "percentile" column for plotting.
df_percentiles = i.rename_axis("percentile").reset_index()

plt.figure(figsize=(8, 6))

sns.lineplot(data=df_percentiles, x="percentile", y="total_days_used")

plt.title("Freemium consumer-bases")
plt.ylabel("days used")
plt.show()

- **Remark:** The percentiles curve shows a sharp step-wise increase at the very end, at the ~94th percentile it rose to 2 days and finally (after the 99th) it took a sharp turn to 3 or more days.

### Premium (or Paid) subscription:

- How did the registration of consumer bases evolve over time?

- How did the percentiles of days used evolve for the Paid subscription?

In [None]:
plt.figure(figsize=(12, 6))

sns.countplot(x="min_week"
             ,data=grouped[ grouped["total_span_in_weeks"] > 1 ]
             )

plt.title("Paid registration vs. week")
plt.xlabel("week")
plt.ylabel("paid registration count")
plt.xticks(rotation=60)
plt.show()

- **Remark:** The paid registration count remained very high at the initial 2 weeks and then gradually showed a diminishing trend with the approach of the middle of the year.

In [None]:
# Paid consumer bases: (lp_id, district_id) pairs whose activity spans more than one week.
g = grouped.loc[grouped["total_span_in_weeks"] > 1]

# Days used at selected percentiles, from the 1st up to the 99.9th.
i = g['total_days_used'].quantile([0.01, 0.05, 0.25, 0.5, 0.6, 0.65, 0.7, 0.75, 0.8, 0.825, 0.8375, 0.85, 0.9, 0.95, 0.99, 0.995, 0.999])

# Expose the quantile levels as a regular "percentile" column for plotting.
df_percentiles = i.rename_axis("percentile").reset_index()

plt.figure(figsize=(8, 6))

sns.lineplot(data=df_percentiles, x="percentile", y="total_days_used")

plt.title("Paid consumer-bases")
plt.ylabel("days used")
plt.show()

- **Remarks:**

    - When total_span_in_weeks > 1, the consumer bases exhibit a returning habit by coming back to the learning platform after the first day.

    - The number of days used follows an exponential pattern. It starts off with a minimum of 2 days, grows to a median of 12 days, crosses 100 days at about 83rd percentile and eventually touches 366 days at about 99.9th percentile. 

## Opportunity Analysis (Freemium):

The next part of this notebook will provide a glimpse into the Freemium subscription only for brevity.

- **Key Concepts.** Identify prospective consumer-bases or (lp, district) combinations by the following aspects:

    - Returning consumers. Those who viewed the course for multiple days. If a user gets back to the freemium version for several days, there is a strong chance that he/she found it useful.
    
    - Engaged consumers. Those with high access and engagement to the offered course. Sometimes it may happen that a user explored the freemium version in a single day for a significant amount of time (maybe on a weekend or a holiday).

### Freemium: Clustering by Access Pattern

In [None]:
# Freemium consumer bases: (lp_id, district_id) pairs whose activity spans exactly one week.
g = grouped[ grouped["total_span_in_weeks"] == 1 ]

# Inner-join every freemium pair with its engagement rows from that same week
# (min_week is matched against weekofyear), bringing in pct_access and
# engagement_index; pairs without a matching row are dropped by the inner join.
# NOTE(review): lp_id was cast to int in `grouped` but is categorical in
# engagement_df_refined - confirm the join keys compare as intended.
g_details = pd.merge(g[["lp_id", "district_id", "min_week", "total_days_used"]]
                    ,engagement_df_refined[["lp_id", "district_id", "weekofyear", "pct_access", "engagement_index"]]
                    ,how="inner"
                    ,left_on=["lp_id", "district_id", "min_week"]
                    ,right_on=["lp_id", "district_id", "weekofyear"]
                    )

g_details["lp_id"] = g_details["lp_id"].astype(int)

g_details.head(10)

In [None]:
g_details.describe()

In [None]:
g_details_agg = g_details.groupby(by=["lp_id", "district_id"]).mean().reset_index()
g_details_agg.head()

In [None]:
g_details_agg.describe()

- **Freemium consumer-base Statistics.**

    - pct_access: ranges between 0.0% and 7.5%. More than 50% of the consumer-bases appear to be inactive or non-responsive.
    
    - engagement_index: ranges between 0.0 and 1451.36. More than 75% of the consumer-bases show extremely low engagement (< 1.0)
    
- Let's split the freemium consumer bases into 2 groups:

    - Inactive bases. (lp_id, district_id) with pct_access = 0.
    
    - Active bases. (lp_id, district_id) with pct_access > 0.

In [None]:
# Split the freemium consumer bases by mean pct_access: exactly zero is "inactive",
# anything above zero is "active".
mean_access = g_details_agg["pct_access"]
dict_ = {"inactive": (mean_access == 0.).sum(), "active": (mean_access > 0.).sum()}

grouped_freemium = pd.Series(dict_).rename_axis("activity type").reset_index(name="count")
grouped_freemium

In [None]:
fig = px.pie(
    grouped_freemium, 
    names='activity type', 
    values='count',
    color_discrete_sequence=px.colors.sequential.Mint,
    title='Percentage of Active Consumer Bases in Freemium', 
    width=700,
    height=500,
    hole=0.5
)
fig.show()

- **Remark.** ~53.2% Freemium consumer-bases appeared to be inactive or non-responsive.

### Inactive (lp, district) combinations

Let's group the inactive consumer-bases based on the returning habit.

In [None]:
freemium_inactive = g_details_agg[ g_details_agg["pct_access"] == 0. ]

freemium_inactive.describe()

In [None]:
(freemium_inactive["total_days_used"] == 1).sum() * 100. / freemium_inactive.shape[0]

In [None]:
(freemium_inactive["total_days_used"] > 1).sum() * 100. / freemium_inactive.shape[0]

- **Freemium Inactive Returning habit.**

    - ~94.24% didn't return after using the freemium version for 1 day.
    
    - ~5.76% used the freemium version for multiple days; seems they found it useful.

### Active (lp, district) combinations

- ~46.79% of the Freemium consumer-bases showed an active usage habit.

- Let's group those first based on the usage pattern, i.e., pct_access and engagement_index.

In [None]:
freemium_active = g_details_agg[ g_details_agg["pct_access"] > 0.]

freemium_active.describe()

In [None]:
freemium_active.shape[0] * 100. / g_details_agg.shape[0]

In [None]:
# Usage metrics of the active bases, re-indexed from 0.
features = freemium_active.loc[
    :, ["total_days_used", "pct_access", "engagement_index"]
].reset_index(drop=True)

In [None]:
# Count missing values per column of `features`.
features.isna().sum()

In [None]:
# Percentage of active bases that were used on exactly one day.
100. * freemium_active["total_days_used"].eq(1).sum() / len(freemium_active)

In [None]:
# Percentage of active bases that were used on more than one day.
100. * freemium_active["total_days_used"].gt(1).sum() / len(freemium_active)

- **Freemium Active.** Returning habit of active consumer bases:

    - 93.47% didn't return after the first day of usage.
    
    - 6.52% used the freemium version for multiple days, therefore there is a strong possibility that they found it useful.

In [None]:
# Identifier columns plus the usage metrics of the active bases, re-indexed from 0.
features = freemium_active.loc[
    :, ["lp_id", "district_id", "total_days_used", "pct_access", "engagement_index"]
].reset_index(drop=True)
features.shape

In [None]:
# Peek at the first rows of the frame that will be clustered.
features.head()

In [None]:
# Min-max scale the two usage metrics onto a common range before clustering.
scaler = preprocessing.MinMaxScaler()
features_normal = scaler.fit_transform(
    features.loc[:, ["pct_access", "engagement_index"]]
)

In [None]:
# Summary statistics of the scaled columns (0 = pct_access, 1 = engagement_index).
pd.DataFrame(features_normal).describe()

In [None]:
# Elbow method: record the within-cluster sum of squares (inertia) for k = 1..9.
inertia = []
K = range(1, 10)
for k in K:
    # Fit once per k (the estimator was previously fitted twice) and pin the
    # seed so the curve is reproducible and consistent with the final model,
    # which also uses random_state=0.
    kmeanModel = KMeans(n_clusters=k, random_state=0).fit(features_normal)
    inertia.append(kmeanModel.inertia_)

In [None]:
# Elbow plot: inertia against the number of clusters k.
plt.plot(K, inertia, color='b', marker='x', linestyle='-')
plt.xlabel('k')
plt.ylabel('Inertia')
plt.show()

In [None]:
# Final model: three clusters, seeded so the assignment is repeatable.
kmeans = KMeans(random_state=0, n_clusters=3)
kmeans = kmeans.fit(features_normal)

In [None]:
# KMeans numbers its clusters arbitrarily, yet the write-up and the group names
# assigned further down assume 0 = low, 1 = medium, 2 = high pct_access.
# Renumber the clusters by ascending pct_access centroid (column 0 of
# features_normal) so that assumption holds regardless of how the clusters
# happened to be initialised.
centroid_order = np.argsort(kmeans.cluster_centers_[:, 0])
rank_of_cluster = np.empty_like(centroid_order)
rank_of_cluster[centroid_order] = np.arange(len(centroid_order))

# One label per row of `features`; both frames carry a 0..n-1 index, so the
# column-wise concat lines them up row by row.
labels = pd.DataFrame(rank_of_cluster[kmeans.labels_])
labeledCombs = pd.concat((features, labels), axis=1)
labeledCombs = labeledCombs.rename({0: 'labels'}, axis=1)

In [None]:
# Shape of the clustering input, for comparison with labeledCombs.
features.shape

In [None]:
# Same row count as `features` is expected, with one extra column for the labels.
labeledCombs.shape

In [None]:
# Count missing values per column after attaching the labels.
labeledCombs.isna().sum()

In [None]:
# Drop rows in which both usage metrics are missing (a row with at least one of
# them present is kept). The trailing .copy() makes labeledCombs an independent
# frame, so the column added to it later is not a write into a filtered slice.
labeledCombs = labeledCombs[
    labeledCombs["pct_access"].notna() | labeledCombs["engagement_index"].notna()
].copy()
labeledCombs.shape

In [None]:
# Peek at the labelled rows.
labeledCombs.head()

In [None]:
# Scatter of the two usage metrics coloured by cluster label; no regression line.
sns.lmplot(
    data=labeledCombs,
    x='pct_access',
    y='engagement_index',
    hue='labels',
    fit_reg=False,
)

In [None]:
# Pairwise plots of the columns of labeledCombs, coloured by cluster label.
sns.pairplot(data=labeledCombs, hue='labels')

- **Groups.** pct_access looks like the key differentiating factor. A total of 3 groups were found.

    - Group 0: Low pct_access. 
    
    - Group 1: Medium pct_access. 
    
    - Group 2: High pct_access. 
    
- Group 0 performed very poorly with respect to the usage pattern. Therefore, some opportunity for expansion may lie in the consumer-bases belonging to Group 1 and Group 2. Later on, we'll strip out the returning users from all these groups and form a new Group for **Active & Returning** consumer-bases.

In [None]:
# Dummy constant column: gives the strip/swarm plots a single shared x-axis
# category. The value itself is arbitrary.
labeledCombs['Constant'] = "Data"

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
fig.suptitle("pct_access and engagement_index distribution")

# One strip plot per usage metric, side by side. Points are coloured by cluster
# label and share the single dummy x category.
for panel, metric in zip(axes, ("pct_access", "engagement_index")):
    sns.stripplot(
        x=labeledCombs['Constant'],
        y=labeledCombs[metric],
        hue=labeledCombs['labels'],
        jitter=True,
        ax=panel,
    )

fig.tight_layout()
fig.show()

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
fig.suptitle("pct_access and engagement_index distribution")

# One box plot per usage metric, with one box per cluster label.
for panel, metric in zip(axes, ("pct_access", "engagement_index")):
    sns.boxplot(
        data=labeledCombs,
        x="labels",
        y=metric,
        ax=panel,
    )

fig.tight_layout()
fig.show()

In [None]:
# Columns on the active (clustered) side, checked before stacking it with the inactive side.
labeledCombs.columns

In [None]:
# Columns on the inactive side, for comparison with labeledCombs.
freemium_inactive.columns

In [None]:
# Stack the clustered active bases on top of the inactive ones, after dropping
# the columns that are removed from each side here.
# ignore_index=True gives the result a fresh 0..n-1 index: labeledCombs was
# re-indexed from 0 while freemium_inactive still carries the row labels of
# g_details_agg, so without it the stacked frame can contain duplicate labels.
grouped_1wk = pd.concat(
    (
        labeledCombs.drop(columns=["Constant"]),
        freemium_inactive.drop(columns=["min_week", "weekofyear"]),
    ),
    axis=0,
    ignore_index=True,
)

grouped_1wk.shape

In [None]:
# Count missing values per column of the stacked frame.
grouped_1wk.isna().sum()

In [None]:
# Distinct cluster labels present on the active side.
labeledCombs["labels"].unique()

In [None]:
## Returning customer in Group 3 for the inactive bases.
# Rows that still have no label get one from total_days_used: more than one day
# of use -> 3, otherwise -> 0.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on grouped_1wk["labels"]: the in-place form operates
# on an intermediate Series (chained assignment), which pandas deprecates and
# which no longer updates the parent frame under Copy-on-Write.
grouped_1wk["labels"] = grouped_1wk["labels"].fillna(
    grouped_1wk["total_days_used"].gt(1).map({True: 3, False: 0})
)

grouped_1wk.describe()

In [None]:
# Distinct labels after filling in the previously unlabelled rows.
grouped_1wk["labels"].unique()

In [None]:
# Profile the rows labelled 0, 1 or 2 that were used on more than one day.
grouped_1wk[
    grouped_1wk["labels"].isin([0, 1, 2]) & grouped_1wk["total_days_used"].gt(1)
].describe()

- All returning consumer bases with pct_access > 0.

In [None]:
# Move every row labelled 0, 1 or 2 that was used on more than one day into label 4.
grouped_1wk.loc[
    grouped_1wk["labels"].isin([0, 1, 2]) & grouped_1wk["total_days_used"].gt(1),
    "labels",
] = 4

In [None]:
# Summary statistics after the relabelling to group 4.
grouped_1wk.describe()

In [None]:
# Count the non-null lp_id entries per group label.
grouped = (
    grouped_1wk
    .groupby("labels")["lp_id"]
    .count()
    .reset_index(name="count")
)

# Attach a human-readable name to each numeric label.
grouped["groups"] = grouped["labels"].map({
    0: "Group 0 (Low Usage, Non-returning Consumers)",
    1: "Group 1 (Moderate Usage, Non-returning Consumers)",
    2: "Group 2 (High Usage, Non-returning Consumers)",
    3: "Group 3 (Inactive yet Returning Consumers)",
    4: "Group 4 (Active & Returning Consumers)",
})

grouped

In [None]:
# Donut chart of how the rows are distributed over the five named groups.
fig = px.pie(
    data_frame=grouped,
    values="count",
    names="groups",
    hole=0.5,
    title="Percentage of Groups in the Engagement Information Data:",
    color_discrete_sequence=px.colors.sequential.Mint,
    width=700,
    height=500,
)
fig.show()

- **Which groups can be targeted for a Paid subscription ?**

- Prospective target: 100 - 92.4 = 7.6%

    - Returning target: 3.07 + 3.05 = 6.12%

    - Non-returning yet engaged = 1.47 + 0.0245 = 1.4945%

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
fig.suptitle('Compare Groups: pct_access and engagement_index')

# One panel per usage metric, each with one box per group label.
for panel, metric in zip(axes, ("pct_access", "engagement_index")):
    panel.set_title(f"{metric} distribution")
    sns.boxplot(
        data=grouped_1wk,
        x="labels",
        y=metric,
        palette="cubehelix",
        ax=panel,
    )
    panel.set_xlabel("group")

fig.tight_layout()
fig.show()

**Group performance analysis (ref. following code cell):**

- Group 2 clearly stands out with an engaged user-base (median pct_access 1.975 and engagement_index 44.95).

- Group 1 comes next with a less engaged user-base (median pct_access 0.33 and engagement_index 5.79).

- Group 4 comes third with engaged & returning consumers; however, its median pct_access is 0.02 and its engagement_index is 0.715 (pretty low usage).

- Group 3 is a special one with returning consumers but zero engagement.

In [None]:
# Median of each usage metric per group label.
grouped_1wk.groupby("labels")[["pct_access", "engagement_index"]].median()

## Summary