In [13]:
import pandas as pd
import numpy as np
import tensorflow as tf

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

2024-04-11 15:22:51.691964: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [14]:
# helper functions

def featureProportion(path, frac_features, frac_features_names, drop_features,
                      total_courses_col="cnt_courses_in_specialization"):
    """Load a CSV and add columns expressing counts as a fraction of a total.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file to read.
    frac_features : sequence of str
        Names of the existing count columns to divide by ``total_courses_col``.
    frac_features_names : sequence of str
        Names of the new fraction columns, paired one-to-one (by position)
        with ``frac_features``.
    drop_features : list of str
        Columns removed from the frame after the fractions have been added.
    total_courses_col : str, optional
        Name of the denominator column. Defaults to
        ``"cnt_courses_in_specialization"``.

    Returns
    -------
    pandas.DataFrame
        The loaded data without any row that contained a missing value, with
        the fraction columns appended and ``drop_features`` removed.

    Raises
    ------
    ValueError
        If ``frac_features`` and ``frac_features_names`` differ in length.
    """
    frac_features = list(frac_features)
    frac_features_names = list(frac_features_names)

    # zip() would silently stop at the shorter list and skip features.
    if len(frac_features) != len(frac_features_names):
        raise ValueError(
            "frac_features and frac_features_names must have the same length "
            f"(got {len(frac_features)} and {len(frac_features_names)})"
        )

    df = pd.read_csv(path, low_memory = False)

    # Rows with a missing value in ANY column are discarded. The explicit copy
    # gives an independent frame, so the column assignments below cannot raise
    # chained-assignment warnings.
    df = df.dropna().copy()

    # NOTE: a zero in the denominator column yields inf/NaN here (pandas
    # division semantics); it is not treated specially.
    for feature, name in zip(frac_features, frac_features_names):
        df[name] = df[feature] / df[total_courses_col]

    return df.drop(columns = drop_features)

In [15]:

# Lift the column-width limit so long text cells are shown in full.
pd.set_option('display.max_colwidth', None)

# Load the data dictionary; the bare name on the last line renders the table.
data_descriptions = pd.read_csv('data_descriptions.csv')
data_descriptions

Unnamed: 0,Column_name,Column_type,Data_type,Description
0,subscription_id,Identifier,character,Unique identifier of each subscription
1,observation_dt,Identifier,date,The date on which the subscription was observed to calculate the features in the dataset. It was chosen at random amongst all the dates between the start of the subscription and the end of the subscription (before cancellation)
2,is_retained,Target,Integer,"TRAINING SET ONLY! 0 = the learner cancelled their subscription before next payment, 1 = the learner made an additional payment in this subscription"
3,specialization_id,Feature - Specialization Info,character,Unique identifier of a specialization (each subscription gives a learner access to a particular specialization)
4,cnt_courses_in_specialization,Feature - Specialization Info,integer,number of courses in the specialization
5,specialization_domain,Feature - Specialization Info,character,"primary domain of the specialization (Computer Science, Data Science, etc.)"
6,is_professional_certificate,Feature - Specialization Info,boolean,"BOOLEAN for whether the specialization is a ""professional certicate"" (a special type of specialization that awards completers with an industry-sponsored credential)"
7,is_gateway_certificate,Feature - Specialization Info,boolean,"BOOLEAN for whether the specialization is a ""gateway certificate"" (a special type of specialization geared towards learners starting in a new field)"
8,learner_days_since_registration,Feature - Learner Info,integer,Days from coursera registration date to the date on which the observation is made
9,learner_country_group,Feature - Learner Info,character,"the region of the world that the learner is from (United States, East Asia, etc.)"


#### load the data

In [20]:
# Read the three dataset splits. low_memory=False makes pandas parse
# training.csv in one pass rather than in chunks, which avoids mixed-type
# column inference.
training_data = pd.read_csv("training.csv", low_memory=False)
validation_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ("validation.csv", "test.csv")
)

# (rows, columns) of the training split.
training_data.shape

(248373, 38)

In [21]:
# List every column label of the raw training frame.
training_data.columns

Index(['Unnamed: 0', 'subscription_id', 'observation_dt', 'is_retained',
       'specialization_id', 'cnt_courses_in_specialization',
       'specialization_domain', 'is_professional_certificate',
       'is_gateway_certificate', 'learner_days_since_registration',
       'learner_country_group', 'learner_gender',
       'learner_cnt_other_courses_active',
       'learner_cnt_other_courses_paid_active',
       'learner_cnt_other_courses_items_completed',
       'learner_cnt_other_courses_paid_items_completed',
       'learner_cnt_other_transactions_past', 'learner_other_revenue',
       'subscription_period_order', 'days_since_last_payment',
       'days_til_next_payment_due',
       'cnt_enrollments_started_before_payment_period',
       'cnt_enrollments_completed_before_payment_period',
       'cnt_enrollments_active_before_payment_period',
       'cnt_items_completed_before_payment_period',
       'cnt_graded_items_completed_before_payment_period',
       'is_subscription_started_wit

In [22]:
# Preview the first rows of the raw training frame.
training_data.head()

Unnamed: 0.1,Unnamed: 0,subscription_id,observation_dt,is_retained,specialization_id,cnt_courses_in_specialization,specialization_domain,is_professional_certificate,is_gateway_certificate,learner_days_since_registration,...,cnt_enrollments_completed_during_payment_period,cnt_enrollments_active_during_payment_period,cnt_items_completed_during_payment_period,cnt_graded_items_completed_during_payment_period,is_active_capstone_during_pay_period,sum_hours_learning_before_payment_period,sum_hours_learning_during_payment_period,cnt_days_active_before_payment_period,cnt_days_active_during_payment_period,cnt_days_since_last_activity
0,139734,XKeF8IJMEeu0UgqKK0UAdQ,2021-03-21,0.0,ZwOPfLMgEeW1-RKql4-XpQ,5.0,Business,False,False,34.0,...,0.0,1.0,5.0,0.0,False,2.433333,0.65,3.0,1.0,0.0
1,151769,1c702INZEeuXJw4d4vXLEQ,2021-04-12,0.0,Z-5wCcbTEeqeNBKhfgCLyw,7.0,Computer Science,True,True,1513.0,...,0.0,0.0,0.0,0.0,False,0.0,0.0,0.0,0.0,31.0
2,11767,2_OpkQktEeyFWAp3BiJlJQ,2022-04-22,1.0,kr43OcbTEeqeNBKhfgCLyw,8.0,Data Science,True,True,303.0,...,0.0,0.0,0.0,0.0,False,56.683333,0.0,28.0,0.0,156.0
3,402361,ll6Bim_iEeuyRw7ELLda1Q,2021-07-27,1.0,W62RsyrdEeeFQQqyuQaohA,5.0,Data Science,False,False,2215.0,...,0.0,0.0,0.0,0.0,False,1.466667,0.0,7.0,0.0,134.0
4,348243,J3nNWmXBEey3uAqWLqeC6w,2022-03-15,1.0,Z-5wCcbTEeqeNBKhfgCLyw,7.0,Computer Science,True,True,2515.0,...,0.0,0.0,0.0,0.0,False,8.3,0.0,7.0,0.0,55.0


In [24]:
# Distinct values of the domain column. unique() keeps NaN as its own entry,
# so missing domains show up in the result.
training_data["specialization_domain"].unique()

array(['Business', 'Computer Science', 'Data Science',
       'Arts and Humanities', 'Information Technology',
       'Physical Science and Engineering', 'Personal Development',
       'Health', 'Social Sciences', 'Language Learning', 'Math and Logic',
       nan], dtype=object)

In [25]:
# Enrollment count columns that are converted to fractions.
frac_features = [
    "cnt_enrollments_started_before_payment_period",
    "cnt_enrollments_completed_before_payment_period",
    "cnt_enrollments_active_before_payment_period",
    "cnt_enrollments_started_during_payment_period",
    "cnt_enrollments_completed_during_payment_period",
    "cnt_enrollments_active_during_payment_period",
]

# Names of the derived fraction columns: the count name with its leading
# "cnt_" replaced by "frc_". Deriving them keeps the two lists aligned.
frac_features_names = [name.replace("cnt_", "frc_", 1) for name in frac_features]

# Raw count columns to drop once their fractions exist. These are the numeric
# count columns above, not categorical features. A separate list object is
# used so later edits to one list do not affect the other.
drop_features = list(frac_features)

In [57]:
# Columns used as the time and event variables in later cells.
time_col = "learner_days_since_registration"
event_col = "is_retained"

# Preprocess the training set: load the CSV, add the fraction columns and
# drop the raw count columns.
path_train = "training.csv"
train_data = featureProportion(
    path_train,
    frac_features,
    frac_features_names,
    drop_features,
)

In [58]:
# Turn the categorical and boolean columns into indicator columns.

# Columns that get_dummies expands into one indicator column per value.
categories = [
    "specialization_domain",
    "is_professional_certificate",
    "is_gateway_certificate",
    "learner_country_group",
    "is_subscription_started_with_free_trial",
    "is_active_capstone_during_pay_period",
]

# Drop "Unnamed: 0" and "learner_gender", then encode the remaining frame.
# NOTE: this overwrites train_data, so the cell must run exactly once after
# the preprocessing cell; a second run raises KeyError because the dropped
# and encoded columns no longer exist.
train_data = pd.get_dummies(
    train_data.drop(columns=["Unnamed: 0", "learner_gender"]),
    columns=categories,
)

In [62]:
# train_data.columns

In [65]:
# Feature columns: every column of train_data except the time and event
# columns, de-duplicated and sorted alphabetically.
features = sorted(set(train_data.columns) - {time_col, event_col})
features

['cnt_courses_in_specialization',
 'cnt_days_active_before_payment_period',
 'cnt_days_active_during_payment_period',
 'cnt_days_since_last_activity',
 'cnt_graded_items_completed_before_payment_period',
 'cnt_graded_items_completed_during_payment_period',
 'cnt_items_completed_before_payment_period',
 'cnt_items_completed_during_payment_period',
 'days_since_last_payment',
 'days_til_next_payment_due',
 'frc_enrollments_active_before_payment_period',
 'frc_enrollments_active_during_payment_period',
 'frc_enrollments_completed_before_payment_period',
 'frc_enrollments_completed_during_payment_period',
 'frc_enrollments_started_before_payment_period',
 'frc_enrollments_started_during_payment_period',
 'is_active_capstone_during_pay_period_False',
 'is_active_capstone_during_pay_period_True',
 'is_gateway_certificate_False',
 'is_gateway_certificate_True',
 'is_professional_certificate_False',
 'is_professional_certificate_True',
 'is_subscription_started_with_free_trial_False',
 'is_sub