In [2]:
import pandas as pd
import os

This notebook produces summary statistics for event metrics.

In [9]:
# Path of the dataset CSV, relative to the notebook's working directory.
data_set_path = '../create-churn-dataset/socialnet_dataset.csv'
# The first two CSV columns become the (two-level) row index, so only the
# remaining columns are treated as data.
churn_data = pd.read_csv(filepath_or_buffer=data_set_path, index_col=[0, 1])

In [15]:
# Build a per-metric summary table: one row per column of churn_data, one
# column per statistic.

# In order to get the is_churn flag to appear in the describe() stats it must
# be numeric, so convert it to float.
churn_data['is_churn'] = churn_data['is_churn'].astype(float)
summary = churn_data.describe().transpose()

# Additional columns for each metric: skewness, the 1st and 99th percentiles
# (together they bracket the central 98% of observations -- these are plain
# percentiles, not a confidence interval), and the fraction of rows that are
# non-zero.
# numeric_only=True keeps skew()/quantile() from raising on a non-numeric
# column; the assignments align on summary's index, so only the metrics that
# describe() reported end up in the table.
summary['skew'] = churn_data.skew(numeric_only=True)
summary['1%'] = churn_data.quantile(q=0.01, numeric_only=True)
summary['99%'] = churn_data.quantile(q=0.99, numeric_only=True)
# astype(bool) would cast NaN to True and count missing values as non-zero,
# so test explicitly for "present and not equal to zero".
summary['nonzero'] = (
    (churn_data.notna() & churn_data.ne(0)).sum(axis=0) / churn_data.shape[0]
)

# Reorder columns for nicer presentation.
summary = summary[['count', 'nonzero', 'mean', 'std', 'skew', 'min', '1%',
                   '25%', '50%', '75%', '99%', 'max']]
# Rename the percentile columns: '1%' -> '1pct', '25%' -> '25pct', ...
# regex=False makes it explicit that "%" is a literal, not a pattern.
summary.columns = summary.columns.str.replace("%", "pct", regex=False)

In [16]:
# Display the summary table: one row per dataset column, one column per statistic.
summary

Unnamed: 0,count,nonzero,mean,std,skew,min,1pct,25pct,50pct,75pct,99pct,max
is_churn,9307.0,0.136564,0.136564,0.343405,2.117118,0.0,0.0,0.0,0.0,0.0,1.0,1.0
like_per_month,9307.0,0.982809,69.742667,148.226606,9.430774,0.0,0.0,11.0,27.0,70.0,618.94,4057.0
newfriend_per_month,9307.0,0.843881,4.830558,6.286209,3.674039,0.0,0.0,1.0,3.0,6.0,30.0,100.0
post_per_month,9307.0,0.967766,29.902439,57.498481,8.456573,0.0,0.0,6.0,14.0,32.0,256.88,1395.0
adview_per_month,9307.0,0.964113,30.388525,60.387084,12.755622,0.0,0.0,6.0,15.0,34.0,231.0,1998.0
dislike_per_month,9307.0,0.926077,11.356291,17.055435,6.304839,0.0,0.0,3.0,6.0,14.0,77.0,406.0
unfriend_per_month,9307.0,0.219082,0.245944,0.493024,1.976274,0.0,0.0,0.0,0.0,0.0,2.0,4.0
message_per_month,9307.0,0.953906,40.780273,94.143806,7.832462,0.0,0.0,5.0,13.0,38.0,444.7,2266.0
reply_per_month,9307.0,0.823574,15.460084,34.930147,6.991935,0.0,0.0,1.0,4.0,15.0,165.88,852.0
account_tenure,9307.0,1.0,28.413882,18.960799,2.386889,16.0,16.0,19.0,21.0,24.0,109.0,114.0


In [17]:
# Write the summary table to CSV (relative to the working directory).
# The header row is written explicitly; the metric names in the index are
# written as the first column by to_csv's default.
save_path = 'socialnet_dataset_summarystats.csv'
summary.to_csv(path_or_buf=save_path, header=True)