In [49]:
import numpy as np
import pandas as pd
import matplotlib
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score

In [18]:
people = pd.read_csv("labeled_data_with_metrics.csv")
people = people.drop(columns= "Unnamed: 0", axis=1)
people

Unnamed: 0,distinct_id,locationSetting,notificationSettings,active_timespan,US,Facebook,Foodie,Google,Unknown,num_sessions,...,commentAdded,commentLength,churned,number_ratings,SQS,avg_session_duration,time_delta_slope,session_time_slope,time_delta_slope_90,session_time_slope_90
0,00000000-0000-0000-0000-000000000000,1,0,24.202130,0,0,1,0,0,2,...,0,0.000000,1,0.0,-1.000000,-1.000000,0.000000e+00,0.000000,0.000000e+00,0.000000
1,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,1,1,1.130949,1,0,0,1,1,97,...,0,0.000000,0,1.0,40.181818,82.854545,1.113364e+03,6.302727,0.000000e+00,0.000000
2,001210DC-54C8-43AD-B295-148F47818391,1,0,7.158310,1,0,0,1,1,61,...,0,0.000000,0,0.0,20.000000,47.940000,-1.171600e+04,6.667879,0.000000e+00,0.000000
3,0019C3B9-FA62-4AB4-A895-1390A1FA818C,1,1,0.000000,1,0,1,0,0,1,...,0,0.000000,0,0.0,-1.000000,-1.000000,0.000000e+00,0.000000,0.000000e+00,0.000000
4,0025A8A4-2590-4ECA-8CE8-419D710AE46F,1,0,113.687361,1,0,1,0,1,64,...,0,0.000000,0,1.0,27.375000,177.325000,6.155496e+05,-41.476190,-9.127842e+06,6.700000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7210,wandaandreu@gmail.com_5053,1,0,188.791597,1,0,1,0,1,5,...,0,0.000000,0,0.0,1.666667,54.700000,1.371636e+07,-44.650000,0.000000e+00,0.000000
7211,willwojt@gmail.com_6868,1,0,37.992164,1,0,1,0,1,178,...,5,2.252809,0,7.0,4.171717,61.190909,-1.221786e+02,0.021697,-1.221786e+02,0.021697
7212,wolphramite@gmail.com_291,1,0,79.125498,1,0,1,0,1,4,...,0,0.000000,1,0.0,1.000000,17.500000,0.000000e+00,0.000000,0.000000e+00,0.000000
7213,xbarbarazhong@gmail.com_6704,1,0,0.584016,1,0,1,0,1,21,...,0,0.000000,1,0.0,42.000000,124.150000,0.000000e+00,132.500000,0.000000e+00,0.000000


In [31]:
people.columns

Index(['distinct_id', 'locationSetting', 'notificationSettings',
       'active_timespan', 'US', 'Facebook', 'Foodie', 'Google', 'Unknown',
       'num_sessions', 'sessions_per_day', 'America/New_York',
       'America/Los_Angeles', 'America/Chicago', 'Asia/Kolkata',
       'America/Denver', 'other_timezone', 'commentAdded', 'commentLength',
       'churned', 'number_ratings', 'SQS', 'avg_session_duration',
       'time_delta_slope', 'session_time_slope', 'time_delta_slope_90',
       'session_time_slope_90'],
      dtype='object')

In [20]:
train, test = train_test_split(people, test_size=0.1, random_state=42)


In [43]:
Y_test = test['churned'].to_numpy()
X_test = test[['number_ratings', 'SQS', 'avg_session_duration',
       'time_delta_slope', 'session_time_slope', 'time_delta_slope_90',
       'session_time_slope_90']].to_numpy()

In [50]:
Y_train = train['churned'].to_numpy()
X_train = train[['number_ratings', 'SQS', 'avg_session_duration',
       'time_delta_slope', 'session_time_slope', 'time_delta_slope_90',
       'session_time_slope_90']].to_numpy()

In [51]:
lr_model = LogisticRegression()

In [52]:
lr_model.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [53]:
predictions = lr_model.predict(X_train)

In [54]:
acc = accuracy_score(Y_train, predictions)
acc

0.6089634991529339

# Distinguishing features

In [57]:
train_qual = train[['locationSetting', 'notificationSettings', 'US', 'Facebook', 'Foodie', 'Google', 'Unknown', 
                    'America/New_York',
       'America/Los_Angeles', 'America/Chicago', 'Asia/Kolkata',
       'America/Denver', 'other_timezone',
       'churned']]
train_quant = train[[
       'active_timespan', 
       'num_sessions', 'sessions_per_day', 'commentAdded', 'commentLength',
       'churned', 'number_ratings', 'SQS', 'avg_session_duration',
       'time_delta_slope', 'session_time_slope', 'time_delta_slope_90',
       'session_time_slope_90']]

In [60]:
train_qual.groupby("churned").mean()

Unnamed: 0_level_0,locationSetting,notificationSettings,US,Facebook,Foodie,Google,Unknown,America/New_York,America/Los_Angeles,America/Chicago,Asia/Kolkata,America/Denver,other_timezone
churned,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,0.951956,0.412073,0.933785,0.070219,0.886665,0.043117,0.514629,0.223899,0.117647,0.106868,0.02433,0.004928,0.522328
1,0.951941,0.413432,0.721195,0.065311,0.899877,0.034812,0.858287,0.318854,0.140173,0.134011,0.191929,0.006778,0.208256


In [61]:
train_quant.groupby("churned").mean()

Unnamed: 0_level_0,active_timespan,num_sessions,sessions_per_day,commentAdded,commentLength,number_ratings,SQS,avg_session_duration,time_delta_slope,session_time_slope,time_delta_slope_90,session_time_slope_90
churned,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,70.213749,47.270095,0.262612,0.443794,0.142454,0.779181,10.814419,40.34311,116959.386464,3.562816,-153178.147501,0.599774
1,23.220553,7.733518,0.042964,0.000616,0.003432,0.002773,6.726724,50.870589,48629.249464,9.857287,-3851.097807,7.038598


In [110]:
useful = ['churned', 'active_timespan', 'num_sessions', 'sessions_per_day', 
          'commentAdded', 'commentLength', 'number_ratings', 'time_delta_slope', 
          'session_time_slope', 'time_delta_slope_90', 'session_time_slope_90',]
useful2 = ['churned', 'SQS', 'avg_session_duration', 'Unknown', 'time_delta_slope',
       'session_time_slope','US']

In [111]:
train2 = train[useful]

In [112]:
Y_train2 = train2['churned'].to_numpy()
X_train2 = train2.drop(columns='churned', axis=1).to_numpy()


In [113]:
lr_model2 = LogisticRegression(solver="lbfgs")

In [114]:
lr_model2.fit(X_train2, Y_train2)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [115]:
predictions2 = lr_model2.predict(X_train2)

In [116]:
acc = accuracy_score(Y_train2, predictions2)
acc

0.5210226397659017