In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

from collections import Counter
from pprint import pprint

from sklearn.cluster import KMeans

import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)

In [2]:
def plotly_create_scatter(x, y,
                          labels=[], color='lightseagreen',
                          symbol='circle', mode='markers',
                          opacity=1.0, size=7):
    """Build a Plotly scatter trace whose hover text comes from ``labels``.

    Parameters
    ----------
    x, y : sequence
        Point coordinates, forwarded unchanged to ``go.Scatter``.
    labels : sequence, optional
        Per-point hover text (``hoverinfo`` is fixed to ``'text'``).
    color, symbol, opacity, size
        Marker styling, forwarded as the trace's ``marker`` settings.
    mode : str
        Plotly drawing mode, e.g. ``'markers'`` or ``'lines+markers'``.

    Returns
    -------
    go.Scatter
        The configured trace; nothing is drawn here.
    """
    marker_style = {
        'symbol': symbol,
        'color': color,
        'opacity': opacity,
        'size': size,
    }
    return go.Scatter(
        x=x,
        y=y,
        mode=mode,
        text=labels,
        hoverinfo='text',
        marker=marker_style,
    )
    
    
def plotly_draw_scatter(*data, zero_margin=True, width=400, height=350):
    """Render the given traces inline as a single figure without a legend.

    Parameters
    ----------
    *data
        Traces to draw together (e.g. results of ``plotly_create_scatter``).
    zero_margin : bool
        When true, the left/right/bottom margins are 0; otherwise they are
        40/40/30. The top margin is 30 in both cases.
    width, height : int
        Figure size passed to the layout.
    """
    # Only the side and bottom margins depend on the flag.
    side, bottom = (0, 0) if zero_margin else (40, 30)
    figure_layout = go.Layout(
        height=height,
        width=width,
        hovermode='closest',
        margin=dict(l=side, r=side, b=bottom, t=30),
        showlegend=False,
    )
    py.iplot(go.Figure(data=data, layout=figure_layout), show_link=False)
    

def plotly_bins(data, bins):
    """Draw a histogram of ``data`` inline as a bar chart.

    Parameters
    ----------
    data : array-like
        Values to bin.
    bins : int or sequence
        Passed straight to ``np.histogram`` (bin count or explicit edges).
    """
    counts, bin_edges = np.histogram(data, bins)
    # Each bar is placed at the midpoint of its bin.
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
    bars = go.Bar(x=bin_centers, y=counts, marker=dict(color='lightseagreen'))
    figure_layout = go.Layout(height=220, width=500,
                              margin=dict(l=40, r=40, b=30, t=20))
    py.iplot(go.Figure(data=[bars], layout=figure_layout), show_link=False)

In [3]:
# Display floats without a fractional part in rendered frames.
pd.set_option('display.float_format', '{:.0f}'.format)

# Load the raw session log.
fn = "app_launch_data.csv"
df = pd.read_csv(fn)

df.shape

(3712154, 4)

In [4]:
# Preview the first rows of the loaded session log.
df.head()

Unnamed: 0,userid,app,session_start,session_duration
0,0,com.livli,0,329328
1,0,com.sec.android.app.launcher,639753,6928
2,0,com.vkontakte.android,646752,618933
3,0,com.sec.android.app.camera,1265715,5788
4,0,com.domobile.applock,1275045,7976


In [5]:
# Number of distinct users in the log. nunique() counts inside pandas instead
# of building a Python set from every row (missing ids are not counted).
df['userid'].nunique()

841

In [6]:
# Number of sessions (rows) recorded per app; the five most frequent apps.
df['app'].value_counts().head()

com.vkontakte.android                   870195
com.sec.android.app.launcher            348202
com.google.android.inputmethod.latin    223987
com.android.chrome                      184474
com.instagram.android                   170216
Name: app, dtype: int64

In [7]:
# Rows with the smallest session_start values.
df.sort_values(by='session_start', ascending=True).head()

Unnamed: 0,userid,app,session_start,session_duration
0,0,com.livli,0,329328
272945,66,com.livli,0,48434
271317,65,com.livli,0,12599
2684210,612,com.livli,0,109837
270023,64,com.livli,0,29323


In [8]:
# Rows with the largest session_duration values.
df.sort_values(by='session_duration', ascending=False).head()

Unnamed: 0,userid,app,session_start,session_duration
1216847,269,com.lenovo.deskclock,10365025955,43818410
1215192,269,com.lenovo.deskclock,6897904722,42297810
1213263,269,com.lenovo.deskclock,3074991555,42131291
1913500,431,com.ihandysoft.alarmclock,1857287820,42069321
627608,132,pl.nenter.app.flashlightgalaxys5,4081490732,42052567


In [9]:
# Total duration per app, summed over all of its sessions.
# The column is selected with a list: selecting several columns from a groupby
# with a bare tuple (['app', 'session_duration']) is rejected by pandas 2.x,
# and 'app' is the grouping key, so it is already the index of the result.
app_df = df.groupby('app')[['session_duration']].sum()

app_df.sort_values(by='session_duration', ascending=False).head()

Unnamed: 0_level_0,session_duration
app,Unnamed: 1_level_1
com.vkontakte.android,112779914216
com.google.android.youtube,58873366220
com.android.chrome,24865667479
com.instagram.android,21614179834
com.whatsapp,9012139855


In [10]:

# Sanity check: the per-app aggregate must have one row per distinct app.
print(df['app'].value_counts().shape)
# List-based column selection; the tuple form is rejected by pandas 2.x.
app_df = df.groupby('app')[['session_duration']].sum()
print(app_df.shape)

(7131,)
(7131, 1)


In [11]:
# Per-app session duration statistics: mean, std, min and max.
# The aggregations are given by name and the column labels are set explicitly,
# so the labels no longer depend on the __name__ of the NumPy functions, which
# is where the "amin"/"amax" suffixes came from.
app_df = df.groupby('app')['session_duration'].agg(['mean', 'std', 'min', 'max'])
app_df.columns = ['session_duration_mean', 'session_duration_std',
                  'session_duration_amin', 'session_duration_amax']
app_df.sort_values(by=['session_duration_mean'], ascending=False).head()

Unnamed: 0_level_0,session_duration_mean,session_duration_std,session_duration_amin,session_duration_amax
app,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
com.ihandysoft.alarmclock,13069313,14692908.0,2080,42069321
pl.nenter.app.flashlightgalaxys5,5310476,12976367.0,1387,42052567
com.audioaddict.sky,4759870,9739800.0,1744,41056152
com.maximumsoft.domashniy,3283976,2551330.0,38694,6648162
es.ottplayer.tv,2754194,,2754194,2754194


In [12]:
# top installs: number of distinct users with at least one session in each app
# (each (userid, app) pair is counted once); the five highest counts are shown.
df[['userid', 'app']].drop_duplicates()['app'].value_counts().head()

com.livli                     830
com.android.vending           830
com.vkontakte.android         825
com.google.android.youtube    798
com.android.chrome            739
Name: app, dtype: int64

In [13]:
def brand_stat(brand):
    """Return the top apps of a brand ranked by total session duration.

    Reads the notebook-level ``df``. An app belongs to the brand when its
    name contains ``brand`` as a literal, case-sensitive substring.

    Parameters
    ----------
    brand : str
        Substring to look for in the ``app`` column.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``app`` with a single ``session_duration`` column holding
        the summed duration, sorted descending; at most five rows.
    """
    # regex=False keeps the plain-substring semantics of str.find;
    # na=False makes rows without an app name simply not match.
    brand_rows = df[df['app'].str.contains(brand, regex=False, na=False)]
    # Select the column with a list: tuple-style selection on a groupby is
    # rejected by pandas 2.x, and 'app' is already the grouping key.
    totals = brand_rows.groupby('app')[['session_duration']].sum()
    return totals.sort_values(by='session_duration', ascending=False).head()

In [14]:
# Top apps by total session duration whose name contains "google".
brand_stat("google")

Unnamed: 0_level_0,session_duration
app,Unnamed: 1_level_1
com.google.android.youtube,58873366220
com.google.android.inputmethod.latin,3562969651
com.google.android.googlequicksearchbox,2465412943
com.google.android.deskclock,1118988736
com.google.android.apps.translate,781899545


In [15]:
# Top apps by total session duration whose name contains "yandex".
brand_stat("yandex")

Unnamed: 0_level_0,session_duration
app,Unnamed: 1_level_1
com.yandex.browser,2631466939
ru.yandex.searchplugin,610019867
ru.yandex.yandexnavi,366965472
ru.yandex.music,315577926
ru.yandex.yandexbus,252459673


In [16]:
# Top apps by total session duration whose name contains "microsoft".
brand_stat("microsoft")

Unnamed: 0_level_0,session_duration
app,Unnamed: 1_level_1
com.microsoft.office.word,416272553
com.microsoft.launcher,140740820
com.microsoft.office.powerpoint,123895090
com.microsoft.microsoftsolitairecollection,63319477
com.microsoft.office.excel,62247315


In [17]:
# Distribution of session_start over 50 equal-width bins.
plotly_bins(df['session_start'].tolist(), 50)

In [18]:
# Distribution of session_duration restricted to [0, 10**5]: np.linspace with
# its default sample count supplies the bin edges, and np.histogram does not
# count values that fall outside those edges.
plotly_bins(df['session_duration'].tolist(), np.linspace(0, 10**5))

In [19]:
#print(max(df[df['app']==app]['session_start']) // (24 * 3600000))

In [20]:
def app_week_usage():
    """Plot the share of sessions by position within a 7-day cycle.

    Reads the notebook-level ``df``. ``session_start`` is folded modulo one
    week and converted to days (0-7), treating the raw values as milliseconds
    (3600000 per hour). The folded values are split into 100 bins and each
    bin's fraction of all sessions is drawn against the bin centre.
    """
    ms_per_day = 24 * 3600000
    # Vectorised equivalent of mapping x % week / day over every row.
    day_in_week = df['session_start'] % (7 * ms_per_day) / ms_per_day
    counts, edges = np.histogram(day_in_week, bins=100)

    # Bin centres, computed the same way as in plotly_bins. The previous
    # expression subtracted the first edge (x[:1]) from the right edges, which
    # shifts every x coordinate by the value of that first edge.
    centers = (edges[1:] + edges[:-1]) / 2
    time_plot = plotly_create_scatter(centers,
                                      counts / counts.sum(),
                                      mode='lines+markers')
    plotly_draw_scatter(time_plot, zero_margin=False, height=220, width=700)

app_week_usage()

In [21]:
# apps per user: number of distinct apps seen in each user's sessions.
# One grouped pass replaces filtering the whole frame once per user.
# sort=False keeps users in order of first appearance (as userid.unique() did);
# dropna=False counts a missing app name as a value, as Series.unique() did.
app_cn = df.groupby('userid', sort=False)['app'].nunique(dropna=False).tolist()

plotly_bins(app_cn, bins=30)

In [22]:
# sessions per user: number of rows recorded for each user.
# One grouped pass replaces filtering the whole frame once per user;
# sort=False keeps users in order of first appearance.
sess_cn = df.groupby('userid', sort=False).size().tolist()

plotly_bins(sess_cn, bins=30)

In [23]:
# Binary user x app matrix: one row per user (in order of first appearance),
# one column per app (in order of first appearance), 1 where the user has at
# least one session in the app and 0 otherwise.
# All rows are collected first and the frame is built once. Appending a row
# per iteration copied the whole frame every time, and DataFrame.append is
# not available in pandas 2.x.
app_columns = df['app'].unique()
user_records = [dict.fromkeys(apps.unique(), 1)
                for _, apps in df.groupby('userid', sort=False)['app']]
feature_df = pd.DataFrame(user_records, columns=app_columns).fillna(0)

feature_df.shape

(841, 7131)

In [24]:
%%time
# Elbow scan: fit KMeans for k = 2..19 and record the inertia of each fit.
# random_state is fixed (same seed as the final model fitted later in the
# notebook) so the curve is reproducible across runs.
score = []
for j in range(2, 20):
    kmeans = KMeans(n_clusters=j, random_state=0).fit(feature_df)
    score.append(kmeans.inertia_)

CPU times: user 1min 49s, sys: 4.66 s, total: 1min 53s
Wall time: 1min 33s


In [25]:
# Inertia against the number of clusters tried (2..19).
cluster_counts = list(range(2, 20))
score_plot = plotly_create_scatter(cluster_counts, score, mode='lines+markers')
plotly_draw_scatter(score_plot, zero_margin=False, height=220)

In [26]:
%%time
# Final clustering with a fixed number of clusters.
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(feature_df)
# Stack the cluster centres on top of the user rows so that both are embedded
# together; the first n_clusters rows are the centres.
visual_data = np.concatenate([kmeans.cluster_centers_, feature_df.values], axis=0)

CPU times: user 4.21 s, sys: 259 ms, total: 4.47 s
Wall time: 3.9 s


In [27]:
%%time
# Project the centres and users to 2-D for plotting; the seed is fixed.
tsne = TSNE(n_components=2, random_state=3)
X_embedded = tsne.fit_transform(visual_data)

X_embedded.shape

CPU times: user 29.8 s, sys: 1.53 s, total: 31.3 s
Wall time: 31.5 s


In [28]:
# The first n_clusters embedded rows are the cluster centres; the rest are users.
center_coords = X_embedded[:n_clusters]
point_coords = X_embedded[n_clusters:]

centers_x, centers_y = list(center_coords[:, 0]), list(center_coords[:, 1])
points_x, points_y = list(point_coords[:, 0]), list(point_coords[:, 1])

# Users as translucent dots, centres as black crosses drawn on top.
points = plotly_create_scatter(points_x, points_y, labels=[],
                               color='lightseagreen', opacity=0.4)
centers = plotly_create_scatter(centers_x, centers_y, labels=[],
                                color='black', symbol='x', size=9)
plotly_draw_scatter(points, centers)