In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tslearn.clustering import TimeSeriesKMeans
import math
from tslearn.barycenters import dtw_barycenter_averaging
import base
from scipy.stats import entropy
from sklearn import preprocessing
from scipy.stats import f_oneway
import time
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Helpful functions

In [None]:
def plotting_values(column):
    """Return the distinct values of ``column`` and how often each occurs.

    Parameters
    ----------
    column : array-like
        Values to tally (anything ``np.unique`` accepts).

    Returns
    -------
    tuple
        ``(labels, occurrences)`` where ``labels`` is a list with the sorted
        unique values converted to ``str`` and ``occurrences`` is the NumPy
        array of matching counts returned by ``np.unique``.
    """
    uniques, occurrences = np.unique(column, return_counts=True)
    # Strings keep the x-axis limited to the observed values only.
    # Return `uniques` directly instead if numeric tick values are needed.
    labels = [str(value) for value in uniques]
    return labels, occurrences

# Merging the responses to app use data

In [None]:
# Load the app-usage table and the questionnaire responses.
app_data = pd.read_csv('APP_DATA.csv')
question_data = pd.read_csv('QUESTION_DATA.csv')

# Inner join: keep only (user, day) pairs present in both tables.
# The day column is called PERIOD_END_DATE on the app side and `date` on the
# questionnaire side, so both key columns remain in the result.
res = pd.merge(
    app_data,
    question_data,
    how='inner',
    left_on=['uuid', 'PERIOD_END_DATE'],
    right_on=['uuid', 'date'],
)

# Export is disabled. Re-enable it to (re)generate the merged file.
#res.to_csv('MERGED_DATA.csv', index=False)

# Data preprocessing for K-means (all categories)

In [None]:
#Deleting the tools-category as it has data we don't need
#Counting the total hourly app use of different users

df = pd.read_csv('MERGED_DATA.csv')
df = df.drop(['TOOLS'], axis=1)
# Row-wise sum over the columns at positions 3..40.
# NOTE(review): presumably these are exactly the app-category columns - confirm
# against the CSV header, the slice is purely positional.
df['total'] = df.iloc[:, 3:41].sum(axis=1)
df = df[['uuid', 'PERIOD_END_DATE', 'HOUR', 'total']]

#Creating a temporary file
#This file will have hours as columns

# One key per (user, day); the next cell splits it back on the first ':'.
df["index"] = df["uuid"] + ':' + df["PERIOD_END_DATE"]
df = df.drop(['uuid', 'PERIOD_END_DATE'], axis=1)


# Rows = "uuid:date" keys, columns = HOUR values. pivot_table aggregates
# duplicate (key, hour) pairs with its default aggfunc (mean).
df = df.pivot_table(values='total', index='index', columns='HOUR')

# Actually write the temporary file announced above: the next cell reads
# TEMP_FILE.csv and expects the key in a column named 'index', which is what
# to_csv emits for the named index of the pivot.
df.to_csv('TEMP_FILE.csv')

In [None]:
#Making the data prettier and more readable and filling NA-values
#
# Reads the pivoted table (one row per "uuid:date" key), splits the key back
# into its two parts, moves them to the front, fills missing hours with 0 and
# casts the hourly totals to int.

df = pd.read_csv('TEMP_FILE.csv')
# `n` has to be passed by keyword: since pandas 2.0 every argument of
# Series.str.split after `pat` is keyword-only, so the positional form
# str.split(':', 1, expand=True) raises TypeError there.
df[['uuid', 'period_end_date']] = df['index'].str.split(':', n=1, expand=True)
df = df.drop(['index'], axis=1)
cols = list(df.columns.values)
# After dropping 'index' the two key columns are the last two entries of
# `cols`; the first 24 entries are taken as the hourly columns.
# NOTE(review): assumes all 24 hours occur in the data - with fewer hour
# columns cols[0:24] would also pick up the key columns. Confirm.
df = df[[cols[-2]] + [cols[-1]] + cols[0:24]]
df = df.fillna(0)
# Hourly totals as integers (positions 2..25 once the keys are in front).
df2 = df.iloc[:, 2:26].astype(int)
df = df[[cols[-2]] + [cols[-1]]]
df = df.join(df2)
uid = df[['uuid', 'period_end_date']]
uid
#df.to_csv('PRETTY_TEMP_FILE.csv')

# Data preprocessing for single categories (faster if wanted to view single category)

In [None]:
# Collapse the individual game categories into one GAMES column and put it
# back where the game columns used to start.
df1 = pd.read_csv('APP_SUM_DATA.csv')
df1 = df1.drop(columns=['TOOLS'])

# Positions 9..23 are summed row-wise into GAMES (appended as the last column)
# and then removed.
# NOTE(review): presumably these 15 positions are the game sub-categories -
# confirm against the CSV header, the slice is purely positional.
df1['GAMES'] = df1.iloc[:, 9:24].sum(axis=1)
df1 = df1.drop(columns=list(df1.columns[9:24]))

# Rebuild the column order: first eight columns, GAMES, then positions 8..35.
# NOTE(review): if GAMES itself sits inside positions 8..35 after the drop,
# the second join sees an overlapping column name and raises - verify the
# column count of APP_SUM_DATA.csv.
df1 = (
    df1.iloc[:, 0:8]
    .join(df1['GAMES'])
    .join(df1.iloc[:, 8:36])
)

# Quick look at a slice of the reordered frame.
df1.iloc[:, 20:26]

# K-means and a self-created indicator value for one category (the indicator value is calculated from the Dunn index, entropy and balance values)

In [None]:
# Sweep over the number of clusters and record three quality measures per k:
# the Dunn index, the entropy of the cluster sizes and a size-balance value.
df = pd.read_csv('DD_clustering_series.csv')
# Column reorder: positions 9..26 first, then positions 3..8.
# NOTE(review): presumably this rotates the 24 hourly columns - confirm.
df = df.iloc[:, 9:27].join(df.iloc[:, 3:9])

# Snapshot of the pure time-series columns. The label columns written to `df`
# below must never be fed back into the clustering: fitting on `df` itself
# would treat the labels of earlier k values as extra time steps and would
# also leak them into the Dunn-index input.
features = df.copy()
feature_values = features.values

# Fixed seed so that a re-run reproduces the same clusterings.
RANDOM_STATE = 0

dunn_index_list = []
balance_list = []
entropy_list = []
score_list = []  # not filled in this cell
for k in range(1, 20): #Clustering from k=1 to k=19

    cl_count = k
    #K-means algorithm with dynamic time warping
    km = TimeSeriesKMeans(n_clusters=cl_count, metric="dtw", random_state=RANDOM_STATE)

    labels = km.fit_predict(features) #Giving the cluster labels to timeseries data

    # Kept for bookkeeping only; not part of `features`.
    df['cluster%s' % k] = labels

    # One 2-D array of member series per cluster id 0..k-1.
    cluster_list = [feature_values[labels == x] for x in range(k)]

    # NOTE(review): for k=1 there is only one cluster; what base.dunn returns
    # in that case is not visible from here - confirm.
    dunn_index = base.dunn(cluster_list)
    print(dunn_index)
    dunn_index_list.append(dunn_index)

    # Cluster sizes; scipy's entropy() normalises them to probabilities.
    j = pd.Series(labels).value_counts()
    ent = entropy(j)
    entropy_list.append(ent)
    # 0 when all clusters have the same size, approaches 1 as the smallest
    # cluster shrinks relative to the largest.
    balance = (max(j) - min(j)) / max(j)
    balance_list.append(balance)

print(dunn_index_list)

In [None]:
#Creating a new dataframe that has the entropy, balance and dunn_index to calculate
#the indicator value
# Each list becomes one row; transposing gives one row per k and one column
# per measure. `score_list` is empty at this point, so its column is all NaN.
scores = pd.DataFrame([dunn_index_list, entropy_list, balance_list, score_list])
scores = scores.transpose()
# Name the columns the way the indicator-value cell expects them when it
# reads SCORES_LIST.csv, instead of leaving them as the positions 0..3.
scores.columns = ['dunn_index', 'entropy', 'balance', 'score']

# Calculating indicator value

In [None]:
# Load the per-k measures (columns 'dunn_index', 'entropy', 'balance').
df = pd.read_csv('SCORES_LIST.csv')

# sklearn's normalize() rescales every row to unit norm (L2 by default), so
# each of the three measure vectors is scaled independently across all k.
# NOTE(review): if 'entropy' is the raw scipy value of the cluster sizes it is
# not bounded by 1, so 1 - entropy can be negative - confirm this is intended.
df3 = preprocessing.normalize([df['dunn_index'], 1-df['entropy'], df['balance']])

# Rows of df3 come back in the order they were passed in.
df['d_normalised'], df['1 - e_normalised'], df['b_normalised'] = df3

#Final indicator value's list
# NOTE(review): 'balance' enters with a positive weight - verify that larger
# values are meant to raise the indicator.
df['score'] = (
    0.4*df['d_normalised']
    + 0.4*df['1 - e_normalised']
    + 0.2*df['b_normalised']
)

# Drawing the clusters

In [None]:
# Draw every cluster in its own panel: member series in gray, their DTW
# barycenter in red.
cl_count = 5

# NOTE(review): this cell reads `labels` and `df` from earlier cells. `labels`
# must be the labelling to draw and `df` the matching time-series frame (row i
# of `df` belongs to labels[i]) - verify both before running, since `df` is
# reassigned by other cells.
label_array = np.asarray(labels)
cluster_ids = np.unique(label_array)  # sorted, so panels appear in label order

# Size the square grid for whichever is larger, the configured cluster count
# or the number of clusters actually present in `labels`; sizing it from
# cl_count alone runs out of panels when `labels` holds more clusters.
plot_count = math.ceil(math.sqrt(max(cl_count, len(cluster_ids))))
# squeeze=False keeps `axs` two-dimensional even for a 1x1 grid.
fig, axs = plt.subplots(plot_count,plot_count,figsize=(25,25), squeeze=False)
fig.suptitle('Clusters')

for cl_number, label in enumerate(cluster_ids):
    # Fill the grid row by row.
    row_i, column_j = divmod(cl_number, plot_count)
    panel = axs[row_i, column_j]

    cluster = []
    for i in np.flatnonzero(label_array == label):
        member = df.iloc[i]
        panel.plot(member, c="gray",alpha=0.4)
        cluster.append(member)

    # `label` comes from np.unique, so every cluster has at least one member.
    panel.plot(dtw_barycenter_averaging(np.vstack(cluster)),c="red")
    # Title carries the real label so it stays correct if a label is missing.
    panel.set_title("Cluster "+str(label))

#plt.savefig('PIC_NAME.png')
plt.show()

# Example of clustering format

![alt text](Clustering_K9.png "Example of cluster format")

# Script for boxplot

In [None]:
fig = plt.figure(figsize =(10, 7))

# Creating axes instance
ax = fig.add_axes([0, 0, 1, 1])

# Creating plot
# NOTE(review): `cl` is not defined in any visible cell - it has to hold the
# per-cluster value groups to compare before this cell is run.
bp = ax.boxplot(cl)

# Name the boxes only after they exist: set_xticklabels needs fixed tick
# positions, which boxplot() creates (one per box). One label is generated per
# box drawn, so the label count always matches the tick count.
ax.set_xticklabels(['Cluster_%d' % i for i in range(len(bp['medians']))])

# show plot
#plt.savefig('BOX_PLOT.png')
plt.show()

# Example for the boxplot format

![alt text](DD_box_6_k6.png "Example of boxplot format")