In [29]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [6]:
specs = pd.read_csv('/code/data/raw/specs.csv')

In [9]:
specs.head()

Unnamed: 0,event_id,info,args
0,2b9272f4,The end of system-initiated feedback (Correct)...,"[{""name"":""game_time"",""type"":""int"",""info"":""mill..."
1,df4fe8b6,The end of system-initiated feedback (Incorrec...,"[{""name"":""game_time"",""type"":""int"",""info"":""mill..."
2,3babcb9b,The end of system-initiated instruction event ...,"[{""name"":""game_time"",""type"":""int"",""info"":""mill..."
3,7f0836bf,The end of system-initiated instruction event ...,"[{""name"":""game_time"",""type"":""int"",""info"":""mill..."
4,ab3136ba,The end of system-initiated instruction event ...,"[{""name"":""game_time"",""type"":""int"",""info"":""mill..."


In [15]:
len(specs)

386

In [10]:
vectorizer = TfidfVectorizer()

In [12]:
X = vectorizer.fit_transform(specs['info'].values)

In [14]:
X.shape

(386, 443)

In [22]:
vectorizer.get_feature_names()[0:10]

['3010',
 '3020',
 '3021',
 'about',
 'acceptable',
 'accuracy',
 'action',
 'activity',
 'actually',
 'after']

In [28]:
X.toarray()

array([[0.        , 0.        , 0.        , ..., 0.13042985, 0.20638076,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.11247416,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.19987916, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [40]:
clusters = KMeans(n_clusters=20, random_state=0).fit_predict(X.toarray())

In [62]:
clusters = ['event_' + str(i) for i in clusters]

In [63]:
specs['cluster'] = clusters

In [67]:
cluster_df = specs.set_index('event_id')['cluster']

In [None]:
cluster_df

In [65]:
test = pd.read_csv('/code/data/raw/test.csv')

In [66]:
test.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,0ea9ecc81a565215,2019-09-10T16:50:24.910Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,c1ea43d8b8261d27,2019-09-10T16:50:55.503Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,27253bdc,7ed86c6b72e725e2,2019-09-10T16:51:51.805Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 2,Clip,MAGMAPEAK
3,27253bdc,7e516ace50e7fe67,2019-09-10T16:53:12.825Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Crystal Caves - Level 1,Clip,CRYSTALCAVES
4,7d093bf9,a022c3f60ba547e7,2019-09-10T16:54:12.115Z,"{""version"":""1.0"",""round"":0,""event_count"":1,""ga...",00abaee7,1,2000,0,Chow Time,Game,CRYSTALCAVES


In [80]:
new_test = pd.merge(test, cluster_df,  left_on='event_id', right_index=True, sort=False, )

In [87]:
new_test.drop('event_id', axis=1, inplace=True)

In [88]:
new_test.head()

Unnamed: 0,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world,cluster
0,0ea9ecc81a565215,2019-09-10T16:50:24.910Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE,event_12
1,c1ea43d8b8261d27,2019-09-10T16:50:55.503Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK,event_12
2,7ed86c6b72e725e2,2019-09-10T16:51:51.805Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 2,Clip,MAGMAPEAK,event_12
3,7e516ace50e7fe67,2019-09-10T16:53:12.825Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Crystal Caves - Level 1,Clip,CRYSTALCAVES,event_12
163,1c203986674d7d9b,2019-09-10T16:56:39.003Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Balancing Act,Clip,CRYSTALCAVES,event_12


In [91]:
new_test = new_test.rename(columns={'cluster': 'event_id'})

In [92]:
new_test.head()

Unnamed: 0,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world,event_id
0,0ea9ecc81a565215,2019-09-10T16:50:24.910Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE,event_12
1,c1ea43d8b8261d27,2019-09-10T16:50:55.503Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK,event_12
2,7ed86c6b72e725e2,2019-09-10T16:51:51.805Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 2,Clip,MAGMAPEAK,event_12
3,7e516ace50e7fe67,2019-09-10T16:53:12.825Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Crystal Caves - Level 1,Clip,CRYSTALCAVES,event_12
163,1c203986674d7d9b,2019-09-10T16:56:39.003Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Balancing Act,Clip,CRYSTALCAVES,event_12


event_id
2b9272f4    event_15
df4fe8b6     event_5
3babcb9b     event_0
7f0836bf     event_0
ab3136ba     event_0
              ...   
29f54413    event_16
06372577    event_16
2a444e03    event_12
9e6b7fb5    event_14
d3640339     event_3
Name: cluster, Length: 386, dtype: object