In [6]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [7]:
class EventClusterizer:
    """Cluster event specifications by the text of their ``info`` and ``args`` columns.

    The specs table is read from CSV when the object is constructed. Each of
    the two text columns is TF-IDF vectorized and clustered with KMeans; the
    resulting labels are strings such as ``'info_3'`` or ``'args_7'``.

    Parameters
    ----------
    info_cluster_num : int, default 10
        Number of KMeans clusters for the ``info`` column.
    args_cluster_num : int, default 8
        Number of KMeans clusters for the ``args`` column.
    specs_path : str or path-like, default '/code/data/raw/specs.csv'
        Location of the specs CSV. It must provide the ``event_id``, ``info``
        and ``args`` columns that ``process`` reads.
    random_state : int, default 77
        Seed handed to KMeans so that cluster assignments are reproducible.
    """

    def __init__(self, info_cluster_num=10, args_cluster_num=8,
                 specs_path='/code/data/raw/specs.csv', random_state=77):
        self.info_cluster_num = info_cluster_num
        self.args_cluster_num = args_cluster_num
        self.random_state = random_state
        self.specs_df = pd.read_csv(specs_path)

    def process(self):
        """Compute both clusterings and return them indexed by ``event_id``.

        Side effect: ``self.specs_df`` gains the ``info_clusters`` and
        ``args_clusters`` columns and ends up indexed by ``event_id``.
        The method can be called repeatedly (e.g. when a notebook cell is
        re-run); later calls recompute the labels on the already-indexed frame.

        Returns
        -------
        pandas.DataFrame
            Columns ``info_clusters`` and ``args_clusters``, indexed by
            ``event_id``.

        Raises
        ------
        KeyError
            If the specs table has no ``event_id`` column to index on.
        """
        self.specs_df['info_clusters'] = self.vectorize(
            columns='info', cluster_num=self.info_cluster_num)
        self.specs_df['args_clusters'] = self.vectorize(
            columns='args', cluster_num=self.args_cluster_num)
        # After the first call ``event_id`` is the index, not a column, so an
        # unconditional ``set_index('event_id')`` would raise KeyError on any
        # later call. Only re-index when it has not been done yet.
        if self.specs_df.index.name != 'event_id':
            self.specs_df = self.specs_df.set_index('event_id')
        return self.specs_df[['info_clusters', 'args_clusters']]

    def vectorize(self, columns, cluster_num):
        """Cluster one text column of the specs table.

        Parameters
        ----------
        columns : str
            Name of the text column in ``self.specs_df``; also used as the
            prefix of every returned label.
        cluster_num : int
            Number of KMeans clusters.

        Returns
        -------
        list of str
            One label per specs row, formatted ``'<columns>_<cluster index>'``.
        """
        vectorizer = TfidfVectorizer()
        X = vectorizer.fit_transform(self.specs_df[columns].values)
        # The TF-IDF matrix is densified before clustering, as in the original
        # implementation, so cluster assignments stay unchanged.
        clusters = KMeans(
            n_clusters=cluster_num,
            random_state=self.random_state,
        ).fit_predict(X.toarray())
        return [f'{columns}_{i}' for i in clusters]


In [8]:
# Clusterizer with the default cluster counts spelled out (10 for info, 8 for args).
clister = EventClusterizer(info_cluster_num=10, args_cluster_num=8)

In [9]:
# Run both clusterings and display the per-event labels (indexed by event_id).
clister.process()

Unnamed: 0_level_0,info_clusters,args_clusters
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2b9272f4,info_2,args_2
df4fe8b6,info_2,args_2
3babcb9b,info_3,args_2
7f0836bf,info_3,args_2
ab3136ba,info_3,args_2
...,...,...
29f54413,info_6,args_7
06372577,info_6,args_7
2a444e03,info_7,args_7
9e6b7fb5,info_8,args_7


In [10]:
# Load the raw train and test tables from CSV.
train_df = pd.read_csv(filepath_or_buffer='/code/data/raw/train.csv')
test_df = pd.read_csv(filepath_or_buffer='/code/data/raw/test.csv')

In [13]:
def attach_event_clusters(events_df, cluster_labels):
    """Join per-event cluster labels onto an event table.

    Parameters
    ----------
    events_df : pandas.DataFrame
        Table with an ``event_id`` column.
    cluster_labels : pandas.DataFrame
        Cluster label columns indexed by ``event_id``.

    Returns
    -------
    pandas.DataFrame
        Inner join of the two inputs on ``event_id``. Rows whose ``event_id``
        has no entry in ``cluster_labels`` are dropped. ``reset_index()`` then
        moves the merged frame's index into a column and renumbers the rows.

    Raises
    ------
    pandas.errors.MergeError
        If ``cluster_labels`` has duplicate ``event_id`` values, which would
        otherwise silently multiply event rows.
    """
    return pd.merge(
        events_df,
        cluster_labels,
        left_on='event_id',
        right_index=True,
        validate='many_to_one',
    ).reset_index()


event_clusterizer = EventClusterizer(
    info_cluster_num=10, args_cluster_num=20
)
event_cluster = event_clusterizer.process()

# The same join is applied to both splits so their columns stay aligned.
train = attach_event_clusters(train_df, event_cluster)
test = attach_event_clusters(test_df, event_cluster)

In [0]:
# Shape after the merge; compare with train_df.shape to see if the inner join dropped rows.
train.shape

In [0]:
# Shape of the raw training table before cluster labels were merged in.
train_df.shape

In [0]:
# Shape after the merge; compare with test_df.shape to see if the inner join dropped rows.
test.shape

In [0]:
# Shape of the raw test table before cluster labels were merged in.
test_df.shape