In [52]:
import pandas as pd
import json
import numpy as np
import sklearn
import scipy
import sys
import matplotlib.pyplot as plt
from pylab import *
from sklearn.metrics.pairwise import cosine_similarity

# Let pandas display up to 500 columns before truncating wide DataFrames.
pd.set_option('display.max_columns', 500)

In [34]:
def log_progress(sequence, every=None, size=None):
    """Yield the items of ``sequence`` while updating a notebook progress widget.

    Parameters
    ----------
    sequence : iterable
        Items to iterate over. If it has no ``len()`` and ``size`` is not
        given, it is treated as an iterator of unknown length.
    every : int, optional
        Refresh the widget on the first item and then every ``every`` items.
        Defaults to 1 for sizes up to 200, otherwise ``size // 200``
        (about every 0.5%). Must be given when the length is unknown.
    size : int, optional
        Total number of items; defaults to ``len(sequence)`` when available.

    Yields
    ------
    The items of ``sequence``, unchanged and in order.

    Raises
    ------
    AssertionError
        If the length is unknown and ``every`` was not provided.
    Any exception raised while iterating is re-raised after the bar is
    switched to the 'danger' style.
    """
    # Imported lazily so the widget libraries are only needed when the
    # helper is actually used.
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                # Integer division: with true division `every` becomes a float
                # on Python 3 and `index % every == 0` fires irregularly.
                every = size // 200     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        # Unknown length: show a full bar in 'info' style and only count items.
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{index} / ?'.format(index=index)
                else:
                    progress.value = index
                    label.value = u'{index} / {size}'.format(
                        index=index,
                        size=size
                    )
            yield record
    except BaseException:
        # Mark the bar as failed for any interruption, then propagate it.
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = str(index or '?')

In [2]:
# Load the venues table (one row per place) from the CSV in the working directory.
df_places = pd.read_csv("venues_pid.csv")

In [11]:
# Preview the first rows of the venues table.
df_places.head()

Unnamed: 0,title,label,color,labeled_places_22_02_pid,place_id
0,Ул летчика пилютова 50,20,#66d743,7229933,7229933
1,"Парк ""Сосновая поляна""",20,#66d743,11659767,11659767
2,Рококо,20,#66d743,5913,5913
3,Ветеранов дом 149,20,#66d743,8102072,8102072
4,352 школа,20,#66d743,15959331,15959331


In [5]:
# Load the check-ins table; the file is decoded as Windows-1251.
df_checkins = pd.read_csv('all_checkins.csv', encoding='windows-1251')


In [10]:
# Convert the index labels to strings and rename the place-id column.
# NOTE(review): the head() output recorded in this notebook lists the columns
# `labeled_places_22_02_pid` and an already existing `place_id`, not
# `labeled_places_22_02_place_id`. If that is the real schema, this column
# mapping matches nothing and is silently ignored — confirm against the CSV.
df_places = df_places.rename(index=str, columns={"labeled_places_22_02_place_id": "place_id"})

In [7]:
# Preview the first rows of the check-ins table.
df_checkins.head()

Unnamed: 0.1,Unnamed: 0,date,id,latitude,longitude,place_id,post_id,text,uid
0,0,1521713000.0,7221770_15476,59.900162,30.401269,5960695,15476.0,,7221770.0
1,1,1514320000.0,27628212_952,59.900162,30.401269,5960695,952.0,"Ах елки, елки, елочки......???",27628212.0
2,2,1513158000.0,10530097_10655,59.900162,30.401269,5960695,10655.0,"Первый раз за месяц вышло яркое солнце,и все п...",10530097.0
3,3,1482679000.0,43545311_2993,59.900162,30.401269,5960695,2993.0,Недавольная я...потратила кучу денег к НГ),43545311.0
4,4,1477990000.0,5116443_7790,59.900162,30.401269,5960695,7790.0,Работаем???,5116443.0


In [12]:
# Attach venue attributes (including `label`) to each check-in; the inner
# join keeps only check-ins whose place_id is present in df_places.
df_checkins_label = df_checkins.merge(df_places, how='inner', on='place_id')

In [20]:
# Number of non-null post_id values per (label, uid) pair, returned as a
# Series indexed by a (label, uid) MultiIndex.
df_checkins_group_by_label = (
    df_checkins_label
    .groupby(['label', 'uid'])['post_id']
    .count()
)

In [54]:
# Map every distinct uid in the check-ins to a consecutive 0-based index,
# in order of first appearance.
local_uid = {}

for uid in df_checkins['uid'].drop_duplicates().tolist():
    # The next free index is the number of uids registered so far.
    local_uid[uid] = len(local_uid)

# Mirrors the running counter: total number of indices handed out.
local_id = len(local_uid)

In [105]:
# Group the labelled check-ins by venue label.
# NOTE(review): the error output recorded under this cell does not look like
# something this statement would raise; it is presumably stale output from an
# earlier version of the cell. `df_label` is not used by any later cell shown
# in this notebook — confirm whether the cell is still needed.
df_label = df_checkins_label.groupby('label')

KeyError: 'Column not found: False'

In [59]:
# Build a sparse label x user matrix of check-in counts.
# Rows are indexed by venue label (0 .. max label); columns by the 0-based
# index stored in local_uid; each value is the count from
# df_checkins_group_by_label for that (label, uid) pair.
# np.int16 is referenced explicitly rather than via the `from pylab import *`
# namespace.
bag_of_users = scipy.sparse.lil_matrix(
    (df_places.label.max() + 1, len(local_uid)),
    dtype=np.int16,
)

# Series.items() yields ((label, uid), count) pairs for the MultiIndex series;
# it replaces Series.iteritems(), which was removed in pandas 2.0.
# The generator has no len(), so log_progress needs an explicit `every`.
for (label, uid), n_posts in log_progress(df_checkins_group_by_label.items(), 100):
    local_id = local_uid[uid]
    bag_of_users[label, local_id] = n_posts

In [94]:
# Inspect the per-user counts for label 0 (first level of the MultiIndex);
# the result is indexed by uid.
df_checkins_group_by_label[0]

uid
60533.0        1
61203.0        2
84299.0        2
85398.0        2
89444.0        2
142949.0       3
213739.0       3
217288.0       2
289196.0       2
753388.0       1
891507.0       4
953687.0       1
1281061.0      2
1316853.0      1
1346381.0      3
1768480.0      2
2404492.0      1
2518903.0      1
2583196.0      1
2600707.0      2
3052039.0      1
3493107.0      1
4670083.0      2
5093998.0      1
5984234.0      3
6271460.0      1
7355310.0      1
8532522.0      1
11516023.0     1
12080115.0     2
14842609.0     2
15116677.0     1
19277976.0     1
22341569.0     2
24935284.0     1
42752769.0     3
49569322.0     1
52779276.0     1
53182568.0     1
68765597.0     1
72099011.0     1
89147687.0     1
91373504.0     1
122338958.0    1
139307171.0    1
144215710.0    3
151929207.0    1
163046980.0    1
218005412.0    2
265032367.0    1
398885244.0    1
449572060.0    1
Name: post_id, dtype: int64

In [63]:
# Spot check: cosine similarity between the user-count rows of labels 4 and 1
# (returned as a 1x1 array).
cosine_similarity(bag_of_users[4],bag_of_users[1])

array([[ 0.81226687]])

In [84]:
# Pairwise cosine similarity between all label rows, computed in one
# vectorised call instead of one cosine_similarity call per (j, k) pair.
# Entry [j, k] of the result is the similarity between labels j and k.
similarity_matrix = cosine_similarity(bag_of_users.tocsr())

# label -> list of similarities to every label (plain Python floats, so the
# dictionary can be serialised with json).
cosine_similarity_dict = {
    label: row.tolist() for label, row in enumerate(similarity_matrix)
}

[0.99999999999999989,
 0.8122668690159931,
 0.0,
 0.53476583575360637,
 0.99999999999999989,
 0.0014168214973118494,
 0.0,
 0.0,
 0.99999999999999989,
 0.0021950660214931248,
 0.072758204026122847,
 0.15864527468697756,
 0.57384270966315398,
 0.0013961296876320022,
 0.0,
 0.0021771882667400017,
 0.0,
 0.00080783983558514445,
 0.0088133952592079241,
 0.0,
 0.0,
 0.0091209929653442521,
 0.084449221201817484,
 0.00028067221287936169,
 0.99999999999999989,
 0.0047609871997293827,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0088133952592079241,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0039045238265924273,
 0.0088133952592079241,
 0.0,
 0.0088133952592079241,
 0.001128073212910448,
 0.0,
 0.0,
 0.047298433430602721,
 0.0028782040722587238,
 0.0,
 0.0,
 0.0088133952592079259,
 0.0088133952592079241,
 0.00046423944601158641,
 0.0,
 0.0,
 0.014146358057624127,
 0.0,
 0.01615341810776005,
 0.0020396825407745845,
 0.0071052879673992216,
 0.001388331535706796,
 0.7456314952219788,
 0.0,
 0.00881339525920

In [86]:
# Spot check: cosine similarity between the user-count rows of labels 0 and 4.
cosine_similarity(bag_of_users[0], bag_of_users[4])

array([[ 1.]])

In [88]:
# Row/column indices of the non-zero entries in the label-0 row, i.e. which
# local user indices have a stored count for label 0.
bag_of_users[0].nonzero()

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0], dtype=int32),
 array([ 1970,  2395,  2574,  2825,  4044,  4204,  5806,  7141,  7388,
         7835,  7936,  9047,  9502,  9535,  9999, 10000, 10001, 10002,
        10003, 10004, 10005, 10006, 10007, 10008, 10009, 10010, 10011,
        10012, 10013, 10014, 10015, 10016, 10017, 10018, 10019, 10020,
        10021, 10022, 10023, 10024, 10025, 10026, 10027, 10028, 10029,
        10030, 10031, 10032, 10033, 10034, 10035, 10036], dtype=int32))

In [89]:
# Same inspection for the label-4 row, for comparison with label 0.
bag_of_users[4].nonzero()

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0], dtype=int32),
 array([ 1970,  2395,  2574,  2825,  4044,  4204,  5806,  7141,  7388,
         7835,  7936,  9047,  9502,  9535,  9999, 10000, 10001, 10002,
        10003, 10004, 10005, 10006, 10007, 10008, 10009, 10010, 10011,
        10012, 10013, 10014, 10015, 10016, 10017, 10018, 10019, 10020,
        10021, 10022, 10023, 10024, 10025, 10026, 10027, 10028, 10029,
        10030, 10031, 10032, 10033, 10034, 10035, 10036], dtype=int32))

In [110]:
# Persist the per-label similarity lists as JSON.
# Note: JSON object keys are always strings, so the integer keys of
# cosine_similarity_dict are written as "0", "1", ... in the file.
with open('clasters_sim.json', 'w') as out_file:
    json.dump(cosine_similarity_dict, out_file)