In [1]:
import networkx as nx
import pandas as pd
from glob import glob
import re

In [2]:
filenames  = glob("./data/events_rsvp_bez_selekcji/*.csv")

In [3]:
filenames[:2]

['./data/events_rsvp_bez_selekcji/1003506093022823_WystawaKrzymskywKATO.csv',
 './data/events_rsvp_bez_selekcji/1009535365747237_0509PAULINAPRZYBYSZMINIMALKONTENERYKULTURY.csv']

In [4]:
len(filenames)

379

In [5]:
df_test = pd.read_csv(filenames[5])
df_test.head()

Unnamed: 0,name,id,rsvp_status
0,Michał Spandel,139423733084723,attending
1,Konrad Kobylinski,131965087159246,attending
2,Megii Spr,126812671002976,attending
3,Paulina Żebrowska,1428057144189383,attending
4,Sebastian Dziedzic,1496948513953791,attending


In [6]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 330 entries, 0 to 329
Data columns (total 3 columns):
name           330 non-null object
id             330 non-null int64
rsvp_status    330 non-null object
dtypes: int64(1), object(2)
memory usage: 7.8+ KB


In [7]:
pattern = re.compile(r"/(\d+)_.+\.csv")

In [8]:
pattern.findall(filenames[1])

['1009535365747237']

In [9]:
pattern = re.compile(r"/(\d+)_.+\.csv")

dfs = []
for filename in filenames:
    df_part = pd.read_csv(filename)
    search_result = pattern.findall(filename)
    if len(search_result) != 1:
        print(filename)
        print(search_result)
        continue
    df_part["event_id"] = pattern.findall(filename)[0]
    dfs.append(df_part)

df = pd.concat(dfs)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 63610 entries, 0 to 0
Data columns (total 4 columns):
name           63610 non-null object
id             63610 non-null int64
rsvp_status    63610 non-null object
event_id       63610 non-null object
dtypes: int64(1), object(3)
memory usage: 2.4+ MB


In [11]:
df.rsvp_status.value_counts()

attending    37454
unsure       20018
declined      6138
Name: rsvp_status, dtype: int64

In [14]:
df.to_csv("./data/rsvp.csv", index=False)

In [32]:
df_mayattend = df[df.rsvp_status != "declined"]

In [None]:
len(df_mayattend.id.unique())

In [34]:
at_least_twice = df_mayattend.id.value_counts()
at_least_twice = at_least_twice[at_least_twice > 1]
df_mayattend = df_mayattend[df_mayattend.id.isin(at_least_twice.index)]

8219

In [53]:
users_no = len(df_mayattend.id.unique())
users_no

8219

In [57]:
# df_mayattend.event_id.value_counts().value_counts()

In [35]:
pairs = df_mayattend.merge(df_mayattend, on="id")

In [36]:
pairs.shape

(284565, 7)

In [37]:
pairs.head()

Unnamed: 0,name_x,id,rsvp_status_x,event_id_x,name_y,rsvp_status_y,event_id_y
0,Anna Dudzińska,1490062834615246,attending,1003506093022823,Anna Dudzińska,attending,1003506093022823
1,Anna Dudzińska,1490062834615246,attending,1003506093022823,Anna Dudzińska,unsure,1563282527304477
2,Anna Dudzińska,1490062834615246,attending,1003506093022823,Anna Dudzińska,unsure,1589039124712548
3,Anna Dudzińska,1490062834615246,attending,1003506093022823,Anna Dudzińska,attending,1601631150112875
4,Anna Dudzińska,1490062834615246,attending,1003506093022823,Anna Dudzińska,attending,978102145535062


In [38]:
pair_counts = pairs.query("event_id_x < event_id_y").groupby(["event_id_x", "event_id_y"]).size()

In [39]:
pair_counts.head()

event_id_x        event_id_y      
1003506093022823  1009535365747237     5
                  1011421725600905     5
                  1020724457947795    13
                  1032546810172594     1
                  1037899542905963    36
dtype: int64

In [40]:
pair_counts.reset_index().head()

Unnamed: 0,event_id_x,event_id_y,0
0,1003506093022823,1009535365747237,5
1,1003506093022823,1011421725600905,5
2,1003506093022823,1020724457947795,13
3,1003506093022823,1032546810172594,1
4,1003506093022823,1037899542905963,36


In [41]:
events = pd.read_csv("data/events.csv")

In [42]:
events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6300 entries, 0 to 6299
Data columns (total 16 columns):
id                 6300 non-null int64
facebook_id        6300 non-null int64
facebook_page      6300 non-null int64
picture            6300 non-null object
description        6046 non-null object
comments           6300 non-null object
attending_count    6300 non-null int64
end_time           6300 non-null object
start_time         6300 non-null object
place              6300 non-null object
maybe_count        6300 non-null int64
noreply_count      6300 non-null int64
declined_count     6300 non-null int64
name               6300 non-null object
feed               6300 non-null object
created_at         6300 non-null object
dtypes: int64(7), object(9)
memory usage: 787.6+ KB


In [43]:
attended_event_ids = set(df_mayattend.event_id.astype(int).unique())

In [44]:
all_event_ids = set(events.facebook_id)

In [45]:
len(attended_event_ids)

362

In [46]:
len(all_event_ids)

6300

In [47]:
len(attended_event_ids.intersection(all_event_ids))

362

In [64]:
counts_we_track = df_mayattend.event_id.astype(int).value_counts()

In [65]:
attended_event_ids_array = df_mayattend.event_id.astype(int).unique()

events_that_matter = events.set_index("facebook_id") \
  .loc[attended_event_ids_array, ["name", "attending_count", "maybe_count", "start_time"]]

In [66]:
events_that_matter.head()

Unnamed: 0_level_0,name,attending_count,maybe_count,start_time
facebook_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1003506093022823,Wystawa- Krzymsky w KATO,78,4,2015-07-28 16:00:00
1009535365747237,05/09- PAULINA PRZYBYSZ MINIMAL @KONTENERY KUL...,337,44,2015-09-05 18:30:00
1011421725600905,Natalia Bażowska oprowadza po wystawie „Sztuka...,16,58,2016-06-08 16:00:00
1015810938479436,"Uroczysta premiera ""ŚPIEWAJĄCEGO OBRUSIKA"" z u...",24,42,2016-02-05 19:15:00
1016481675037256,"WOJCIECH KUCHARCZYK / ""DUŻE ZDJĘCIA ŁADNYCH RO...",39,15,2012-05-01 00:00:00


In [67]:
events_that_matter["count"] = events_that_matter["attending_count"] + events_that_matter["maybe_count"]

In [68]:
import numpy as np

In [70]:
G = nx.Graph()

for eid, row in events_that_matter.iterrows():
    G.add_node(int(eid), name=row["name"],
                         attending_count_fb=int(row["attending_count"]),
                         maybe_count_fb=int(row["maybe_count"]),
                         count_fb=int(row["count"]),
                         count_we_track = int(counts_we_track.loc[int(eid)]),
                         start_time=row["start_time"])

for (eid1, eid2), count in pair_counts.iteritems():
    # no numpy values for export
    # oe = float(count * users_no / (events_that_matter.loc[int(eid1), "count"] * events_that_matter.loc[int(eid2), "count"]))
    oe = float(count * users_no / (counts_we_track[int(eid1)] * counts_we_track[int(eid2)]))
    pmi = float(np.log10(oe))
    if pmi > 0:
        G.add_edge(int(eid1), int(eid2), count=int(count), oe=oe, pmi=pmi, weight=pmi)


# nx.write_graphml(G, "./data/pairs_v1.graphml")
nx.write_graphml(G, "./data/pairs_v2.graphml")

In [80]:
for eid, row in events_that_matter.iterrows():
    print(eid)
    break

1003506093022823


In [84]:
events_that_matter.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 378 entries, 1003506093022823 to 999062826853462
Data columns (total 4 columns):
name               378 non-null object
attending_count    378 non-null int64
maybe_count        378 non-null int64
start_time         378 non-null object
dtypes: int64(2), object(2)
memory usage: 14.8+ KB
