In [3]:
import networkx as nx
import pandas as pd
from glob import glob
import re

In [6]:
# Collect every per-event RSVP CSV. glob() returns matches in arbitrary,
# filesystem-dependent order, so sort them: later cells index into this list
# (e.g. filenames[5]) and should pick the same file on every run.
filenames = sorted(glob("./data/events_rsvp_bez_selekcji/*.csv"))

In [23]:
# Preview the first two matched paths to see the filename layout.
filenames[:2]

['./data/events_rsvp_bez_selekcji/1003506093022823_WystawaKrzymskywKATO.csv',
 './data/events_rsvp_bez_selekcji/1009535365747237_0509PAULINAPRZYBYSZMINIMALKONTENERYKULTURY.csv']

In [19]:
# Number of CSV files matched by the glob.
len(filenames)

379

In [16]:
# Load a single file (arbitrary pick: list position 5) to inspect its columns.
df_test = pd.read_csv(filenames[5])
df_test.head()

Unnamed: 0,name,id,rsvp_status
0,Michał Spandel,139423733084723,attending
1,Konrad Kobylinski,131965087159246,attending
2,Megii Spr,126812671002976,attending
3,Paulina Żebrowska,1428057144189383,attending
4,Sebastian Dziedzic,1496948513953791,attending


In [17]:
# Check column dtypes and null counts of the sample file.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 330 entries, 0 to 329
Data columns (total 3 columns):
name           330 non-null object
id             330 non-null int64
rsvp_status    330 non-null object
dtypes: int64(1), object(2)
memory usage: 7.8+ KB


In [24]:
# Capture the leading numeric id from a "<digits>_<rest>.csv" file name.
# The match is anchored to the final path component (`[^/\\]+` cannot cross a
# separator and `$` pins the extension), so a parent directory such as
# "2015_data/" is never mistaken for the id, and both "/" and "\" separators
# are accepted.
pattern = re.compile(r"[/\\](\d+)_[^/\\]+\.csv$")

In [28]:
# Sanity-check the regex against one real path; expect a single captured id.
pattern.findall(filenames[1])

['1009535365747237']

In [31]:
# Capture the leading numeric id from a "<digits>_<rest>.csv" file name.
# Anchored to the final path component so digits in a parent directory are
# never picked up, and tolerant of both "/" and "\" separators.
pattern = re.compile(r"[/\\](\d+)_[^/\\]+\.csv$")

dfs = []
for filename in filenames:
    # Validate the file name first so unparseable names are reported and
    # skipped without paying for a CSV parse.
    search_result = pattern.findall(filename)
    if len(search_result) != 1:
        print(filename)
        print(search_result)
        continue
    df_part = pd.read_csv(filename)
    # Tag every row with the event id taken from the file name (kept as a
    # string, exactly as captured by the regex).
    df_part["event_id"] = search_result[0]
    dfs.append(df_part)

# ignore_index=True gives the combined frame one unique RangeIndex instead of
# repeating each file's own 0..n-1 labels.
df = pd.concat(dfs, ignore_index=True)

In [32]:
# Check size, dtypes and null counts of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 63610 entries, 0 to 0
Data columns (total 4 columns):
name           63610 non-null object
id             63610 non-null int64
rsvp_status    63610 non-null object
event_id       63610 non-null object
dtypes: int64(1), object(3)
memory usage: 2.4+ MB


In [33]:
# Tally how many rows fall into each RSVP category.
df["rsvp_status"].value_counts()

attending    37454
unsure       20018
declined      6138
Name: rsvp_status, dtype: int64

In [34]:
# Keep every row whose RSVP is anything other than "declined".
not_declined = df["rsvp_status"] != "declined"
df_mayattend = df[not_declined]

In [35]:
# Number of distinct user ids among the non-declined RSVPs.
df_mayattend["id"].unique().size

33124

In [37]:
# Self-join on user id: one row per (event_x, event_y) combination that shares
# a user, including each event paired with itself and both orderings.
pairs = pd.merge(df_mayattend, df_mayattend, on="id")

In [39]:
# Size of the self-joined frame (rows, columns).
pairs.shape

(309470, 7)

In [40]:
# Inspect the first joined rows; the _x/_y suffixes come from the self-merge.
pairs.head()

Unnamed: 0,name_x,id,rsvp_status_x,event_id_x,name_y,rsvp_status_y,event_id_y
0,Anna Dudzińska,1490062834615246,attending,1003506093022823,Anna Dudzińska,attending,1003506093022823
1,Anna Dudzińska,1490062834615246,attending,1003506093022823,Anna Dudzińska,unsure,1563282527304477
2,Anna Dudzińska,1490062834615246,attending,1003506093022823,Anna Dudzińska,unsure,1589039124712548
3,Anna Dudzińska,1490062834615246,attending,1003506093022823,Anna Dudzińska,attending,1601631150112875
4,Anna Dudzińska,1490062834615246,attending,1003506093022823,Anna Dudzińska,attending,978102145535062


In [42]:
# Keep each unordered event pair exactly once (and drop self-pairs) by
# requiring event_id_x < event_id_y, then count rows per pair.
# NOTE: event_id holds strings here, so "<" is a lexicographic comparison; it
# is only used as a consistent tie-breaker, not as a numeric ordering.
ordered_pairs = pairs[pairs["event_id_x"] < pairs["event_id_y"]]
pair_counts = ordered_pairs.groupby(["event_id_x", "event_id_y"]).size()

In [43]:
# Preview the counts; the result is a Series indexed by (event_id_x, event_id_y).
pair_counts.head()

event_id_x        event_id_y      
1003506093022823  1009535365747237     5
                  1011421725600905     5
                  1020724457947795    13
                  1032546810172594     1
                  1037899542905963    36
dtype: int64

In [45]:
# Same counts flattened into columns; the unnamed count column shows up as 0.
pair_counts.reset_index().head()

Unnamed: 0,event_id_x,event_id_y,0
0,1003506093022823,1009535365747237,5
1,1003506093022823,1011421725600905,5
2,1003506093022823,1020724457947795,13
3,1003506093022823,1032546810172594,1
4,1003506093022823,1037899542905963,36


In [63]:
# Load the event metadata table.
events = pd.read_csv("data/events.csv")

In [64]:
# Check the available metadata columns, their dtypes and null counts.
events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6300 entries, 0 to 6299
Data columns (total 16 columns):
id                 6300 non-null int64
facebook_id        6300 non-null int64
facebook_page      6300 non-null int64
picture            6300 non-null object
description        6046 non-null object
comments           6300 non-null object
attending_count    6300 non-null int64
end_time           6300 non-null object
start_time         6300 non-null object
place              6300 non-null object
maybe_count        6300 non-null int64
noreply_count      6300 non-null int64
declined_count     6300 non-null int64
name               6300 non-null object
feed               6300 non-null object
created_at         6300 non-null object
dtypes: int64(7), object(9)
memory usage: 787.6+ KB


In [68]:
# event_id holds the ids as strings parsed from file names. Convert with an
# explicit 64-bit dtype: plain `int` maps to the platform's default NumPy
# integer, which can be 32-bit on some setups and cannot hold 16-digit ids.
attended_event_ids = set(df_mayattend.event_id.astype("int64").unique())

In [69]:
# Every Facebook event id present in the metadata table.
all_event_ids = set(events["facebook_id"])

In [70]:
# Number of distinct events that have at least one non-declined RSVP row.
len(attended_event_ids)

378

In [71]:
# Number of distinct event ids in the metadata table.
len(all_event_ids)

6300

In [72]:
# How many RSVP event ids also appear in the metadata table; equal to
# len(attended_event_ids) when every RSVP event has metadata.
len(attended_event_ids & all_event_ids)

378

In [73]:
# Ids of events with at least one non-declined RSVP, in first-seen order.
# Use an explicit 64-bit dtype: plain `int` maps to the platform's default
# NumPy integer, which can be 32-bit on some setups and cannot hold these ids.
attended_event_ids_array = df_mayattend.event_id.astype("int64").unique()

# Look up the metadata of exactly those events, keeping only the columns that
# are exported as node attributes later.
events_that_matter = (
    events.set_index("facebook_id")
    .loc[attended_event_ids_array, ["name", "attending_count", "maybe_count", "start_time"]]
)

In [74]:
# Preview the selected event metadata (indexed by facebook_id).
events_that_matter.head()

Unnamed: 0_level_0,name,attending_count,maybe_count,start_time
facebook_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1003506093022823,Wystawa- Krzymsky w KATO,78,4,2015-07-28 16:00:00
1009535365747237,05/09- PAULINA PRZYBYSZ MINIMAL @KONTENERY KUL...,337,44,2015-09-05 18:30:00
1011421725600905,Natalia Bażowska oprowadza po wystawie „Sztuka...,16,58,2016-06-08 16:00:00
1015810938479436,"Uroczysta premiera ""ŚPIEWAJĄCEGO OBRUSIKA"" z u...",24,42,2016-02-05 19:15:00
1016481675037256,"WOJCIECH KUCHARCZYK / ""DUŻE ZDJĘCIA ŁADNYCH RO...",39,15,2012-05-01 00:00:00


In [81]:
# Build an undirected graph with one node per event and one edge per event
# pair that shares at least one user; the edge attribute `count` is the number
# of shared users.
G = nx.Graph()

for eid, row in events_that_matter.iterrows():
    # Cast ids and counts to built-in int so the export sees no numpy values.
    G.add_node(int(eid), name=row["name"],
                         attending_count=int(row["attending_count"]),
                         maybe_count=int(row["maybe_count"]),
                         start_time=row["start_time"])

# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for (eid1, eid2), count in pair_counts.items():
    # no numpy values for export
    G.add_edge(int(eid1), int(eid2), count=int(count))


nx.write_graphml(G, "./data/pairs.graphml")

In [80]:
# Debug peek: print the index label of the first row, if there is one.
row_iter = events_that_matter.iterrows()
first = next(row_iter, None)
if first is not None:
    eid, row = first
    print(eid)

1003506093022823


In [77]:
# Preview the selected event metadata (indexed by facebook_id).
events_that_matter.head()

Unnamed: 0_level_0,name,attending_count,maybe_count,start_time
facebook_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1003506093022823,Wystawa- Krzymsky w KATO,78,4,2015-07-28 16:00:00
1009535365747237,05/09- PAULINA PRZYBYSZ MINIMAL @KONTENERY KUL...,337,44,2015-09-05 18:30:00
1011421725600905,Natalia Bażowska oprowadza po wystawie „Sztuka...,16,58,2016-06-08 16:00:00
1015810938479436,"Uroczysta premiera ""ŚPIEWAJĄCEGO OBRUSIKA"" z u...",24,42,2016-02-05 19:15:00
1016481675037256,"WOJCIECH KUCHARCZYK / ""DUŻE ZDJĘCIA ŁADNYCH RO...",39,15,2012-05-01 00:00:00
