In [1]:
from pyarrow import feather
import common.constants as c

In [31]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the edge list from the Feather file named by the project constants.
# The rendered output shows the columns Source, Target, Type and Weight.
friends_path = f"../{c.FRIENDS_FTR}"
edges_df = feather.read_feather(friends_path)
edges_df

Unnamed: 0,Source,Target,Type,Weight
0,952492045813407744,91136866,Directed,1
1,952492045813407744,2783916882,Directed,1
2,952492045813407744,1348725896757338117,Directed,1
3,952492045813407744,601434817,Directed,1
4,952492045813407744,1051674991090110464,Directed,1
...,...,...,...,...
80692236,750896795954376705,303553328,Directed,1
80692237,750896795954376705,523366101,Directed,1
80692238,750896795954376705,847901551,Directed,1
80692239,750896795954376705,631630658,Directed,1


In [3]:
# Swap the edge direction labels: the "Source" column becomes "target" and the
# "Target" column becomes "source".
# NOTE(review): presumably the file stores follower -> followed while the
# analysis wants tweet author -> exposed account; confirm.
# The rename is done in place on the ~80M-row frame.
edges_df.rename(columns={"Source": "target", "Target": "source"}, inplace=True)

In [4]:
# Re-display the frame to check the column labels after the rename.
edges_df

Unnamed: 0,target,source,Type,Weight
0,952492045813407744,91136866,Directed,1
1,952492045813407744,2783916882,Directed,1
2,952492045813407744,1348725896757338117,Directed,1
3,952492045813407744,601434817,Directed,1
4,952492045813407744,1051674991090110464,Directed,1
...,...,...,...,...
80692236,750896795954376705,303553328,Directed,1
80692237,750896795954376705,523366101,Directed,1
80692238,750896795954376705,847901551,Directed,1
80692239,750896795954376705,631630658,Directed,1


In [5]:
# Move "source" from a column to the index.
# NOTE(review): the name is rebound to the result, so running this cell a second
# time fails because "source" is no longer a column.
edges_df = edges_df.set_index(keys="source")

In [6]:
# Drop the "Type" and "Weight" columns; only "target" remains alongside the
# "source" index.
edges_df = edges_df.drop(columns=["Type", "Weight"])

In [7]:
# Collapse the edge list to one row per source id, whose "target" cell holds
# the list of all targets attached to that source.
edges_by_src_df = edges_df.groupby("source").agg({"target": list})

edges_by_src_df

Unnamed: 0_level_0,target
source,Unnamed: 1_level_1
-1,"[325855007, 32458767, 3122121, 33128150, 10584..."
1000000043833810945,[1039344533857820672]
1000000083604246529,"[852333673743622144, 28599189]"
100000028,[19362341]
100000031,[111312524]
...,...
999999690,[54395910]
999999699838005251,"[145344704, 58569413, 714949505720823808]"
999999808483086341,[444692817]
999999909859389440,[1695019496]


In [8]:
# orient='index' yields one entry per index label, each mapping column name to
# value, i.e. {source_id: {'target': [...]}}.
edges_dict = edges_by_src_df.to_dict(orient='index')

In [9]:
# Spot-check one entry. Note the key is a string, not an int.
edges_dict["999999690"]['target']

['54395910']

In [19]:
# Single pass over the tweets file, building two maps:
#   exposures: target id -> number of rows authored by one of its "source"
#              accounts that were read while the target had no entry in `adopted`
#   adopted:   author id -> timestamp field of that author's first row in the file
# NOTE(review): "first row" equals "earliest tweet" only if the file is sorted by
# time, and a header row (if the CSV has one) is processed like any other row —
# confirm both against the file.
exposures = {}
adopted = {}

with open(f"../{c.TWTS_CSV}") as twts:

    # Iterating the file object streams it one line at a time; enumerate
    # supplies the 1-based row counter used for progress reporting.
    for i, tw_row in enumerate(twts, start=1):

        # Only fields 1 and 2 are used; maxsplit=4 leaves everything after the
        # fourth comma unsplit in the last element.
        tweet = tw_row.split(',', 4)

        # Rows with fewer than three fields (e.g. blank lines) are skipped
        # instead of raising IndexError.
        if len(tweet) >= 3:
            author_id = tweet[1]
            timestamp = tweet[2]
            author_info = edges_dict.get(author_id)

            if author_info:
                # Count this row as one exposure for every target of the author
                # that has not been recorded as adopted so far.
                for target in author_info['target']:
                    if target not in adopted:
                        exposures[target] = exposures.get(target, 0) + 1

            # Record only the first timestamp seen for each author. This runs
            # after the exposure update on purpose: the author's own row must
            # not change who counts as "already adopted" for that same row.
            if author_id not in adopted:
                adopted[author_id] = timestamp

        if i % 1000000 == 0:
            print(f"i={i} processed.")

    print("END OF FILE!")
    print(len(exposures))
    print(len(adopted))
        

i=1000000 processed.
i=2000000 processed.
i=3000000 processed.
i=4000000 processed.
i=5000000 processed.
i=6000000 processed.
i=7000000 processed.
i=8000000 processed.
i=9000000 processed.
i=10000000 processed.
i=11000000 processed.
i=12000000 processed.
i=13000000 processed.
i=14000000 processed.
i=15000000 processed.
i=16000000 processed.
i=17000000 processed.
i=18000000 processed.
END OF FILE!
66652
6325895


In [20]:
# Convert the two accumulator dicts to pandas Series indexed by the dict keys.
# Despite the `_df` suffix, both objects are Series at this point.
exposures_df = pd.Series(exposures)
adopted_df = pd.Series(adopted)
exposures_df

145344704              716
27667256               139
2603680604             728
74152939                87
38444226               904
                      ... 
1237820118488489985      2
968877907560030208       1
1045370545               2
1248263845304504321      1
1110358619009241088      1
Length: 66652, dtype: int64

In [27]:
# Build single-column DataFrames ("exposures" and "adopted") from the
# accumulator dicts. Constructing them from `exposures` / `adopted` rather than
# from the previous values of `exposures_df` / `adopted_df` keeps this cell
# idempotent: re-running it yields the same frames instead of wrapping an
# already-converted DataFrame a second time.
exposures_df = pd.DataFrame({"exposures": pd.Series(exposures)})
adopted_df = pd.DataFrame({"adopted": pd.Series(adopted)})

In [28]:
# Attach the adoption timestamp to each exposed id by joining on the index.
# A left join keeps every row of exposures_df; ids with no entry in adopted_df
# get a missing value in "adopted".
exposures_df = exposures_df.join(adopted_df, how="left")
exposures_df

Unnamed: 0,exposures,adopted
145344704,716,2020-05-26T16:54:24.000Z
27667256,139,2020-05-26T18:57:57.000Z
2603680604,728,2020-05-26T19:03:19.000Z
74152939,87,2020-05-26T19:30:08.000Z
38444226,904,2020-05-26T20:17:30.000Z
...,...,...
1237820118488489985,2,2020-06-02T20:32:10.000Z
968877907560030208,1,2020-06-02T17:01:14.000Z
1045370545,2,2020-06-02T17:05:15.000Z
1248263845304504321,1,2020-06-02T18:35:13.000Z


In [29]:
# Summary statistics; only the numeric "exposures" column appears in the output.
exposures_df.describe()

Unnamed: 0,exposures
count,66652.0
mean,274.421893
std,1048.435521
min,1.0
25%,31.0
50%,92.0
75%,245.0
max,75224.0


In [36]:
# Select matplotlib's Qt backend, so figures open in a separate window rather
# than inline in the notebook.
%matplotlib qt

# Parse the timestamp strings into datetimes, then plot exposure count against
# adoption time.
exposures_df["adopted"] =  pd.to_datetime(exposures_df["adopted"])
exposures_df.plot.scatter(x = "adopted", y = "exposures")

<AxesSubplot:xlabel='adopted', ylabel='exposures'>

In [38]:
# Persist the joined frame as a Feather file.
# NOTE(review): unlike the input paths this one has no "../" prefix, so it is
# resolved against the notebook's working directory — confirm that a
# "feathers/" directory is expected there.
feather.write_feather(exposures_df, "feathers/exposures_df.ftr")

# Tie Ratio

Tie_Ratio(m) = 1 / (m * (m - 1)) [for all pairs of users in the first m adoptions]