## 5 OBSERVATION-BASED MECHANISM

In [1]:
import summarize_results as sr

In [2]:
# Load the edge table and the node table from S3 and expose both to Spark SQL.
# createOrReplaceTempView replaces registerTempTable, which has been
# deprecated since Spark 2.0; the resulting session-scoped views are the same.
td = spark.read.parquet("s3://social-research-cheating/edges/rev_obs_data.parquet")
td.createOrReplaceTempView("td")

nodes = spark.read.parquet("s3://social-research-cheating/nodes.parquet")
nodes.createOrReplaceTempView("nodes")

### 5.1 MOTIF COUNTS IN THE DATA

In [6]:
# Add `aid`: a 1-based row number within each `mid` partition, ordered by `time`.
# NOTE(review): rows that share the same `time` inside one `mid` get an
# arbitrary relative order from ROW_NUMBER — confirm ties cannot occur.
obs_data = spark.sql("SELECT *, ROW_NUMBER() OVER (PARTITION BY mid ORDER BY time) AS aid FROM td")
# createOrReplaceTempView replaces the deprecated registerTempTable.
obs_data.createOrReplaceTempView("obs_data")

# sr.get_observers is a project helper; its internals are not visible here.
observers = sr.get_observers(obs_data)
observers.show(5)
# Persist the result so later cells can reload it instead of recomputing.
# With the writer's default save mode this raises if the path already exists.
observers.write.parquet("s3://social-research-cheating/summary-tables/emp-net/observers.parquet")

+--------------------+--------------------+----------+----------+------+--------------------+----------+
|                 mid|                  id|start_date|    m_date|period|              killer|num_of_obs|
+--------------------+--------------------+----------+----------+------+--------------------+----------+
|1fdcc035-4caa-4d8...|account.5cc9dddaa...|2019-03-04|2019-03-03|     1|account.f6539b2da...|         6|
|49ce2354-78d8-45a...|account.9b71d114a...|2019-03-27|2019-03-26|     1|account.86503f31e...|         4|
|05b171f5-8f0c-4b9...|account.8d1623485...|2019-03-10|2019-03-09|     1|account.0e0f2587d...|         5|
|dcde74f2-9ac8-4ac...|account.d9c42f038...|2019-03-16|2019-03-10|     6|account.d4e33cc11...|         4|
|93adaa78-5656-4f2...|account.769ddbdda...|2019-03-05|2019-03-04|     1|account.e4d805ef9...|         1|
+--------------------+--------------------+----------+----------+------+--------------------+----------+
only showing top 5 rows



In [2]:
# Reload the observers table from the path it was persisted to and expose it
# to Spark SQL. createOrReplaceTempView replaces the deprecated
# registerTempTable.
observers = spark.read.parquet("s3://social-research-cheating/summary-tables/emp-net/observers.parquet")
observers.createOrReplaceTempView("observers")

# Build two summary tables that differ only in the second argument (2 vs 5).
# NOTE(review): the meaning of that argument is defined in summarize_results
# and is not visible in this notebook — presumably a threshold; confirm.
simple_obs = sr.get_obs_summary_tab(observers, 2)
simple_obs.show(5)

strict_obs = sr.get_obs_summary_tab(observers, 5)
strict_obs.show(5)

# With the writer's default save mode these raise if the path already exists.
simple_obs.write.parquet("s3://social-research-cheating/summary-tables/emp-net/simple_obs.parquet")
strict_obs.write.parquet("s3://social-research-cheating/summary-tables/emp-net/strict_obs.parquet")

+--------------------+----------+----------+------+---------+
|                  id|start_date|    m_date|period|total_obs|
+--------------------+----------+----------+------+---------+
|account.f21ff46ad...|2019-03-23|2019-03-20|     3|        5|
|account.6eb0ac80c...|2019-03-14|2019-03-01|    13|       12|
|account.ba5039815...|2019-03-04|2019-03-03|     1|        1|
|account.26c64a1ab...|2019-03-05|2019-03-01|     4|       22|
|account.dfde84051...|2019-03-13|2019-03-01|    12|        4|
+--------------------+----------+----------+------+---------+
only showing top 5 rows

+--------------------+----------+----------+------+---------+
|                  id|start_date|    m_date|period|total_obs|
+--------------------+----------+----------+------+---------+
|account.f21ff46ad...|2019-03-23|2019-03-20|     3|        1|
|account.6eb0ac80c...|2019-03-14|2019-03-01|    13|        4|
|account.ba5039815...|2019-03-04|2019-03-03|     1|        0|
|account.26c64a1ab...|2019-03-05|2019-03-01| 

### 5.2 MOTIF COUNTS IN SIMULATIONS

In [3]:
# For each of the five mapping tables, relabel both endpoints of every edge in
# the `td` view, attach node attributes to the relabelled endpoints, and
# write the resulting observers table.
#
# The randomised edges are registered under their own view name
# ("randomized_td"). Registering them as "td" would overwrite the edge view
# this loop reads from, so every iteration after the first would randomise the
# previous iteration's output instead of the original edges, and any later
# query against "td" would silently see randomised data.
for i in range(1, 6):
    mapping_table = spark.read.parquet("s3://social-research-cheating/mapping-tables/map_"
                                       + str(i) + ".parquet")
    # createOrReplaceTempView replaces the deprecated registerTempTable.
    mapping_table.createOrReplaceTempView("mapping_table")

    # Step 1: map each edge's src to its randomised counterpart in the same match.
    src_mapped = spark.sql("""SELECT mid, src, randomised AS new_src, dst, time, m_date 
                        FROM td t JOIN mapping_table m 
                        ON t.src = m.original AND t.mid = m.match_id""")
    src_mapped.createOrReplaceTempView("temp")

    # Step 2: map dst the same way; the relabelled src replaces the original src.
    randomized_edges = spark.sql("""SELECT mid, new_src AS src, randomised AS dst, time, m_date 
                                   FROM temp t JOIN mapping_table m 
                                   ON t.dst = m.original AND t.mid = m.match_id""")
    randomized_edges.createOrReplaceTempView("randomized_data")

    # Step 3: attach node attributes for src. src_curr_flag is 1 when m_date
    # lies within [start_date, ban_date] of the src node, otherwise 0.
    add_flags = spark.sql("""SELECT mid, src, start_date AS src_sd, ban_date AS src_bd, 
                             cheating_flag AS src_flag,
                             CASE WHEN m_date <= ban_date AND m_date >= start_date THEN 1 ELSE 0 END 
                             AS src_curr_flag, dst, time, m_date 
                             FROM randomized_data r JOIN nodes n ON r.src = n.id""")
    add_flags.createOrReplaceTempView("add_flags")

    # Step 4: attach the same attributes for dst.
    randomized_data = spark.sql("""SELECT mid, src, src_sd, src_bd, src_flag, src_curr_flag,
                                   dst, start_date AS dst_sd, ban_date AS dst_bd, cheating_flag AS dst_flag,
                                   CASE WHEN m_date <= ban_date AND m_date >= start_date THEN 1 ELSE 0 END 
                                   AS dst_curr_flag, time, m_date 
                                   FROM add_flags r JOIN nodes n ON r.dst = n.id""")
    # Distinct view name: keeps the "td" view untouched for the next iteration.
    randomized_data.createOrReplaceTempView("randomized_td")

    # Number the rows within each match by time, as is done for the empirical data.
    obs_data = spark.sql("SELECT *, ROW_NUMBER() OVER (PARTITION BY mid ORDER BY time) AS aid "
                         "FROM randomized_td")
    obs_data.createOrReplaceTempView("obs_data")

    # sr.get_observers is a project helper; its internals are not visible here.
    # NOTE(review): if it queries the "td" view by name rather than using the
    # frame passed in, it must be pointed at "randomized_td" — confirm.
    observers = sr.get_observers(obs_data)
    # With the writer's default save mode this raises if the path already exists.
    observers.write.parquet("s3://social-research-cheating/summary-tables/rand-net/observers/observers_"
                            + str(i) + ".parquet")

In [4]:
# For each of the five randomised networks, reload its observers table and
# write the two summary tables (second argument 2 and 5).
# NOTE(review): the meaning of that second argument is defined in
# summarize_results and is not visible in this notebook — confirm.
for i in range(1, 6):
    observers = spark.read.parquet("s3://social-research-cheating/summary-tables/rand-net/observers/observers_"
                                   + str(i) + ".parquet")
    # createOrReplaceTempView replaces the deprecated registerTempTable.
    observers.createOrReplaceTempView("observers")

    # With the writer's default save mode each write raises if the path exists.
    simple_obs = sr.get_obs_summary_tab(observers, 2)
    simple_obs.write.parquet("s3://social-research-cheating/summary-tables/rand-net/obs/simple_obs/obs_"
                             + str(i) + ".parquet")

    strict_obs = sr.get_obs_summary_tab(observers, 5)
    strict_obs.write.parquet("s3://social-research-cheating/summary-tables/rand-net/obs/strict_obs/obs_"
                             + str(i) + ".parquet")