05-observation-based-mechanism.ipynb
======================

**Things to do**
* Test code.
* Find the number of unique cheaters who harmed other players severely.
* Make code modular.

**Sample match IDs for testing**
* 000213be-6b3b-438a-8d20-c1b57b01a174 (no cheater)
* 07a471f7-4776-460d-b896-1306b98b6d19 (one cheater)
* 15e457b1-0940-47ca-a730-de0dfd1ccd77 (two cheaters)

## Load packages and read tables.

In [1]:
from pyspark.sql.functions import col, lit, when
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pyspark.sql.types import StructType, StructField, LongType
import pubg_analysis as pubg

In [2]:
# Load the edge table and expose it to Spark SQL under the name "td".
# createOrReplaceTempView is used instead of registerTempTable, which has been
# deprecated since Spark 2.0.
td = spark.read.parquet("s3://social-research-cheating/edges/obs_data.parquet")
td.createOrReplaceTempView("td")

# Load the player table. Note that the SQL name is "nodes", not "players".
players = spark.read.parquet("s3://social-research-cheating/nodes.parquet")
players.createOrReplaceTempView("nodes")

In [4]:
# Preview the first five rows of the edge table, then of the player table.
for preview in (td, players):
    preview.show(5)

+--------------------+--------------------+------+------+-------------+--------+--------------------+------+------+-------------+--------+--------------------+----------+
|                 mid|                 src|src_sd|src_bd|src_curr_flag|src_flag|                 dst|dst_sd|dst_bd|dst_curr_flag|dst_flag|                time|    m_date|
+--------------------+--------------------+------+------+-------------+--------+--------------------+------+------+-------------+--------+--------------------+----------+
|013caebc-8504-4d7...|account.618a8b12e...|    NA|    NA|            0|       0|account.847e9695e...|    NA|    NA|            0|       0|2019-03-07 11:37:...|2019-03-07|
|013caebc-8504-4d7...|account.0e2dd932a...|    NA|    NA|            0|       0|account.43d92d8f6...|    NA|    NA|            0|       0|2019-03-07 11:37:...|2019-03-07|
|013caebc-8504-4d7...|account.247cce3d3...|    NA|    NA|            0|       0|account.49ec1af7c...|    NA|    NA|            0|       0|2019-03

## 1. Count the number of motifs on the empirical network.

In [5]:
# First, assume that victims are severely harmed if they were killed after getting into the top 30 percent.
# The second argument (30) is that percentage threshold. In the output below the result
# carries three extra columns: num_rows, ranking and damage.
res_tab = pubg.add_level_of_harm(td, 30)

# Expose the result to Spark SQL as "new_td" (registerTempTable is deprecated since Spark 2.0).
res_tab.createOrReplaceTempView("new_td")
res_tab.show(5)

+--------------------+--------------------+------+------+-------------+--------+--------------------+------+------+-------------+--------+--------------------+----------+--------+-------+------+
|                 mid|                 src|src_sd|src_bd|src_curr_flag|src_flag|                 dst|dst_sd|dst_bd|dst_curr_flag|dst_flag|                time|    m_date|num_rows|ranking|damage|
+--------------------+--------------------+------+------+-------------+--------+--------------------+------+------+-------------+--------+--------------------+----------+--------+-------+------+
|00010411-099d-419...|account.8de17f9ae...|    NA|    NA|            0|       0|account.de378e31d...|    NA|    NA|            0|       0|2019-03-03 18:16:...|2019-03-03|      84|      1|     1|
|00010411-099d-419...|account.de378e31d...|    NA|    NA|            0|       0|account.7b1593aa4...|    NA|    NA|            0|       0|2019-03-03 18:16:...|2019-03-03|      84|      2|     1|
|00010411-099d-419...|acc

In [6]:
# Number the rows of each match in time order: within a given mid, aid = 1 is the
# earliest row. Rows with identical timestamps receive an arbitrary relative order.
records = spark.sql("SELECT *, ROW_NUMBER() OVER (PARTITION BY mid ORDER BY time) AS aid FROM new_td")

# Expose the numbered rows to Spark SQL as "records" (registerTempTable is deprecated since Spark 2.0).
records.createOrReplaceTempView("records")

In [7]:
# Build the observer summary table of the empirical network and preview it.
observers = pubg.get_observers(records)
observers.show(5)

# Persist the summary table to the S3 bucket for later use.
# NOTE(review): the default save mode raises if the target path already exists,
# so re-running this cell against an existing output will fail.
observers_path = "s3://social-research-cheating/summary-tables/emp-net/observers/new_observers_30.parquet"
observers.write.parquet(observers_path)

+--------------------+--------------------+----------+----------+------+--------------------+---+-------+
|                 mid|                  id|start_date|    m_date|period|              killer|obs|sev_dam|
+--------------------+--------------------+----------+----------+------+--------------------+---+-------+
|fc500511-eb82-4d5...|account.ffa1c3246...|2019-03-03|2019-03-01|     2|account.ab37a9b8d...|  4|      0|
|ab2e4d50-8d3d-453...|account.b1e3c68c2...|2019-03-08|2019-03-02|     6|account.69784fd42...| 12|      8|
|39156f0b-4b13-4f7...|account.e95674b02...|2019-03-15|2019-03-11|     4|account.c2d667eb3...|  2|      0|
|9f2e8e0e-a4bd-4b4...|account.ae0a3b2cc...|2019-03-25|2019-03-10|    15|account.eccfc12bf...|  6|      2|
|44d0ae45-1180-419...|account.5183e6cd3...|2019-03-06|2019-03-04|     2|account.2ec6e051e...|  7|      0|
+--------------------+--------------------+----------+----------+------+--------------------+---+-------+
only showing top 5 rows



*The cells below pull the observer rows of individual matches so that the summary table can be checked by hand.*

In [7]:
# Test the code.
# mid 1 = 'dc12a386-7826-42d6-94b6-8b43d6e6a3fc'

# Expose the summary table to Spark SQL so that single matches can be inspected.
# (registerTempTable is deprecated since Spark 2.0.)
observers.createOrReplaceTempView("observers")

# Show every observer row recorded for the first test match.
test_tab = spark.sql("SELECT * FROM observers WHERE mid ='dc12a386-7826-42d6-94b6-8b43d6e6a3fc'")
test_tab.show()

+--------------------+--------------------+----------+----------+------+--------------------+---+-------+
|                 mid|                  id|start_date|    m_date|period|              killer|obs|sev_dam|
+--------------------+--------------------+----------+----------+------+--------------------+---+-------+
|dc12a386-7826-42d...|account.821b8fa28...|2019-03-04|2019-03-02|     2|account.b5d2d9eae...|  6|      4|
|dc12a386-7826-42d...|account.821b8fa28...|2019-03-04|2019-03-02|     2|account.b7cca1636...|  7|      1|
|dc12a386-7826-42d...|account.936171e13...|2019-03-03|2019-03-02|     1|account.b5d2d9eae...|  2|      0|
|dc12a386-7826-42d...|account.936171e13...|2019-03-03|2019-03-02|     1|account.b7cca1636...|  6|      0|
+--------------------+--------------------+----------+----------+------+--------------------+---+-------+



In [8]:
# Repeat of the match-1 check: (re-)register the summary table and list the observer
# rows for mid 'dc12a386-7826-42d6-94b6-8b43d6e6a3fc'.
# (registerTempTable is deprecated since Spark 2.0.)
observers.createOrReplaceTempView("observers")
test_tab = spark.sql("SELECT * FROM observers WHERE mid ='dc12a386-7826-42d6-94b6-8b43d6e6a3fc'")
test_tab.show()

+--------------------+--------------------+----------+----------+------+--------------------+---+-------+
|                 mid|                  id|start_date|    m_date|period|              killer|obs|sev_dam|
+--------------------+--------------------+----------+----------+------+--------------------+---+-------+
|dc12a386-7826-42d...|account.821b8fa28...|2019-03-04|2019-03-02|     2|account.b5d2d9eae...|  6|      4|
|dc12a386-7826-42d...|account.821b8fa28...|2019-03-04|2019-03-02|     2|account.b7cca1636...|  7|      1|
|dc12a386-7826-42d...|account.936171e13...|2019-03-03|2019-03-02|     1|account.b5d2d9eae...|  2|      0|
|dc12a386-7826-42d...|account.936171e13...|2019-03-03|2019-03-02|     1|account.b7cca1636...|  6|      0|
+--------------------+--------------------+----------+----------+------+--------------------+---+-------+



In [8]:
# Test the code.
# mid 2 = '42ddcedc-225b-485b-b155-572bcee86af7'

# Show every observer row recorded for the second test match.
# Relies on the "observers" temp table registered in an earlier cell.
test_mid = '42ddcedc-225b-485b-b155-572bcee86af7'
test_tab = spark.sql("SELECT * FROM observers WHERE mid ='" + test_mid + "'")
test_tab.show()

+--------------------+--------------------+----------+----------+------+--------------------+---+-------+
|                 mid|                  id|start_date|    m_date|period|              killer|obs|sev_dam|
+--------------------+--------------------+----------+----------+------+--------------------+---+-------+
|42ddcedc-225b-485...|account.8720ae80b...|2019-03-04|2019-03-02|     2|account.851e8f8ba...|  8|      0|
|42ddcedc-225b-485...|account.f8da2242d...|2019-03-04|2019-03-02|     2|account.851e8f8ba...|  5|      0|
|42ddcedc-225b-485...|account.58edd9c06...|2019-03-03|2019-03-02|     1|account.851e8f8ba...|  3|      0|
|42ddcedc-225b-485...|account.5ff57478a...|2019-03-05|2019-03-02|     3|account.851e8f8ba...|  5|      0|
|42ddcedc-225b-485...|account.a9af721e6...|2019-03-30|2019-03-02|    28|account.851e8f8ba...|  8|      0|
+--------------------+--------------------+----------+----------+------+--------------------+---+-------+



In [9]:
# Repeat of the match-2 check: list the observer rows for the second test match.
test_query = "SELECT * FROM observers WHERE mid ='{}'".format('42ddcedc-225b-485b-b155-572bcee86af7')
test_tab = spark.sql(test_query)
test_tab.show()

+--------------------+--------------------+----------+----------+------+--------------------+---+-------+
|                 mid|                  id|start_date|    m_date|period|              killer|obs|sev_dam|
+--------------------+--------------------+----------+----------+------+--------------------+---+-------+
|42ddcedc-225b-485...|account.8720ae80b...|2019-03-04|2019-03-02|     2|account.851e8f8ba...|  8|      0|
|42ddcedc-225b-485...|account.f8da2242d...|2019-03-04|2019-03-02|     2|account.851e8f8ba...|  5|      0|
|42ddcedc-225b-485...|account.58edd9c06...|2019-03-03|2019-03-02|     1|account.851e8f8ba...|  3|      0|
|42ddcedc-225b-485...|account.5ff57478a...|2019-03-05|2019-03-02|     3|account.851e8f8ba...|  5|      0|
|42ddcedc-225b-485...|account.a9af721e6...|2019-03-30|2019-03-02|    28|account.851e8f8ba...|  8|      0|
+--------------------+--------------------+----------+----------+------+--------------------+---+-------+



In [7]:
# Reload the stored observer summary of the empirical network and expose it to Spark SQL.
# (createOrReplaceTempView replaces registerTempTable, deprecated since Spark 2.0.)
observers = spark.read.parquet("s3://social-research-cheating/summary-tables/emp-net/observers/new_observers_30.parquet")
observers.createOrReplaceTempView("observers")

# Aggregate per (id, start_date): total observations, total severe damage, the number of
# rows with at least 5 observations, and the number of those rows that also have sev_dam > 0.
# NOTE(review): total_cheaters / sev_cheaters count rows of "observers" (one row per
# mid, id, killer in the output above), not distinct killers. If the same killer can appear
# in several matches for one id, it is counted once per match. Confirm whether
# COUNT(DISTINCT killer) is what is intended.
obs_info = spark.sql("""SELECT id, start_date, SUM(obs) AS total_obs, SUM(sev_dam) AS total_sev_dam, 
                        SUM(CASE WHEN obs >= 5 THEN 1 ELSE 0 END) AS total_cheaters, 
                        SUM(CASE WHEN obs >= 5 AND sev_dam > 0 THEN 1 ELSE 0 END) AS sev_cheaters 
                        FROM observers
                        GROUP BY id, start_date""")
obs_info.createOrReplaceTempView("obs_info")

# Get the date when the player first observed cheating: keep, for every id, the row with the
# earliest m_date (rownumber = 1) together with its period.
first_m_dates = spark.sql("""SELECT * 
                             FROM (SELECT id, m_date, period, ROW_NUMBER() OVER (PARTITION BY id ORDER BY m_date) 
                             AS rownumber FROM observers) WHERE rownumber IN (1)""")
first_m_dates.createOrReplaceTempView("first_m_dates")

# Attach the first observation date and its period to the per-player totals.
add_dates = spark.sql("""SELECT o.id, o.start_date, f.m_date, f.period, 
                         o.total_obs, o.total_sev_dam, o.total_cheaters, o.sev_cheaters 
                         FROM obs_info o LEFT JOIN first_m_dates f ON o.id = f.id""")

# Store the summary table in the S3 bucket for later use.
add_dates.write.parquet("s3://social-research-cheating/summary-tables/emp-net/obs/obs_30_5.parquet")

## 2. Reuse the mapping table in the S3 bucket to create randomised networks.

In [10]:
# Reload the edge table and expose it to Spark SQL as "td".
# (createOrReplaceTempView replaces registerTempTable, deprecated since Spark 2.0.)
td = spark.read.parquet("s3://social-research-cheating/edges/obs_data.parquet")
td.createOrReplaceTempView("td")

# Read the first stored mapping table (map_1), which pairs each original player id with
# a randomised one per match, and preview it.
map_tab = spark.read.parquet("s3://social-research-cheating/mapping-tables/map_1.parquet")
map_tab.createOrReplaceTempView("map_tab")
map_tab.show(5)

+--------------------+--------------------+---------+--------+--------------------+---------+--------+
|            match_id|            original|orig_flag|orig_tid|          randomised|rand_flag|rand_tid|
+--------------------+--------------------+---------+--------+--------------------+---------+--------+
|07c3165b-19ca-412...|account.ad3c0bd6d...|        0|      12|account.ad3c0bd6d...|        0|      12|
|07c3165b-19ca-412...|account.e2a400b78...|        0|       8|account.dfd5fd1b3...|        0|       8|
|07c3165b-19ca-412...|account.993a6791a...|        0|       9|account.efbc5b1d2...|        0|       9|
|07c3165b-19ca-412...|account.5f5216d74...|        0|      13|account.5f5216d74...|        0|      13|
|07c3165b-19ca-412...|account.399fe82f8...|        0|      24|account.c55215aff...|        0|      24|
+--------------------+--------------------+---------+--------+--------------------+---------+--------+
only showing top 5 rows



## 3. Count the number of motifs on the randomised network.

In [2]:
# Load the edge table and expose it to Spark SQL under the name "td".
# createOrReplaceTempView is used instead of registerTempTable, which has been
# deprecated since Spark 2.0.
td = spark.read.parquet("s3://social-research-cheating/edges/obs_data.parquet")
td.createOrReplaceTempView("td")

# Load the player table. Note that the SQL name is "nodes", not "players".
players = spark.read.parquet("s3://social-research-cheating/nodes.parquet")
players.createOrReplaceTempView("nodes")

In [3]:
# Build the observer summary table for randomised networks 3, 4 and 5.
# NOTE(review): the aggregation loop further down reads summaries 1-5, so networks 1 and 2
# were presumably produced in an earlier run of this cell. Confirm before a clean re-run.
# All temp views use createOrReplaceTempView; registerTempTable is deprecated since Spark 2.0.
for i in range(3, 6):
    # Read the mapping table.
    map_tab = spark.read.parquet("s3://social-research-cheating/mapping-tables/map_" 
                                 + str(i) + ".parquet")
    map_tab.createOrReplaceTempView("map_tab")

    # Get randomised gameplay logs.
    # Step 1: look up the randomised id of every src within the same match.
    temp_rand_logs = spark.sql("""SELECT mid, src, randomised AS new_src, dst, time, m_date 
                                  FROM td t JOIN map_tab m ON t.src = m.original AND t.mid = m.match_id""")
    temp_rand_logs.createOrReplaceTempView("temp_rand_logs")

    # Step 2: do the same for dst, so that both endpoints of each edge are randomised ids.
    randomised_logs = spark.sql("""SELECT mid, new_src AS src, randomised AS dst, time, m_date 
                                   FROM temp_rand_logs t JOIN map_tab m 
                                   ON t.dst = m.original AND t.mid = m.match_id""")

    randomised_logs.createOrReplaceTempView("randomised_logs")

    # Add more information about players.
    # src side: src_curr_flag is 1 when the match date lies within [start_date, ban_date].
    add_flags = spark.sql("""SELECT mid, src, start_date AS src_sd, ban_date AS src_bd, cheating_flag AS src_flag,
                             CASE WHEN m_date <= ban_date AND m_date >= start_date THEN 1 ELSE 0 END AS src_curr_flag, 
                             dst, time, m_date 
                             FROM randomised_logs r JOIN nodes n ON r.src = n.id""")
    add_flags.createOrReplaceTempView("add_flags")

    # dst side: same attributes and the same date-window flag for the dst player.
    randomised_logs = spark.sql("""SELECT mid, src, src_sd, src_bd, src_flag, src_curr_flag,
                                   dst, start_date AS dst_sd, ban_date AS dst_bd, cheating_flag AS dst_flag,
                                   CASE WHEN m_date <= ban_date AND m_date >= start_date THEN 1 ELSE 0 END AS dst_curr_flag,
                                   time, m_date 
                                   FROM add_flags r JOIN nodes n ON r.dst = n.id""")
    # NOTE(review): no SQL in this notebook reads "logs"; presumably a pubg helper does. Confirm.
    randomised_logs.createOrReplaceTempView("logs")

    # Apply the same 30 percent harm threshold that was used for the empirical network.
    rand_logs = pubg.add_level_of_harm(randomised_logs, 30)
    rand_logs.createOrReplaceTempView("new_td")

    # Number the rows of each match in time order (aid = 1 is the earliest row).
    records = spark.sql("SELECT *, ROW_NUMBER() OVER (PARTITION BY mid ORDER BY time) AS aid FROM new_td")
    records.createOrReplaceTempView("records")

    # Get a summary table of the randomised network.
    observers = pubg.get_observers(records)

    # Store the summary table in the S3 bucket for later use.
    observers.write.parquet("s3://social-research-cheating/summary-tables/rand-net/observers/observers_30/observers_30_" 
                            + str(i) + ".parquet")

In [2]:
# Aggregate the stored observer summaries of randomised networks 1-5 into per-player tables.
# All temp views use createOrReplaceTempView; registerTempTable is deprecated since Spark 2.0.
for i in range(1, 6):
    observers = spark.read.parquet("s3://social-research-cheating/summary-tables/rand-net/observers/observers_30/observers_30_" 
                                   + str(i) + ".parquet")
    observers.createOrReplaceTempView("observers")

    # Aggregate per (id, start_date): total observations, total severe damage, the number of
    # rows with at least 5 observations, and the number of those rows that also have sev_dam > 0.
    # NOTE(review): total_cheaters / sev_cheaters count rows, not distinct killers. Confirm
    # whether a killer observed in several matches should be counted only once.
    obs_info = spark.sql("""SELECT id, start_date, SUM(obs) AS total_obs, SUM(sev_dam) AS total_sev_dam, 
                            SUM(CASE WHEN obs >= 5 THEN 1 ELSE 0 END) AS total_cheaters, 
                            SUM(CASE WHEN obs >= 5 AND sev_dam > 0 THEN 1 ELSE 0 END) AS sev_cheaters 
                            FROM observers
                            GROUP BY id, start_date""")
    obs_info.createOrReplaceTempView("obs_info")

    # Get the date when the player first observed cheating: keep, for every id, the row with
    # the earliest m_date (rownumber = 1) together with its period.
    first_m_dates = spark.sql("""SELECT * 
                                 FROM (SELECT id, m_date, period, ROW_NUMBER() OVER (PARTITION BY id ORDER BY m_date) 
                                 AS rownumber FROM observers) WHERE rownumber IN (1)""")
    first_m_dates.createOrReplaceTempView("first_m_dates")

    # Attach the first observation date and its period to the per-player totals.
    add_dates = spark.sql("""SELECT o.id, o.start_date, f.m_date, f.period, 
                             o.total_obs, o.total_sev_dam, o.total_cheaters, o.sev_cheaters 
                             FROM obs_info o LEFT JOIN first_m_dates f ON o.id = f.id""")

    # Store the summary table in the S3 bucket for later use.
    add_dates.write.parquet("s3://social-research-cheating/summary-tables/rand-net/obs/obs_30_5/obs_30_5_" 
                            + str(i) + ".parquet")