## 1 STORING DATA IN AMAZON S3

In [1]:
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, TimestampType
import pandas as pd
import get_data

### 1.1 PLAYER DATA

| Variable   | Explanation   
|:-----------|:-------
| id         | ID of the player               
| pname      | nickname of the player  
| cheating_flag     | 1 if the player was banned, 0 otherwise
| ban_date   | date in the format YYYY-MM-DD when the cheater was banned

In [2]:
node_schema = StructType([StructField("id", StringType(), True),
                          StructField("pname", StringType(), True),
                          StructField("cheating_flag", IntegerType(), True),
                          StructField("ban_date", StringType(), True)])

FILE_PATH = "s3://social-research-cheating/td_nodes.txt"

players = spark.read.options(header='false', delimiter='\t').schema(node_schema).csv(FILE_PATH)
players.write.parquet("s3://social-research-cheating/players.parquet")
players.show(10)

+--------------------+---------------+-------------+--------+
|                  id|          pname|cheating_flag|ban_date|
+--------------------+---------------+-------------+--------+
|account.1d0281ff2...|      ulimnet10|            0|      NA|
|account.1c295c6c0...|       yoon9242|            0|      NA|
|account.a2b8791d5...|        meco001|            0|      NA|
|account.e3b1eb159...|         forsir|            0|      NA|
|account.65433d8ee...|      jimin0311|            0|      NA|
|account.74c0462cd...|namyoonwoo07074|            0|      NA|
|account.64d031587...|       wreu1234|            0|      NA|
|account.7f874085e...|        kbs4799|            0|      NA|
|account.5c8366a6b...|       ssabu110|            0|      NA|
|account.d89f4429c...|      gusrb0187|            0|      NA|
+--------------------+---------------+-------------+--------+
only showing top 10 rows



### 1.2 THE LOG

The log used in this analysis is a history of actions done by players in the game.

| Variable   | Explanation   
|:-----------|:-------
| mid         | ID of the match               
| src      | ID of the killer  
| dst     | ID of the victim 
| time   | time in the format YYYY-MM-DD HH:MM:SS.SSS Z when the attack (killing) happened
| m_date   | date in the format YYYY-MM-DD when the match was played 

In [3]:
file_nums = [(1, 7), (2, 7), (3, 7), (4, 4), (5, 4), 
             (6, 4), (7, 4), (8, 5), (9, 7), (10, 6),
             (11, 4), (12, 4)]

RAW_DATA_PATH = "s3://social-research-cheating/edges/raw_td.parquet"

for tup in file_nums:
    get_data.combine_telemetry_data(tup[0], tup[1], RAW_DATA_PATH)

In [2]:
raw_td = spark.read.parquet("s3://social-research-cheating/raw_td.parquet")
raw_td.registerTempTable("raw_td")
print(raw_td.count())

98319451


### 1.3 TEAM ID AND RANK DATA OF TEAMS

In [27]:
PATH_TO_TEAM_DATA = "s3://social-research-cheating/team_data.parquet"
PATH_TO_RANK_DATA = "s3://social-research-cheating/edges/team_ranks.parquet"

get_data.combine_team_data(31, 6, PATH_TO_TEAM_DATA, PATH_TO_RANK_DATA)

In [3]:
team_data = spark.read.parquet(PATH_TO_TEAM_DATA)
team_data.show(10)
print(team_data.count())

+--------------------+--------------------+---+
|                 mid|                  id|tid|
+--------------------+--------------------+---+
|b6a091d4-2bdb-451...|account.9fbe4bbe5...|  1|
|24d0a877-2d20-43a...|account.9ad264163...| 17|
|866b5d75-0d8f-497...|account.4c10d9e9f...| 47|
|476c22d8-d929-46c...|account.74c896572...| 21|
|499aa106-272e-468...|account.bebee03c5...| 29|
|355aafa1-b7a2-45c...|account.289b29eda...| 13|
|4020041c-a4a6-46f...|account.4d93bc13f...| 35|
|450b9c1c-6bd0-4d7...|account.a8a2ff4b7...| 15|
|79ca6d6c-8f3a-485...|account.452fb2497...| 30|
|02c36bd8-de13-479...|account.1a3ac664c...| 14|
+--------------------+--------------------+---+
only showing top 10 rows

93730706


In [2]:
rank_data = spark.read.parquet(PATH_TO_RANK_DATA)
rank_data.registerTempTable("rank_data")
rank_data.show(5)

+--------------------+---+-----+----+----------+
|                 mid|tid|  mod|rank|    m_date|
+--------------------+---+-----+----+----------+
|f905942d-149d-49d...| 38|  duo|   3|2019-03-17|
|a8f5eca6-cc65-480...| 15|squad|   7|2019-03-17|
|2b708e1f-5496-4fb...| 24|  duo|  29|2019-03-17|
|63514f97-098a-496...| 30|  duo|   9|2019-03-17|
|3b171f42-13c5-4df...| 26|squad|   5|2019-03-17|
+--------------------+---+-----+----+----------+
only showing top 5 rows



### 1.4 THE LOG OF MATCHES WITH VICTIMS OF CHEATING (INCLUDING SELF-LOOPS)

The data used for analyzing the observation-based mechanism should contain self-loops because players who killed themselves (self-loops) cannot observe what happens in the rest of the match after they die. To reduce the data size, we extract the matches with at least one victim of cheating and at least one potential cheater. The number of unique match IDs in the data is 19,216.

| Variable   | Explanation   
|:-----------|:-------
| mid         | ID of the match               
| src      | ID of the killer
| src_sd      | date in the format YYYY-MM-DD when the killer started cheating ('NA' if the player is a non-cheater)
| src_bd      | date in the format YYYY-MM-DD when the killer was banned ('NA' if the player is a non-cheater)
| src_curr_flag      | 1 if the killer was cheating on the date when the match was played
| src_flag      | 1 if the killer was banned, 0 otherwise
| dst     | ID of the victim
| dst_sd      | date in the format YYYY-MM-DD when the victim started cheating ('NA' if the player is a non-cheater)
| dst_bd      | date in the format YYYY-MM-DD when the victim was banned ('NA' if the player is a non-cheater)
| dst_curr_flag      | 1 if the victim was cheating on the date when the match was played
| dst_flag      | 1 if the victim was banned, 0 otherwise
| time   | time in the format YYYY-MM-DD HH:MM:SS.SSS Z when the attack (killing) happened
| m_date   | date in the format YYYY-MM-DD when the match was played

In [2]:
PATH_TO_RAW_DATA = "s3://social-research-cheating/raw_td.parquet"
nodes = spark.read.parquet("s3://social-research-cheating/nodes.parquet")
nodes.registerTempTable("nodes")

get_data.get_obs_data(PATH_TO_RAW_DATA, nodes)

In [3]:
obs_data = spark.read.parquet("s3://social-research-cheating/edges/obs_data.parquet")
obs_data.registerTempTable("obs_data")
print(obs_data.count())

self_loops = spark.sql("SELECT * FROM obs_data WHERE src = dst")
print(self_loops.count())

1693699
7522


### 1.5 THE LOG OF MATCHES WITH VICTIMS OF CHEATING (EXCLUDING SELF-LOOPS)

We need the data without self-loops to analyze the victimization-based mechanism. We can simply reuse the data for the observation-based mechanism by getting rid of self-loops from it. Thus, the number of edges is 1,693,699 - 7,522 = 1,686,177.

| Variable   | Explanation   
|:-----------|:-------
| mid         | ID of the match               
| src      | ID of the killer
| src_sd      | date in the format YYYY-MM-DD when the killer started cheating ('NA' if the player is a non-cheater)
| src_bd      | date in the format YYYY-MM-DD when the killer was banned ('NA' if the player is a non-cheater)
| src_curr_flag      | 1 if the killer was cheating on the date when the match was played
| src_flag      | 1 if the killer was banned, 0 otherwise
| dst     | ID of the victim
| dst_sd      | date in the format YYYY-MM-DD when the victim started cheating ('NA' if the player is a non-cheater)
| dst_bd      | date in the format YYYY-MM-DD when the victim was banned ('NA' if the player is a non-cheater)
| dst_flag      | 1 if the victim was banned, 0 otherwise
| dst_curr_flag      | 1 if the victim was cheating on the date when the match was played
| time   | time in the format YYYY-MM-DD HH:MM:SS.SSS Z when the attack (killing) happened
| m_date   | date in the format YYYY-MM-DD when the match was played

In [6]:
spark.read.parquet("s3://social-research-cheating/edges/obs_data.parquet").createOrReplaceTempView("obs_data")
vic_data = spark.sql("SELECT * FROM obs_data WHERE src != dst")

vic_data.write.parquet("s3://social-research-cheating/edges/vic_data.parquet")
print(vic_data.count())

1686177


### 1.6 IDENTIFYING MATCHES WITH MULTIPLE WINNERS

For each match, we need to check the number of winners (or winning teams) and test whether all winners have the same team ID because each game should have only one player or team as a winner in principle.

In [2]:
obs_data = spark.read.parquet("s3://social-research-cheating/edges/obs_data.parquet")
obs_data.registerTempTable("obs_data")

team_ids = spark.read.parquet("s3://social-research-cheating/edges/tiny_team_data.parquet")
team_ids.registerTempTable("team_ids")

nodes = spark.read.parquet("s3://social-research-cheating/nodes.parquet")
nodes.registerTempTable("nodes")

In [4]:
match_info = spark.sql("SELECT DISTINCT mid, m_date FROM obs_data")
match_info.registerTempTable("match_info")

victims = spark.sql("SELECT DISTINCT mid, dst FROM obs_data")
victims.registerTempTable("victims")

winners = spark.sql("""SELECT DISTINCT o.mid, src FROM obs_data o 
                       WHERE NOT EXISTS (SELECT mid, dst FROM victims v WHERE o.mid = v.mid AND o.src = v.dst)""")
winners.registerTempTable("winners")

add_tids = spark.sql("""SELECT w.mid, src, CASE WHEN tid IS NULL THEN 'NA' ELSE tid END AS src_tid
                        FROM winners w LEFT JOIN team_ids t ON w.mid = t.mid AND w.src = t.id""")
add_tids.registerTempTable("add_tids")

# Get a list of matches where at least one winner has no team ID.
missing_tids = spark.sql("SELECT DISTINCT mid FROM add_tids WHERE src_tid = 'NA'")
missing_tids.registerTempTable("missing_tids")

add_dates = spark.sql("""SELECT a.mid, src, src_tid, m_date 
                         FROM add_tids a LEFT JOIN match_info m ON a.mid = m.mid""")
add_dates.registerTempTable("add_dates")

winners = spark.sql("""SELECT a.*, 
                       CASE WHEN cheating_flag = 1 AND m_date < start_date THEN 1 ELSE 0 END AS is_potential_cheater 
                       FROM add_dates a LEFT JOIN nodes p ON a.src = p.id""")
winners.registerTempTable("winners")
winners.show(10)

+--------------------+--------------------+-------+----------+--------------------+
|                 mid|                 src|src_tid|    m_date|is_potential_cheater|
+--------------------+--------------------+-------+----------+--------------------+
|164f026b-1ebe-432...|account.003c16440...|     15|2019-03-24|                   0|
|df0c2dab-df33-491...|account.003c16440...|     18|2019-03-03|                   0|
|e1cca6d6-b6ab-4b0...|account.00f8fc6c0...|      3|2019-03-07|                   0|
|04d7073c-2a59-4ea...|account.015a81049...|      4|2019-03-24|                   0|
|4c507cfd-6d2a-445...|account.028c2a3e2...|     NA|2019-03-06|                   0|
|a170b53b-db37-4a4...|account.028c2a3e2...|     NA|2019-03-06|                   0|
|e8d11cc8-3def-42c...|account.0315d621d...|     22|2019-03-03|                   0|
|8d05cc01-06e2-4eb...|account.035add3c5...|     NA|2019-03-01|                   0|
|0e89cc95-a856-49c...|account.03876ee12...|     16|2019-03-05|              

In [6]:
winner_stats = spark.sql("""SELECT mid, COUNT(src) AS num_of_winners, 
                            COUNT(DISTINCT src_tid) AS num_of_teams, 
                            SUM(is_potential_cheater) AS num_of_potential_cheaters 
                            FROM winners GROUP BY mid""")
winner_stats.registerTempTable("winner_stats")

summary_table = spark.sql("""SELECT w.mid, num_of_winners, num_of_teams, 
                             num_of_potential_cheaters, 
                             CASE WHEN n.mid IS NULL THEN 0 ELSE 1 END AS has_missing_tid 
                             FROM winner_stats w LEFT JOIN missing_tids n 
                             ON w.mid = n.mid""")
summary_table.show(10)
# summary_table.write.parquet("s3://social-research-cheating/general-stats/stats_of_winners.parquet")

+--------------------+--------------+------------+-------------------------+---------------+
|                 mid|num_of_winners|num_of_teams|num_of_potential_cheaters|has_missing_tid|
+--------------------+--------------+------------+-------------------------+---------------+
|0143e2da-14d2-4d8...|             9|           6|                        0|              0|
|036a8903-186b-45f...|             4|           2|                        0|              0|
|080d5622-6b94-4d7...|             3|           2|                        0|              0|
|0c7d472e-5064-4d4...|             2|           2|                        0|              0|
|0ef25288-88d3-476...|             2|           1|                        0|              0|
|1203abce-50ec-40d...|             4|           4|                        0|              0|
|1574a6bb-a63f-473...|             5|           2|                        0|              0|
|16d6f605-4118-4de...|             4|           3|                    

In [10]:
summary_table = spark.read.parquet("s3://social-research-cheating/general-stats/stats_of_winners.parquet")
summary_table.registerTempTable("summary_table")

multiple_winners = spark.sql("""SELECT * FROM summary_table 
                                WHERE num_of_teams > 1 AND num_of_potential_cheaters > 0""")
multiple_winners.show(5)
print(multiple_winners.count())

# Get a list of matches where multiple winners with different team IDs exist and at least one of their team IDs is 'NA'.
invalid_matches = spark.sql("""SELECT * FROM summary_table 
                               WHERE num_of_teams > 1 AND num_of_potential_cheaters > 0 AND has_missing_tid = 1""")
invalid_matches.show(5)
print(invalid_matches.count())

+--------------------+--------------+------------+-------------------------+---------------+
|                 mid|num_of_winners|num_of_teams|num_of_potential_cheaters|has_missing_tid|
+--------------------+--------------+------------+-------------------------+---------------+
|013caebc-8504-4d7...|             4|           4|                        1|              0|
|0bd6149a-c6f5-4ed...|             9|           7|                        1|              0|
|0c2c1334-9af0-41d...|            11|           6|                        1|              0|
|2dc03f99-5d44-42e...|             7|           5|                        1|              0|
|35866cf5-93de-48a...|             4|           2|                        1|              0|
+--------------------+--------------+------------+-------------------------+---------------+
only showing top 5 rows

1964
+--------------------+--------------+------------+-------------------------+---------------+
|                 mid|num_of_winners|num

### 1.7 ADDITIONAL SELF-LOOPS

First, we need to add extra self-loops for the matches where multiple winners with different team IDs exist but none of them have 'NA' as their team IDs.

In [2]:
obs_data = spark.read.parquet("s3://social-research-cheating/edges/obs_data.parquet")
obs_data.registerTempTable("obs_data")

team_ids = spark.read.parquet("s3://social-research-cheating/edges/tiny_team_data.parquet")
team_ids.registerTempTable("team_ids")

# The table below contains the ranks of players for 1,964 team matches. 
team_ranks = spark.read.parquet("s3://social-research-cheating/edges/ordered_ranks.parquet")
team_ranks.registerTempTable("team_ranks")

nodes = spark.read.parquet("s3://social-research-cheating/nodes.parquet")
nodes.registerTempTable("nodes")

In [3]:
summary_table = spark.read.parquet("s3://social-research-cheating/general-stats/stats_of_winners.parquet")
summary_table.registerTempTable("summary_table")

invalid_matches = spark.sql("""SELECT * FROM summary_table 
                               WHERE num_of_teams > 1 AND num_of_potential_cheaters > 0""")
invalid_matches.registerTempTable("invalid_matches")
print(invalid_matches.count())

sampled_obs = spark.sql("SELECT o.* FROM obs_data o JOIN invalid_matches i ON o.mid = i.mid")
sampled_obs.registerTempTable("sampled_obs")

add_killer_tids = spark.sql("""SELECT s.*, CASE WHEN tid IS NULL THEN 'NA' ELSE tid END AS src_tid 
                               FROM sampled_obs s LEFT JOIN team_ids t ON s.mid = t.mid AND s.src = t.id""")
add_killer_tids.registerTempTable("add_killer_tids")

add_tids = spark.sql("""SELECT a.*, CASE WHEN tid IS NULL THEN 'NA' ELSE tid END AS dst_tid 
                        FROM add_killer_tids a LEFT JOIN team_ids t 
                        ON a.mid = t.mid AND a.dst = t.id""")
add_tids.registerTempTable("add_tids")

1964


In [7]:
victims = spark.sql("SELECT mid, dst FROM sampled_obs")
victims.registerTempTable("victims")

winners = spark.sql("""SELECT DISTINCT o.mid, src, src_tid, m_date FROM add_tids o 
                       WHERE NOT EXISTS (SELECT mid, dst FROM victims v WHERE o.mid = v.mid AND o.src = v.dst)""")
winners.registerTempTable("winners")

winners_with_flags = spark.sql("""SELECT t.*, 
                                  CASE WHEN cheating_flag = 1 AND m_date < start_date THEN 1 ELSE 0 END 
                                  AS is_potential_cheater 
                                  FROM winners t LEFT JOIN nodes n ON t.src = n.id""")
winners_with_flags.registerTempTable("winners_with_flags")

# Get a list of winners (with team IDs) who are potential cheaters.
potential_cheaters = spark.sql("""SELECT * FROM winners_with_flags 
                                  WHERE is_potential_cheater = 1 AND src_tid != 'NA'""")
potential_cheaters.registerTempTable("potential_cheaters")
potential_cheaters.show(5)

+--------------------+--------------------+-------+----------+--------------------+
|                 mid|                 src|src_tid|    m_date|is_potential_cheater|
+--------------------+--------------------+-------+----------+--------------------+
|44925719-4ae3-421...|account.175b7548e...|     14|2019-03-10|                   1|
|21330d5b-0ba7-420...|account.175b7548e...|     23|2019-03-10|                   1|
|110662d0-787d-49f...|account.175b7548e...|     26|2019-03-17|                   1|
|82021728-9cd8-4e6...|account.175b7548e...|     19|2019-03-10|                   1|
|130ffdf8-f954-436...|account.175b7548e...|     10|2019-03-17|                   1|
+--------------------+--------------------+-------+----------+--------------------+
only showing top 5 rows



In [10]:
add_ranks = spark.sql("""SELECT w.mid, src, src_tid, 
                         CASE WHEN rank IS NULL THEN 'NA' ELSE rank END AS src_rank 
                         FROM potential_cheaters w LEFT JOIN team_ranks t 
                         ON w.mid = t.mid AND w.src_tid = t.tid WHERE rank != 1""")
add_ranks.registerTempTable("add_ranks")
add_ranks.show(5)

+--------------------+--------------------+-------+--------+
|                 mid|                 src|src_tid|src_rank|
+--------------------+--------------------+-------+--------+
|110662d0-787d-49f...|account.175b7548e...|     26|       4|
|130ffdf8-f954-436...|account.175b7548e...|     10|       4|
|82021728-9cd8-4e6...|account.175b7548e...|     19|       9|
|0031e4e0-b475-46d...|account.175b7548e...|     20|      15|
|7ce8d183-c8e9-42f...|account.175b7548e...|      3|      23|
+--------------------+--------------------+-------+--------+
only showing top 5 rows



In [13]:
rank_table = spark.sql("""SELECT mid, tid, rank, 
                          LAG(tid) OVER (ORDER BY mid, rank) AS lag_tid, 
                          LAG(rank) OVER (ORDER BY mid, rank) AS lag_rank, 
                          LEAD(tid) OVER (ORDER BY mid, rank) AS lead_tid, 
                          LEAD(rank) OVER (ORDER BY mid, rank) AS lead_rank 
                          FROM team_ranks""")
rank_table.registerTempTable("rank_table")

rank_info = spark.sql("""SELECT a.mid, src, src_tid, src_rank, lag_tid, lag_rank, lead_tid, lead_rank 
                         FROM add_ranks a JOIN rank_table t ON a.mid = t.mid AND a.src_tid = t.tid""")
rank_info.registerTempTable("rank_info")
rank_info.show(5)

+--------------------+--------------------+-------+--------+-------+--------+--------+---------+
|                 mid|                 src|src_tid|src_rank|lag_tid|lag_rank|lead_tid|lead_rank|
+--------------------+--------------------+-------+--------+-------+--------+--------+---------+
|130ffdf8-f954-436...|account.175b7548e...|     10|       4|     20|       3|      25|        6|
|82021728-9cd8-4e6...|account.175b7548e...|     19|       9|      8|       8|      27|       10|
|7ce8d183-c8e9-42f...|account.175b7548e...|      3|      23|     25|      22|      13|       23|
|0031e4e0-b475-46d...|account.175b7548e...|     20|      15|     21|      14|      11|       16|
|110662d0-787d-49f...|account.175b7548e...|     26|       4|     15|       3|      32|        5|
+--------------------+--------------------+-------+--------+-------+--------+--------+---------+
only showing top 5 rows



The cell below uses an example match to explain how we detect potential cheaters who need self-loops among multiple winners.

In [14]:
example_winners = spark.sql("""SELECT * FROM winners_with_flags 
                               WHERE mid = '013caebc-8504-4d71-be02-a082ddccda9a'""")
example_winners.show()

example_potential_cheater_ranks = spark.sql("""SELECT * FROM add_ranks 
                                               WHERE mid = '013caebc-8504-4d71-be02-a082ddccda9a'""")
example_potential_cheater_ranks.show()

example_rank_info = spark.sql("""SELECT * FROM rank_info 
                                 WHERE mid = '013caebc-8504-4d71-be02-a082ddccda9a'""")
example_rank_info.show()

+--------------------+--------------------+-------+----------+--------------------+
|                 mid|                 src|src_tid|    m_date|is_potential_cheater|
+--------------------+--------------------+-------+----------+--------------------+
|013caebc-8504-4d7...|account.c9a9eaa2a...|     50|2019-03-07|                   0|
|013caebc-8504-4d7...|account.3e5396b91...|      6|2019-03-07|                   0|
|013caebc-8504-4d7...|account.577f76fe0...|     36|2019-03-07|                   1|
|013caebc-8504-4d7...|account.0e2dd932a...|      9|2019-03-07|                   0|
+--------------------+--------------------+-------+----------+--------------------+

+--------------------+--------------------+-------+--------+
|                 mid|                 src|src_tid|src_rank|
+--------------------+--------------------+-------+--------+
|013caebc-8504-4d7...|account.577f76fe0...|     36|      13|
+--------------------+--------------------+-------+--------+

+--------------------

In [16]:
# Add the time when the last team member died for each match.
add_lag_time = spark.sql("""SELECT mid, src, src_tid, src_rank, 
                            lag_tid, lag_rank, lag_time, lead_tid, lead_rank 
                            FROM (SELECT l.*, time AS lag_time, 
                            ROW_NUMBER() OVER (PARTITION BY l.mid, l.src, l.src_tid ORDER BY time DESC) AS row_number 
                            FROM rank_info l JOIN add_tids a 
                            ON l.lag_tid = a.dst_tid AND l.mid = a.mid) 
                            WHERE row_number IN (1)""")
add_lag_time.registerTempTable("add_lag_time")

add_time = spark.sql("""SELECT mid, src, src_tid, src_rank, lag_tid, lag_rank, lag_time, 
                        lead_tid, lead_rank, lead_time 
                        FROM (SELECT l.*, time AS lead_time, 
                        ROW_NUMBER() OVER (PARTITION BY l.mid, l.src, l.src_tid ORDER BY time DESC) AS row_number 
                        FROM add_lag_time l JOIN add_tids a ON l.lead_tid = a.dst_tid AND l.mid = a.mid) 
                        WHERE row_number IN (1)""")
add_time.registerTempTable("add_time")

add_time_diffs = spark.sql("""SELECT *, (UNIX_TIMESTAMP(lag_time) - UNIX_TIMESTAMP(lead_time)) AS time_diff 
                              FROM add_time""")
add_time_diffs.registerTempTable("add_time_diffs")

add_new_time = spark.sql("""SELECT *, 
                            CASE WHEN lag_rank = 1 AND time_diff < 0
                            THEN TO_TIMESTAMP(FROM_UNIXTIME(UNIX_TIMESTAMP(lead_time) + 1))
                            WHEN lead_rank = 1 THEN lag_time
                            WHEN lead_rank != 1 AND lag_rank != 1 AND time_diff < 0
                            THEN TO_TIMESTAMP(FROM_UNIXTIME(UNIX_TIMESTAMP(lead_time) + 1))
                            ELSE TO_TIMESTAMP(FROM_UNIXTIME(UNIX_TIMESTAMP(lead_time) + FLOOR(0 + (RAND() * time_diff)))) END 
                            AS new_time
                            FROM add_time_diffs""")
add_new_time.registerTempTable("add_new_time")

In [19]:
player_info = spark.sql("""SELECT DISTINCT mid, src AS id, src_sd AS sd, src_bd AS bd, 
                           src_curr_flag AS curr_flag, src_flag AS flag, m_date 
                           FROM sampled_obs 
                           UNION 
                           SELECT DISTINCT mid, dst, dst_sd, dst_bd, 
                           dst_curr_flag, dst_flag, m_date 
                           FROM sampled_obs""")
player_info.registerTempTable("player_info")

self_loops = spark.sql("""SELECT a.mid, src, sd, bd, curr_flag, flag, 
                          src, sd, bd, curr_flag, flag, new_time AS time, m_date 
                          FROM add_new_time a JOIN player_info p ON a.mid = p.mid AND a.src = p.id""")
self_loops.registerTempTable("self_loops")

In [21]:
# The following example is a self-loop of the potential cheater in the example match shown above.
example_self_loop = spark.sql("SELECT * FROM self_loops WHERE mid = '013caebc-8504-4d71-be02-a082ddccda9a'")
example_self_loop.show()

+--------------------+--------------------+----------+----------+---------+----+--------------------+----------+----------+---------+----+-------------------+----------+
|                 mid|                 src|        sd|        bd|curr_flag|flag|                 src|        sd|        bd|curr_flag|flag|               time|    m_date|
+--------------------+--------------------+----------+----------+---------+----+--------------------+----------+----------+---------+----+-------------------+----------+
|013caebc-8504-4d7...|account.577f76fe0...|2019-03-08|2019-03-09|        0|   1|account.577f76fe0...|2019-03-08|2019-03-09|        0|   1|2019-03-07 11:48:22|2019-03-07|
+--------------------+--------------------+----------+----------+---------+----+--------------------+----------+----------+---------+----+-------------------+----------+



The next task is to create additional self-loops for the matches where multiple winners have different team IDs including 'NA'. In this case, we don't have any hints on how to estimate the exact time of death (because we don't have the ranks of single players). 

In [24]:
potential_cheaters_without_team_ids = spark.sql("""SELECT * FROM winners_with_flags 
                                                   WHERE is_potential_cheater = 1 AND src_tid = 'NA'""")
potential_cheaters_without_team_ids.registerTempTable("potential_cheaters_without_team_ids")
potential_cheaters_without_team_ids.show(5)
print(potential_cheaters_without_team_ids.count())

+--------------------+--------------------+-------+----------+--------------------+
|                 mid|                 src|src_tid|    m_date|is_potential_cheater|
+--------------------+--------------------+-------+----------+--------------------+
|f2f76e66-9fb7-40d...|account.57d64f776...|     NA|2019-03-27|                   1|
|6283fdb3-c24d-413...|account.f24c22165...|     NA|2019-03-05|                   1|
|99a629b2-f4e3-42e...|account.cdd20db96...|     NA|2019-03-03|                   1|
|bbe25e99-755d-4ca...|account.cdd20db96...|     NA|2019-03-04|                   1|
|12bcdfe5-34a4-473...|account.81f027093...|     NA|2019-03-06|                   1|
+--------------------+--------------------+-------+----------+--------------------+
only showing top 5 rows

17


Given the time difference between the last killing done by a player and the end time of the match, we can create a new random time of death for each player who needs a self-loop. If the value of 'time_diff' is zero, this means that the killer is the winner who is ranked first because the end time of the match corresponds with the time of the last killing done by the killer. Thus, we can ignore that case as there is no need to create a self-loop.

In [27]:
# Get the last killing of each match.
last_kills = spark.sql("""SELECT * 
                          FROM (SELECT o.*, ROW_NUMBER() OVER (PARTITION BY mid ORDER BY time DESC) AS row_number 
                          FROM sampled_obs AS o) WHERE row_number = 1""")
last_kills.registerTempTable("last_kills")

# Find the last killing of each winner whose team ID is 'NA'.
single_winner_kills = spark.sql("""SELECT * 
                                   FROM (SELECT o.*, 
                                   ROW_NUMBER() OVER (PARTITION BY w.mid, w.src ORDER BY time DESC) AS row_number 
                                   FROM sampled_obs o JOIN potential_cheaters_without_team_ids w 
                                   ON o.mid = w.mid AND o.src = w.src) 
                                   WHERE row_number = 1""")
single_winner_kills.registerTempTable("single_winner_kills")

cal_time_diff = spark.sql("""SELECT n.mid, n.src, n.time,  
                             (UNIX_TIMESTAMP(l.time) - UNIX_TIMESTAMP(n.time)) AS time_diff
                             FROM single_winner_kills n JOIN last_kills l ON n.mid = l.mid""")
cal_time_diff.registerTempTable("cal_time_diff")
cal_time_diff.show()

+--------------------+--------------------+--------------------+---------+
|                 mid|                 src|                time|time_diff|
+--------------------+--------------------+--------------------+---------+
|f2f76e66-9fb7-40d...|account.57d64f776...|2019-03-27 08:41:...|     1676|
|0e85fbcc-0d91-4f0...|account.9ccecb41a...|2019-03-06 10:15:...|        0|
|b2c7e5a4-f0f0-48d...|account.ac666be40...|2019-03-03 20:26:...|      374|
|bac58a82-62ca-485...|account.d8fc8cfc9...|2019-03-03 17:42:...|       44|
|f54ab324-6b31-474...|account.44b0bd971...|2019-03-04 15:46:...|        0|
|483a0e46-2d62-444...|account.4d2951657...|2019-03-13 08:19:...|        0|
|bbe25e99-755d-4ca...|account.cdd20db96...|2019-03-04 12:33:...|     1497|
|cb84a1ce-cd19-427...|account.88cca8d42...|2019-03-04 21:40:...|      871|
|6283fdb3-c24d-413...|account.f24c22165...|2019-03-05 15:50:...|     1208|
|b62ae865-af8e-4e3...|account.9c45a718a...|2019-03-03 21:27:...|        0|
|99a629b2-f4e3-42e...|acc

In [28]:
add_rand_time = spark.sql("""SELECT c.*,  
                             CASE WHEN time_diff = 0 THEN NULL
                             ELSE TO_TIMESTAMP(FROM_UNIXTIME(UNIX_TIMESTAMP(time) + FLOOR(0 + (RAND() * time_diff)))) END 
                             AS new_time
                             FROM cal_time_diff AS c""")
add_rand_time.registerTempTable("add_rand_time")
add_rand_time.show()

add_rand_time = spark.sql("SELECT * FROM add_rand_time WHERE new_time IS NOT NULL")
add_rand_time.registerTempTable("add_rand_time")
add_rand_time.show()

rand_self_loops = spark.sql("""SELECT a.mid, src, sd, bd, curr_flag, flag, 
                               src, sd, bd, curr_flag, flag, new_time AS time, m_date 
                               FROM add_rand_time a JOIN player_info p 
                               ON a.mid = p.mid AND a.src = p.id""")
rand_self_loops.registerTempTable("rand_self_loops")

+--------------------+--------------------+--------------------+---------+-------------------+
|                 mid|                 src|                time|time_diff|           new_time|
+--------------------+--------------------+--------------------+---------+-------------------+
|f2f76e66-9fb7-40d...|account.57d64f776...|2019-03-27 08:41:...|     1676|2019-03-27 08:51:56|
|0e85fbcc-0d91-4f0...|account.9ccecb41a...|2019-03-06 10:15:...|        0|               null|
|b2c7e5a4-f0f0-48d...|account.ac666be40...|2019-03-03 20:26:...|      374|2019-03-03 20:27:24|
|bac58a82-62ca-485...|account.d8fc8cfc9...|2019-03-03 17:42:...|       44|2019-03-03 17:42:44|
|f54ab324-6b31-474...|account.44b0bd971...|2019-03-04 15:46:...|        0|               null|
|483a0e46-2d62-444...|account.4d2951657...|2019-03-13 08:19:...|        0|               null|
|bbe25e99-755d-4ca...|account.cdd20db96...|2019-03-04 12:33:...|     1497|2019-03-04 12:35:28|
|cb84a1ce-cd19-427...|account.88cca8d42...|2019-03

In [29]:
# Combine two sets of self-loops.
full_self_loops = spark.sql("SELECT * FROM self_loops UNION SELECT * FROM rand_self_loops")
full_self_loops.registerTempTable("full_self_loops")
print(full_self_loops.count())

obs_data = spark.read.parquet("s3://social-research-cheating/edges/obs_data.parquet")
obs_data.registerTempTable("obs_data")

new_obs_data = spark.sql("""SELECT * FROM obs_data UNION SELECT * FROM full_self_loops 
                            ORDER BY mid, time""")
new_obs_data.write.parquet("s3://social-research-cheating/edges/rev_obs_data.parquet")

715
