### Load packages and read tables.

In [2]:
from functools import reduce
from pyspark.sql.functions import col, lit, when
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pyspark.sql.types import StructType, StructField, LongType

# Read telemetry data and keep only the rows whose m_date is 2019-03-01 or 2019-03-02.
# createOrReplaceTempView is used throughout because registerTempTable has been
# deprecated since Spark 2.0; both register the DataFrame under the given name.
raw_td = spark.read.parquet("/tmp/td_day_3_6.parquet")
raw_td.createOrReplaceTempView("raw_td")
td = spark.sql("SELECT * FROM raw_td WHERE m_date >= '2019-03-01' AND m_date <= '2019-03-02'")
td.createOrReplaceTempView("td")

# Read the information of cheaters.
cheaters = spark.read.parquet("/tmp/cheater_info.parquet")
cheaters.createOrReplaceTempView("cheaters")

# Add the information of cheaters to the node table.
# Players without a row in `cheaters` get cheating_flag = 0 and 'NA' for both dates.
# NOTE(review): `td_nodes` is not created in the visible part of this notebook;
# presumably it already exists in the catalog -- confirm before a fresh run.
nodes = spark.sql("""
    SELECT t.id, t.pname,
           CASE WHEN c.start_date IS NULL THEN 0 ELSE 1 END AS cheating_flag,
           CASE WHEN c.start_date IS NULL THEN 'NA' ELSE c.start_date END AS start_date,
           CASE WHEN c.ban_date IS NULL THEN 'NA' ELSE c.ban_date END AS ban_date
    FROM td_nodes t LEFT JOIN cheaters c ON t.id = c.id
""")
nodes.createOrReplaceTempView("nodes")

### Create a table that contains observers who observed killings done by cheaters.

In [4]:
# First, add information of killers (src) by joining each telemetry row with `nodes`.
src_info = spark.sql("""
    SELECT mid, src, start_date AS src_sd, ban_date AS src_bd, cheating_flag AS src_flag,
           dst, time, m_date
    FROM td t JOIN nodes n ON t.src = n.id
""")
src_info.createOrReplaceTempView("src_info")

# Add information of victims (dst).
# No ORDER BY on these intermediate views: they are only consumed by further
# queries, so a global sort here adds cost without changing any result.
full_info = spark.sql("""
    SELECT mid, src, src_sd, src_bd, src_flag, dst, start_date AS dst_sd, ban_date AS dst_bd,
           cheating_flag AS dst_flag, time, m_date
    FROM src_info s JOIN nodes n ON s.dst = n.id
""")
full_info.createOrReplaceTempView("full_info")

# Recompute the cheating flags per row: a player counts as cheating only when the
# row's m_date lies between that player's start_date and ban_date (inclusive)
# and the player is a known cheater.
add_cheating_flag = spark.sql("""
    SELECT mid, src,
           CASE WHEN src_bd >= m_date AND src_sd <= m_date AND src_flag = 1 THEN 1 ELSE 0 END AS src_flag,
           dst,
           CASE WHEN dst_bd >= m_date AND dst_sd <= m_date AND dst_flag = 1 THEN 1 ELSE 0 END AS dst_flag,
           time, m_date
    FROM full_info
""")
add_cheating_flag.createOrReplaceTempView("add_cheating_flag")

# Number the kills of every match by time; `aid` is the 1-based position of a
# kill within its match.
# NOTE(review): kills of one match that share the same `time` get an arbitrary
# relative order -- add a tie-breaker if that matters.
# The unfiltered table gets its own name so that `records` is registered only
# once, with its final (filtered) content.
all_records = spark.sql("SELECT *, ROW_NUMBER() OVER (PARTITION BY mid ORDER BY time) AS aid FROM add_cheating_flag")
all_records.createOrReplaceTempView("all_records")

# Kills whose killer was flagged as cheating (src_flag = 1).
exclude_src_tab = spark.sql("SELECT mid, src, aid FROM all_records WHERE src_flag = 1")
exclude_src_tab.createOrReplaceTempView("exclude_src_tab")

# The (match, kill number) pairs of those kills.
legit_records = spark.sql("SELECT mid, aid FROM exclude_src_tab")
legit_records.createOrReplaceTempView("legit_records")

# Get a list of matches that have at least one killer who did cheat.
legit_matches = spark.sql("SELECT DISTINCT mid FROM legit_records")
legit_matches.createOrReplaceTempView("legit_matches")

# Keep only the kills that belong to those matches.
records = spark.sql("""
    SELECT r.mid, src, src_flag, dst, dst_flag, time, m_date, aid
    FROM all_records r JOIN legit_matches l ON r.mid = l.mid
""")
records.createOrReplaceTempView("records")

# Get a list of killers for each match.
killers = spark.sql("SELECT DISTINCT mid, src, src_flag, m_date FROM records")
killers.createOrReplaceTempView("killers")

# Pair every killer of a match with every flagged kill (aid) of the same match.
temp = spark.sql("SELECT k.mid, src, src_flag, aid, m_date FROM killers k JOIN legit_records l ON k.mid = l.mid")
temp.createOrReplaceTempView("temp")

# Get a table of killers who observed killings done by cheaters.
# The previous version LEFT JOINed `temp` with `exclude_src_tab` twice
# (t.aid > e.aid and t.aid < e.aid), selected only columns of `temp`, and
# UNIONed the two results. A LEFT JOIN keeps every left row and UNION removes
# duplicates, so that query returned exactly the distinct rows of `temp`; the
# joins only multiplied rows before they were collapsed again.
# NOTE(review): if the joins were meant to filter (e.g. drop a cheater's own
# kills), that filter never took effect -- confirm the intended rule.
observer_src_tab = spark.sql("SELECT DISTINCT mid, src AS id, src_flag AS flag, aid, m_date FROM temp")

# Get a table of players in the column of victims who observed killings done by
# cheaters when they were alive: a victim killed at position r.aid is paired with
# every flagged kill of the SAME match that has a smaller kill number.
# The r.mid = l.mid condition is required; without it each victim is matched
# against flagged kills from every match, which yields wrong observers and an
# enormous join.
observer_dst_tab = spark.sql("""
    SELECT r.mid, dst AS id, dst_flag AS flag, l.aid AS flagged_aid, m_date
    FROM records r JOIN legit_records l ON r.mid = l.mid AND r.aid > l.aid
""")
observer_dst_tab.createOrReplaceTempView("observer_dst_tab")

# Get a list of observers by combining the two tables above.
# DataFrame.union matches columns by position and keeps duplicates, so
# `flagged_aid` ends up in the `aid` column of the combined table.
observers = observer_src_tab.union(observer_dst_tab)
observers.createOrReplaceTempView("observers")
display(observers)

mid,id,flag,aid,m_date
05c04ee8-6c76-4b0d-99ed-ae44275b6612,account.497a6f427e3746fcb991a52fb624f6fa,0,85,2019-03-02
05c04ee8-6c76-4b0d-99ed-ae44275b6612,account.e23da74339b5493996e8115d7ce7bfe1,0,88,2019-03-02
03e81837-d61c-4a1d-b422-ab8fe8a31afe,account.9a7d2fb469f44dcb944d51d6483c5f46,0,25,2019-03-01
293c9414-77dd-4496-ab1a-c70589a5ff2d,account.a61810b6ea294f858642c8113b1012c7,0,47,2019-03-02
44c4e28c-25c5-4c77-bc39-9686d8a35b97,account.c825f8ea96d7444c8056d41a3435c91c,0,37,2019-03-01
44c4e28c-25c5-4c77-bc39-9686d8a35b97,account.e10f58e954f847bfb09c18aac622a404,0,58,2019-03-01
4b2d5cd7-c874-4184-871f-edcfc5645d2e,account.47a88a77cb3d4198b7d5e6daa5b851da,0,63,2019-03-01
4b2d5cd7-c874-4184-871f-edcfc5645d2e,account.d9edfc613e1e46068405666895884222,0,89,2019-03-01
4b2d5cd7-c874-4184-871f-edcfc5645d2e,account.2b3588d97f144dba92d79741926ce61b,0,87,2019-03-01
4b2d5cd7-c874-4184-871f-edcfc5645d2e,account.1d26ea1bbf714185978f9637215d944d,0,94,2019-03-01


### Plot the distribution of observations made by cheaters and non-cheaters.
* This part needs to be optimised or requires more resources.

In [6]:
# Count the observations per player, ignoring rows where the observer was flagged.
# This definition was previously commented out, but the join below reads the
# `observations` view, so the cell could not run on a fresh kernel without it.
observations = spark.sql("SELECT id, COUNT(*) AS obs FROM observers WHERE flag != 1 GROUP BY id")
observations.createOrReplaceTempView("observations")

# Players listed in `cheaters` with a ban_date on or before 2019-03-02.
cheater_lst = spark.sql("SELECT id AS cheater FROM cheaters WHERE ban_date <= '2019-03-02'")
cheater_lst.createOrReplaceTempView("cheater_lst")

# Add the cheater information.
add_cheater_info = spark.sql("""
    SELECT o.id, obs, CASE WHEN cheater IS NULL THEN 0 ELSE 1 END AS cheater
    FROM observations o LEFT JOIN cheater_lst c ON o.id = c.cheater
""")

# The plotting code below works on a pandas DataFrame, so collect the (already
# aggregated, one row per player) result to the driver.
add_cheater_info_df = add_cheater_info.toPandas()

cheater_obs_df = add_cheater_info_df[add_cheater_info_df['cheater'] == 1]
non_cheater_obs_df = add_cheater_info_df[add_cheater_info_df['cheater'] == 0]

# Plot the histograms of cheaters and non-cheaters together on one axes.
fig, ax = plt.subplots()
groups = (
    (non_cheater_obs_df, 'blue', 'Non-cheaters'),
    (cheater_obs_df, 'red', 'Cheaters'),
)
for group_df, colour, label in groups:
    if group_df.empty:
        # Nothing to draw, and 1 / len(group_df) would divide by zero.
        continue
    # Equal weights of 1/N turn the bar heights into proportions of the group.
    proportions = np.full(len(group_df), 1.0 / len(group_df))
    group_df.hist(column='obs', histtype='step', edgecolor=colour, bins=40,
                  weights=proportions, label=label, ax=ax)

# DataFrame.hist titles the axes with the column name; clear it.
ax.set_title('')
ax.set_xlabel('Number of observations')
ax.set_ylabel('Proportion')
fig.tight_layout()
ax.legend(loc='upper right', frameon=False)
# plt.show() returns None, so hand the figure itself to display().
display(fig)