# DESCRIPTIVE STATISTICS

In [1]:
import pandas as pd

## 1 NUMBER OF MATCHES / KILLINGS

The dataset consists of 1,146,941 matches involving 98,319,451 killings.

In [2]:
# Total number of matches: sum the per-date match counts stored in the CSV.
matches_by_date = pd.read_csv("data/general_stats/mids_by_date.csv")
total_matches = matches_by_date['num_of_mids'].sum()
print(total_matches)

1146941


In [18]:
# Load the raw kill records and expose them to Spark SQL under the name "td".
# The row count is reported in the text above as the number of killings.
# createOrReplaceTempView is the supported replacement for registerTempTable,
# which has been deprecated since Spark 2.0.
td = spark.read.parquet("s3://social-research-cheating/raw_td.parquet")
td.createOrReplaceTempView("td")
print(td.count())

98319451


## 2 NUMBER OF UNIQUE PLAYERS

The dataset contains 1,975,877 unique players.

In [3]:
# Collect every id that appears as a killer (src) or as a victim (dst) in td.
# createOrReplaceTempView is used instead of the deprecated registerTempTable.
killers = spark.sql("SELECT src FROM td")
killers.createOrReplaceTempView("killers")

victims = spark.sql("SELECT dst FROM td")
victims.createOrReplaceTempView("victims")

# UNION (unlike UNION ALL) drops duplicate rows, so each player id is counted once
# even if the player appears many times and in both roles.
unique_players = spark.sql("SELECT src AS id FROM killers UNION SELECT * FROM victims")
print(unique_players.count())

1975877


## 3 NUMBER OF DAYS PLAYERS PLAYED THE GAME

In [2]:
# Summarise the 'days' column of the CSV: mean (rounded to two decimals) and median.
days_df = pd.read_csv("data/general_stats/num_of_days.csv")
days_played = days_df['days']
mean_days = round(days_played.mean(), 2)
median_days = days_played.median()
print("Mean: " + str(mean_days))
print("Median: " + str(median_days))

Mean: 5.77
Median: 3.0


## 4 NUMBER OF MATCHES WITH CHEATERS

The number of matches with at least one cheater is 107,139, of which 93,555 (10,265 + 19,195 + 64,095) contain exactly one cheater.

In [4]:
# Load the data of matches with cheaters and make it queryable as "cheater_matches".
# createOrReplaceTempView replaces registerTempTable, deprecated since Spark 2.0.
cheater_matches = spark.read.parquet("s3://social-research-cheating/general-stats/num_of_cheaters_per_match.parquet")
cheater_matches.createOrReplaceTempView("cheater_matches")

In [6]:
# Solo matches (selected here with mod = 'NA'): distribution of matches by cheater count.
#   c_cnt      -> number of cheaters
#   count(mid) -> number of matches with that many cheaters
solo_cheater_count_query = """SELECT c_cnt, COUNT(mid) FROM cheater_matches WHERE mod = 'NA' 
                                    GROUP BY c_cnt ORDER BY c_cnt"""
num_of_cheaters_solo = spark.sql(solo_cheater_count_query)
num_of_cheaters_solo.show()

+-----+----------+
|c_cnt|count(mid)|
+-----+----------+
|    1|     10265|
|    2|       951|
|    3|        77|
|    4|         3|
+-----+----------+



In [7]:
# Duo matches (mod is 'duo', 'normal-duo' or 'normal-duo-fpp'):
# number of matches for each cheater count, ordered by cheater count.
duo_cheater_count_query = """SELECT c_cnt, COUNT(mid) FROM cheater_matches 
                                   WHERE mod = 'duo' OR mod = 'normal-duo' OR mod = 'normal-duo-fpp' 
                                   GROUP BY c_cnt ORDER BY c_cnt"""
num_of_cheaters_duo = spark.sql(duo_cheater_count_query)
num_of_cheaters_duo.show()

+-----+----------+
|c_cnt|count(mid)|
+-----+----------+
|    1|     19195|
|    2|      1730|
|    3|       135|
|    4|        11|
+-----+----------+



In [8]:
# Squad matches (mod is 'squad', 'normal-squad' or 'normal-squad-fpp'):
# number of matches for each cheater count, ordered by cheater count.
squad_cheater_count_query = """SELECT c_cnt, COUNT(mid) FROM cheater_matches 
                                     WHERE mod = 'squad' OR mod = 'normal-squad' OR mod = 'normal-squad-fpp' 
                                     GROUP BY c_cnt ORDER BY c_cnt"""
num_of_cheaters_squad = spark.sql(squad_cheater_count_query)
num_of_cheaters_squad.show()

+-----+----------+
|c_cnt|count(mid)|
+-----+----------+
|    1|     64095|
|    2|      9079|
|    3|      1355|
|    4|       212|
|    5|        27|
|    6|         4|
+-----+----------+



## 5 NUMBER OF PLAYERS WHO PLAYED IN AT LEAST ONE GAME WITH A CHEATER

1,185,279 players played, while not cheating themselves at the time, in at least one match with a cheater.

In [9]:
# Load the kill records of matches with cheaters and the player table, and expose
# both to Spark SQL. createOrReplaceTempView is used throughout in place of
# registerTempTable, which has been deprecated since Spark 2.0.
matches_with_cheaters = spark.read.parquet("s3://social-research-cheating/general-stats/matches_with_cheaters.parquet")
matches_with_cheaters.createOrReplaceTempView("matches_with_cheaters")

nodes = spark.read.parquet("s3://social-research-cheating/nodes.parquet")
nodes.createOrReplaceTempView("nodes")

# Flag the killer: src_curr_flag is 1 when the match date lies within
# [start_date, ban_date] (both ends inclusive), otherwise 0.
# Because this is a LEFT JOIN, a killer with no row in nodes gets NULL dates,
# the comparison is not true, and the flag falls through to 0.
# NOTE(review): start_date / ban_date are unqualified; presumably they come from nodes - confirm.
add_killer_flags = spark.sql("""SELECT mid, src, 
                                CASE WHEN m_date <= ban_date AND m_date >= start_date THEN 1 ELSE 0 END 
                                AS src_curr_flag, dst, time, m_date, mod 
                                FROM matches_with_cheaters m LEFT JOIN nodes n ON m.src = n.id""")
add_killer_flags.createOrReplaceTempView("add_killer_flags")

# Same flag for the victim (dst), computed by joining nodes a second time on dst.
temp = spark.sql("""SELECT mid, src, src_curr_flag, dst, 
                    CASE WHEN m_date <= ban_date AND m_date >= start_date THEN 1 ELSE 0 END 
                    AS dst_curr_flag, time, m_date, mod 
                    FROM add_killer_flags a LEFT JOIN nodes n ON a.dst = n.id""")
temp.createOrReplaceTempView("temp")

In [10]:
# Stack killers and victims into one (mid, id, is_cheating_now) table.
# UNION removes duplicate rows, so a player appears at most once per match and flag value.
# createOrReplaceTempView replaces registerTempTable, deprecated since Spark 2.0.
participants = spark.sql("""SELECT mid, src AS id, src_curr_flag AS is_cheating_now FROM temp 
                            UNION SELECT mid, dst, dst_curr_flag FROM temp""")
participants.createOrReplaceTempView("participants")

# Distinct players who took part in at least one of these matches while not flagged as cheating.
unique_non_cheaters = spark.sql("SELECT DISTINCT id FROM participants WHERE is_cheating_now = 0")
print(unique_non_cheaters.count())

1185279


## 6 NUMBER OF PLAYERS WITH AT LEAST ONE VICTIMIZATION EXPERIENCE

286,914 players were killed by a cheater at least once.

In [11]:
# Annotate every kill record in td with the killer's attributes from nodes:
# start_date, ban_date, cheating_flag, and src_curr_flag, which is 1 when the match
# date lies within [start_date, ban_date] (inclusive) and 0 otherwise. The LEFT JOIN
# keeps killers that have no row in nodes; their flag falls through to 0.
# This re-registers the "add_killer_flags" view, replacing any earlier view of that name.
# createOrReplaceTempView is used in place of the deprecated registerTempTable.
add_killer_flags = spark.sql("""SELECT mid, src, start_date AS src_sd, ban_date AS src_bd, 
                                cheating_flag AS src_flag, 
                                CASE WHEN m_date <= ban_date AND m_date >= start_date THEN 1 ELSE 0 END 
                                AS src_curr_flag, dst, time, m_date 
                                FROM td t LEFT JOIN nodes n ON t.src = n.id""")
add_killer_flags.createOrReplaceTempView("add_killer_flags")

# Add the same set of attributes for the victim (dst) by joining nodes again on dst.
edges = spark.sql("""SELECT mid, src, src_sd, src_bd, src_flag, src_curr_flag,
                     dst, start_date AS dst_sd, ban_date AS dst_bd, cheating_flag AS dst_flag, 
                     CASE WHEN m_date <= ban_date AND m_date >= start_date THEN 1 ELSE 0 END 
                     AS dst_curr_flag, time, m_date 
                     FROM add_killer_flags a LEFT JOIN nodes n ON a.dst = n.id""")
edges.createOrReplaceTempView("edges")

In [12]:
# Count the distinct players killed by someone flagged as cheating at match time,
# ignoring self-killings (src = dst).
# NOTE: this rebinds the Python name `victims`; it does not register a temp table.
victim_count_query = "SELECT COUNT(DISTINCT dst) FROM edges WHERE src != dst AND src_curr_flag = 1"
victims = spark.sql(victim_count_query)
victims.show()

+-------------------+
|count(DISTINCT dst)|
+-------------------+
|             286914|
+-------------------+



## 7 NUMBER OF KILLINGS / DEATHS - CHEATERS VS. NON-CHEATERS

Excluding self-killings, the number of killings done by cheaters is 453,071 (1,388 + 451,683) and cheaters were killed 98,349 (1,388 + 96,961) times.

In [13]:
# Exclude self-killings (records where the killer and the victim are the same player).
# createOrReplaceTempView replaces registerTempTable, deprecated since Spark 2.0.
cleaned_edges = spark.sql("SELECT * FROM edges WHERE src != dst")
cleaned_edges.createOrReplaceTempView("cleaned_edges")
print(cleaned_edges.count())

97841185


In [14]:
# Get the number of cases where both killer and victim were cheaters.
# createOrReplaceTempView replaces the deprecated registerTempTable; registering
# "temp" replaces any view previously registered under that name.
temp = spark.sql("""SELECT mid, src, src_curr_flag, dst, dst_curr_flag, time 
                    FROM cleaned_edges WHERE src_curr_flag = 1 AND dst_curr_flag = 1""")
temp.createOrReplaceTempView("temp")
print(temp.count())

1388


In [15]:
# Get the number of cases where both killer and victim were non-cheaters.
# createOrReplaceTempView replaces the deprecated registerTempTable; registering
# "temp" replaces any view previously registered under that name.
temp = spark.sql("""SELECT mid, src, src_curr_flag, dst, dst_curr_flag, time 
                    FROM cleaned_edges WHERE src_curr_flag = 0 AND dst_curr_flag = 0""")
temp.createOrReplaceTempView("temp")
print(temp.count())

97291153


In [16]:
# Get the number of cases where the killer was a cheater and the victim was a non-cheater.
# createOrReplaceTempView replaces the deprecated registerTempTable; registering
# "temp" replaces any view previously registered under that name.
temp = spark.sql("""SELECT mid, src, src_curr_flag, dst, dst_curr_flag, time 
                    FROM cleaned_edges WHERE src_curr_flag = 1 AND dst_curr_flag = 0""")
temp.createOrReplaceTempView("temp")
print(temp.count())

451683


In [17]:
# Get the number of cases where the killer was a non-cheater and the victim was a cheater.
# createOrReplaceTempView replaces the deprecated registerTempTable; registering
# "temp" replaces any view previously registered under that name.
temp = spark.sql("""SELECT mid, src, src_curr_flag, dst, dst_curr_flag, time 
                    FROM cleaned_edges WHERE src_curr_flag = 0 AND dst_curr_flag = 1""")
temp.createOrReplaceTempView("temp")
print(temp.count())

96961
