07-general-stats.ipynb
======================

1. Number of days players played the game
2. Number of matches played by date
3. Number of cheaters by ban date
4. Number of victimisation experiences (the number of times players were killed by a cheating player)
5. Number of cheaters per match (considering only the matches where at least one cheater played)

In [1]:
from pyspark.sql import functions as F
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Register each input dataset from the S3 bucket as a temporary view so that
# the cells below can query it with Spark SQL:
#   td       - telemetry data
#   players  - player data
#   cheaters - cheater data
# NOTE(review): later cells query a view named `td_nodes`, which is not
# registered here — confirm it already exists in the Spark session.
for _view_name, _parquet_path in (
    ("td", "s3://social-research-cheating/raw_td.parquet"),
    ("players", "s3://social-research-cheating/players.parquet"),
    ("cheaters", "s3://social-research-cheating/cheater_info.parquet"),
):
    spark.read.parquet(_parquet_path).createOrReplaceTempView(_view_name)

## 1. Count the number of days players played the game.

In [None]:
# Count, for every player, the number of distinct dates on which the player
# appears in the telemetry, either as a killer (src) or as a victim (dst).
#
# UNION (without ALL) removes duplicate (id, m_date) pairs, so the
# de-duplication happens inside Spark. This avoids collecting two per-date
# tables to the driver, merging them in pandas and sending the result back.
temp = spark.sql("""SELECT src AS id, m_date FROM td
                    UNION
                    SELECT dst AS id, m_date FROM td""")
temp.createOrReplaceTempView("temp")

# One row per (player, date) pair, so COUNT(*) is the number of active days.
days = spark.sql("SELECT id, COUNT(*) AS days FROM temp GROUP BY id")
days_df = days.toPandas()

# Calculate the mean and median number of days.
print(days_df['days'].mean())
print(days_df['days'].median())

In [None]:
# Plot the share of players for each number of days played.

# Bin edges sit at -0.5, 0.5, 1.5, ... so that every integer day count falls
# in the middle of its own bin.
max_days = days_df['days'].max()
bins = np.arange(0, max_days + 1.5) - 0.5

# Give every player the same weight 1/N so that the histogram shows
# proportions instead of raw counts.
n_players = len(days_df['days'])
proportion_weights = np.full(n_players, 1. / n_players)

fig = days_df.hist(column='days', bins=bins, weights=proportion_weights,
                   histtype='step')

# Restrict the x-axis to the range 0.5 to 31.
plt.xlim(0.5, 31)
plt.xlabel("Number of days players played the game")
plt.ylabel("Proportion")
plt.title("")
plt.tight_layout()
image = plt.show()

## 2. Count the number of matches by date.

In [None]:
# Read the pre-computed number of matches per date from the S3 bucket and
# convert it into a pandas dataframe.
mids_by_date = spark.read.parquet("s3://social-research-cheating/general-stats/mids_by_date.parquet")
mids_by_date_df = mids_by_date.toPandas()

# Plot a line chart of the number of matches against the date.
plt.plot(mids_by_date_df['m_date'], mids_by_date_df['num_of_mids'],
         color='blue', linestyle='solid')
plt.xlim(xmin=1)
plt.xlim(xmax=31)
# The lower y-limit must be passed as `bottom`; `xmin` is an x-axis keyword
# and is rejected by ylim().
plt.ylim(bottom=10000)
plt.xlabel("Date")
plt.ylabel("Number of matches")
plt.tight_layout()
# NOTE(review): plt.show() normally returns None, so display(image) may have
# nothing to render — confirm for this notebook environment.
image = plt.show()
display(image)

## 3. Count the number of cheaters by ban date.

In [None]:
# Read the pre-computed number of cheaters per date from the S3 bucket and
# convert it into a pandas dataframe.
num_of_cheaters = spark.read.parquet("s3://social-research-cheating/general-stats/num_of_cheaters.parquet")
num_of_cheaters_df = num_of_cheaters.toPandas()

# The y-values have to come from the cheater table itself, not from the
# per-date match counts in `mids_by_date_df`. The name of the count column is
# not fixed in this notebook, so use the single column other than 'm_date'.
count_columns = [col for col in num_of_cheaters_df.columns if col != 'm_date']
if len(count_columns) != 1:
    raise ValueError(
        "Expected exactly one count column besides 'm_date', found: "
        + repr(count_columns)
    )
cheater_counts = num_of_cheaters_df[count_columns[0]]

# Plot a line chart of the number of banned cheaters against the date.
plt.plot(num_of_cheaters_df['m_date'], cheater_counts,
         color='blue', linestyle='solid')
plt.xlim(xmin=1)
plt.xlim(xmax=31)
plt.xlabel("Date")
plt.ylabel("Number of banned cheaters")
plt.tight_layout()
image = plt.show()
display(image)

## 4. Count the number of victimisation experiences.

In [3]:
# Add the cheating flag of the killer to each kill record.
# The inner join keeps only kills whose killer is listed in `cheaters`;
# src_flag is 1 when the match date is on or after the killer's start date
# (and the start date is not 'NA'), otherwise 0.
# NOTE(review): m_date and start_date are compared as-is; this presumably
# relies on both being stored in a comparable date format — confirm.
add_cheater_info = spark.sql("""SELECT mid, src, 
                                CASE WHEN m_date >= start_date AND start_date != 'NA' THEN 1 ELSE 0 END AS src_flag, 
                                dst FROM td t JOIN cheaters c ON t.src = c.id""")
add_cheater_info.createOrReplaceTempView("add_cheater_info")

# Count the number of flagged kills for each pair of killer and victim.
vic_tab = spark.sql("SELECT src, dst, COUNT(*) AS kills FROM add_cheater_info WHERE src_flag == 1 GROUP BY src, dst")
vic_tab.createOrReplaceTempView("vic_tab")

# Sum the kills over all killers to get the number of victimisation
# experiences of each victim.
vic_exp = spark.sql("SELECT dst, SUM(kills) AS total_vic_exp FROM vic_tab GROUP BY dst")
vic_exp.createOrReplaceTempView("vic_exp")
# vic_exp.show(20)

# Create a table that contains the number of victimisation experiences of each player in the dataset.
# NOTE(review): `td_nodes` is not registered in the cells visible in this
# notebook (only `td`, `players` and `cheaters` are) — confirm it exists in
# the Spark session.
pids = spark.sql("SELECT id FROM td_nodes")
pids.createOrReplaceTempView("pids")

# The left join keeps players that were never victimised; their missing
# total is replaced by 0.
total_vic_exp = spark.sql("""SELECT n.id, CASE WHEN total_vic_exp IS NULL THEN 0 ELSE total_vic_exp END AS total_vic_exp 
                             FROM pids n LEFT JOIN vic_exp v ON n.id = v.dst""")
total_vic_exp_df = total_vic_exp.toPandas()
print(total_vic_exp_df.head(20))

dst,total_vic_exp
account.c9e48986e0dd4f90b6ac3fd881c37b12,1
account.809aed385c55484097129d1f931d81db,1
account.5f60bf73915c4318ad0705cf03c0d59a,1
account.f7cd513cf4224eceacd75e4cb86f8dff,1
account.c238e9a1e76f4b07b4e7c2d8330a2a3f,1
account.a9ddb5f546d64faaad6db35d4b3642c7,1
account.110a883639ec4c01ba40f12ae28b2508,1
account.15e50e10517747be96e1bef4fc5d00d9,1
account.2355b555855049228467c6e5313ef8d6,1
account.0bbd2655be2e4a6e81dcb604e675733a,1


In [None]:
# Plot the distribution of victimisation experiences.

# Bin edges sit at -0.5, 0.5, 1.5, ... so that every integer count falls in
# the middle of its own bin.
bins = np.arange(0, total_vic_exp_df['total_vic_exp'].max() + 1.5) - 0.5

# Every player gets the weight 1/N so that the histogram shows proportions.
# The bin edges computed above are passed explicitly; without them the
# histogram would fall back to the default binning.
fig = total_vic_exp_df.hist(column = 'total_vic_exp', histtype='step', bins=bins,
                            weights=np.zeros_like(total_vic_exp_df['total_vic_exp'])+1./len(total_vic_exp_df['total_vic_exp']))
plt.xlabel("Number of victimisation experiences")
plt.ylabel("Proportion")
plt.title("")
image = plt.show()

# Print a list of players who have been harmed by cheating at least once.
print(total_vic_exp_df[total_vic_exp_df['total_vic_exp'] >= 1])

## 5. Count the number of cheaters per match.

In [4]:
# First, get a list of participants: every player that appears in a match
# either as a victim (dst) or as a killer (src). UNION removes duplicates.
participants = spark.sql("""SELECT mid, m_date, dst AS id FROM td GROUP BY mid, m_date, dst 
                            UNION SELECT mid, m_date, src FROM td GROUP BY mid, m_date, src 
                            ORDER BY mid""")
participants.createOrReplaceTempView("participants")

# Get a node table that contains the start date of cheating for each cheater.
# Players without a row in `cheaters` get cheating_flag 0 and 'NA' dates.
# NOTE(review): `td_nodes` is not registered in the cells visible in this
# notebook (only `td`, `players` and `cheaters` are) — confirm it exists in
# the Spark session.
nodes = spark.sql("""SELECT t.id, t.pname, CASE WHEN c.start_date IS NULL THEN 0 ELSE 1 END AS cheating_flag, 
                     CASE WHEN c.start_date IS NULL THEN 'NA' ELSE c.start_date END AS start_date, 
                     CASE WHEN c.ban_date IS NULL THEN 'NA' ELSE c.ban_date END AS ban_date 
                     FROM td_nodes t LEFT JOIN cheaters c ON t.id = c.id""")
nodes.createOrReplaceTempView("nodes")

# Add cheating flags of cheaters: flag is 1 when the participant is a cheater
# and the match date is on or after the cheating start date.
players_with_cheater_info = spark.sql("""SELECT mid, m_date, p.id, 
                                         CASE WHEN m_date >= start_date AND cheating_flag = 1 THEN 1 ELSE 0 END AS flag 
                                         FROM participants p JOIN nodes n ON p.id = n.id ORDER BY mid, flag""")
# Display the dataframe that was just created (the name has to match the
# variable assigned above).
display(players_with_cheater_info)
players_with_cheater_info.createOrReplaceTempView("players_with_cheater_info")

# Calculate the number of cheaters per match, keeping only the matches with
# at least one cheater.
num_of_cheaters = spark.sql("""SELECT mid, cheater_cnt 
                               FROM (SELECT mid, SUM(flag) AS cheater_cnt FROM players_with_cheater_info GROUP BY mid) 
                               WHERE cheater_cnt >= 1""")
num_of_cheaters.createOrReplaceTempView("num_of_cheaters")

print(num_of_cheaters.count())
num_of_cheaters.show(10)
num_of_cheaters_df = num_of_cheaters.toPandas()

In [None]:
# Plot the share of matches for each number of cheaters per match.

# Bin edges sit at -0.5, 0.5, 1.5, ... so that every integer count falls in
# the middle of its own bin.
max_cheaters = num_of_cheaters_df['cheater_cnt'].max()
bins = np.arange(0, max_cheaters + 1.5) - 0.5

# Give every match the same weight 1/N so that the histogram shows
# proportions instead of raw counts.
n_matches = len(num_of_cheaters_df['cheater_cnt'])
proportion_weights = np.full(n_matches, 1. / n_matches)

fig = num_of_cheaters_df.hist(column='cheater_cnt', bins=bins,
                              weights=proportion_weights, histtype='step')
plt.xlim(left=0)
plt.xlabel("Number of cheaters per match")
plt.ylabel("Proportion")
plt.title("")
image = plt.show()

# Count the number of matches in which exactly one cheater took part.
single_cheater_matches = num_of_cheaters_df['cheater_cnt'] == 1
print(int(single_cheater_matches.sum()))