# CHEATER STATISTICS

In [1]:
import pandas as pd
import numpy as np
import scipy.stats
import analyze_cheaters

## 1 DESCRIPTIVE STATISTICS

### 1.1 NUMBER OF CHEATERS

The dataset contains 6,161 cheaters.

In [2]:
# Load the cheater counts and print the sum of the 'num_of_cheaters' column.
num_of_cheaters = pd.read_csv("data/cheater_stats/num_of_cheaters.csv")
cheater_total = num_of_cheaters['num_of_cheaters'].sum()
print(cheater_total)

6161


## 2 CHEATER PERFORMANCE ANALYSIS

To decide on a baseline for cheating detection, we compare the performance of cheaters and non-cheaters who played the game between March 1 and March 3. We hypothesize that cheaters, aided by cheating tools, perform better than non-cheaters. We first assume that the 651 cheaters who were banned during this period were cheating throughout it, and then compare cheaters and non-cheaters using two performance measures.<br> To see the relevant figures, refer to 'paper-general-stats-visualization.ipynb'.

The number of cheaters who were banned between March 1 and March 3 is 651.

In [3]:
# Load the match data and the player table and expose them to Spark SQL.
# createOrReplaceTempView is used instead of registerTempTable, which has been
# deprecated since Spark 2.0 and behaves identically.
td = spark.read.parquet("s3://social-research-cheating/cheater-analysis/data_for_cheater_analysis.parquet")
td.createOrReplaceTempView("td")

players = spark.read.parquet("s3://social-research-cheating/players.parquet")
players.createOrReplaceTempView("players")

# Cheaters banned between March 1 and March 3. The cheater flag and both date
# bounds are stated explicitly so the result does not depend on how ban_date is
# encoded for players who were never banned.
cheaters = spark.sql("""SELECT * FROM players
                        WHERE cheating_flag = 1
                          AND ban_date BETWEEN '2019-03-01' AND '2019-03-03'""")
cheaters.createOrReplaceTempView("cheaters")
print(cheaters.count())

651


### 2.1 THE AVERAGE KILL RATIOS OF CHEATERS

In [4]:
# Get the kills and deaths of cheaters.
# Rows of td whose src matches a cheater id are that cheater's kills;
# rows whose dst matches a cheater id are that cheater's deaths.
# createOrReplaceTempView replaces the deprecated registerTempTable.
kills = spark.sql("SELECT mid, src, time, m_date FROM td t JOIN cheaters c ON t.src = c.id")
kills.createOrReplaceTempView("kills")

deaths = spark.sql("SELECT mid, dst, time, m_date FROM td t JOIN cheaters c ON t.dst = c.id")
deaths.createOrReplaceTempView("deaths")

In [5]:
# Average kill ratios for the cheater group, computed by the project helper.
cheater_kill_ratio = analyze_cheaters.get_avg_kill_ratio(kills, deaths)

cheater_ratio_values = cheater_kill_ratio['avg_kill_ratio']
print("Mean:", round(cheater_ratio_values.mean(), 2))
print("Median:", round(cheater_ratio_values.median(), 2))

Mean: 0.77
Median: 0.82


### 2.2 THE AVERAGE TIME DIFFERENCE BETWEEN KILLS OF CHEATERS

Note that a player must have killed at least two other players to be evaluated by this measure.

In [6]:
# Average time difference between kills for the cheater group, computed by the project helper.
cheater_kill_interval = analyze_cheaters.get_avg_time_diff_between_kills(kills)

cheater_delta_values = cheater_kill_interval['delta']
print("Mean:", round(cheater_delta_values.mean(), 2))
print("Median:", round(cheater_delta_values.median(), 2))

Mean: 139.67
Median: 123.93


### 2.3 THE AVERAGE KILL RATIOS OF NON-CHEATERS

In [7]:
# Get the kills and deaths of non-cheaters (players with cheating_flag = 0).
# NOTE: this rebinds the `kills` / `deaths` variables and replaces the temp views
# of the same names that previously held the cheaters' records.
# createOrReplaceTempView replaces the deprecated registerTempTable.
kills = spark.sql("""SELECT mid, src, time, m_date FROM td t JOIN players p ON t.src = p.id
                     WHERE cheating_flag = 0""")
kills.createOrReplaceTempView("kills")

deaths = spark.sql("""SELECT mid, dst, time, m_date FROM td t JOIN players p ON t.dst = p.id
                      WHERE cheating_flag = 0""")
deaths.createOrReplaceTempView("deaths")

In [8]:
# Average kill ratios for the non-cheater group, computed by the project helper.
non_cheater_kill_ratio = analyze_cheaters.get_avg_kill_ratio(kills, deaths)

non_cheater_ratio_values = non_cheater_kill_ratio['avg_kill_ratio']
print("Mean:", round(non_cheater_ratio_values.mean(), 2))
print("Median:", round(non_cheater_ratio_values.median(), 2))

Mean: 0.4
Median: 0.44


### 2.4 THE AVERAGE TIME DIFFERENCE BETWEEN KILLS OF NON-CHEATERS

In [9]:
# Average time difference between kills for the non-cheater group, computed by the project helper.
non_cheater_kill_interval = analyze_cheaters.get_avg_time_diff_between_kills(kills)

non_cheater_delta_values = non_cheater_kill_interval['delta']
print("Mean:", round(non_cheater_delta_values.mean(), 2))
print("Median:", round(non_cheater_delta_values.median(), 2))

Mean: 194.11
Median: 172.63


### 2.5 COMPARING THE TWO GROUPS

In [10]:
# Two-sample t-tests with equal_var=False (Welch's test): cheaters vs. non-cheaters
# on each of the two performance measures.
kill_ratio_test = scipy.stats.ttest_ind(
    cheater_kill_ratio['avg_kill_ratio'],
    non_cheater_kill_ratio['avg_kill_ratio'],
    equal_var=False,
)
print(kill_ratio_test)

kill_interval_test = scipy.stats.ttest_ind(
    cheater_kill_interval['delta'],
    non_cheater_kill_interval['delta'],
    equal_var=False,
)
print(kill_interval_test)

Ttest_indResult(statistic=48.64290198138943, pvalue=5.2129985377298465e-219)
Ttest_indResult(statistic=-18.23534154575062, pvalue=5.033833786062794e-60)


## 3 ESTIMATION OF THE TIME OF CHEATING ADOPTION

The game company only provides the date when each cheater was banned. To compensate for this missing information, we use the two performance measures above to estimate when cheaters (who were banned between March 1 and March 31) started cheating.

In [3]:
# Replace the `td` view with the data set used for the estimation and select all
# flagged cheaters.
# NOTE: the query below reads the `players` temp view, which must already have
# been registered (it is created in Section 2 of this notebook).
# createOrReplaceTempView replaces the deprecated registerTempTable.
td = spark.read.parquet("s3://social-research-cheating/data_for_estim.parquet")
td.createOrReplaceTempView("td")

cheaters = spark.sql("SELECT * FROM players WHERE cheating_flag = 1")
cheaters.createOrReplaceTempView("cheaters")

### 3.1 THE AVERAGE KILL RATIO PER DAY

In [4]:
# Kills made by cheaters, then counted per cheater and day.
# createOrReplaceTempView replaces the deprecated registerTempTable.
kills = spark.sql("SELECT mid, src, time, m_date FROM td t JOIN cheaters c ON t.src = c.id")
kills.createOrReplaceTempView("kills")

kills_per_day = spark.sql("""SELECT src AS id, m_date, COUNT(*) AS num_of_kills
                             FROM kills GROUP BY src, m_date""")
kills_per_day_df = kills_per_day.toPandas()

# Deaths of cheaters, then counted per cheater and day.
deaths = spark.sql("SELECT mid, dst, time, m_date FROM td t JOIN cheaters c ON t.dst = c.id")
deaths.createOrReplaceTempView("deaths")

deaths_per_day = spark.sql("""SELECT dst AS id, m_date, COUNT(*) AS num_of_deaths
                              FROM deaths GROUP BY dst, m_date""")
deaths_per_day_df = deaths_per_day.toPandas()

# A single outer merge yields one row per (id, m_date) that appears in either
# table; a side with no record for that day is NaN and is treated as a zero count.
# Only the count columns are filled so the key columns are never touched.
merged_table = pd.merge(kills_per_day_df, deaths_per_day_df, how='outer', on=['id', 'm_date'])
count_columns = ['num_of_kills', 'num_of_deaths']
merged_table[count_columns] = merged_table[count_columns].fillna(0)

# Every row has at least one kill or one death, so the denominator is never zero.
merged_table['kill_ratio'] = merged_table['num_of_kills'] / (merged_table['num_of_kills'] + merged_table['num_of_deaths'])
avg_kill_ratio_per_day = merged_table[['id', 'm_date', 'kill_ratio']].sort_values(by=['id', 'm_date']).reset_index(drop=True)

### 3.2 THE AVERAGE TIME DIFFERENCE BETWEEN KILLS PER DAY

In [5]:
kills_df = kills.toPandas()

# Parse the timestamps BEFORE sorting so that rows are ordered chronologically.
# Sorting the raw column first would order string timestamps lexicographically,
# which can differ from time order when their formats are not uniform.
kills_df['time'] = pd.to_datetime(kills_df['time'])
kills_df = kills_df.sort_values(['src', 'mid', 'time'])

# Time since the same player's previous kill within the same match, in seconds.
# The first kill of each (match, player) pair has no predecessor and gets NaN.
kills_df['delta'] = kills_df.groupby(['mid', 'src'])['time'].diff()
kills_df['delta'] = kills_df['delta'] / np.timedelta64(1, 's')

# Mean interval per player and day; NaN deltas are skipped by mean().
time_diffs = kills_df[['src', 'm_date', 'delta']]
avg_kill_interval_per_day = time_diffs.groupby(['src', 'm_date'], as_index=False).mean()

### 3.3 ESTIMATION OF THE TIME OF CHEATING ADOPTION BASED ON PERFORMANCE

The number of cheaters who have complete performance information is 2,980.

In [7]:
# Thresholds for a "cheating-like" day. They appear to be chosen close to the
# cheater means reported in Section 2 (kill ratio 0.77, kill interval 139.67 s).
KILL_RATIO_THRESHOLD = 0.8
KILL_INTERVAL_THRESHOLD = 140

# Align the key column name with avg_kill_ratio_per_day. rename() is a no-op if
# the column has already been renamed, so the cell can be re-run safely.
avg_kill_interval_per_day = avg_kill_interval_per_day.rename(columns={'src': 'id'})
merged_table = pd.merge(avg_kill_ratio_per_day, avg_kill_interval_per_day, how='left', on=['id', 'm_date'])

merged_table['kill_ratio'] = merged_table['kill_ratio'].round(2)
merged_table['delta'] = merged_table['delta'].round(2)
merged_table['flag'] = 0

# Change the value of a flag into one if the record (row) meets the following conditions.
# Rows with a missing delta never satisfy the comparison and keep flag 0.
meets_thresholds = (
    (merged_table['kill_ratio'] >= KILL_RATIO_THRESHOLD)
    & (merged_table['delta'] <= KILL_INTERVAL_THRESHOLD)
)
merged_table.loc[meets_thresholds, 'flag'] = 1
flagged_rows = merged_table[merged_table['flag'] == 1]

# Earliest flagged day per cheater. The explicit sort makes the "first" row
# independent of the row order inherited from earlier cells.
first_flagged_days = (
    flagged_rows
    .sort_values(['id', 'm_date'])
    .groupby('id')
    .first()
    .reset_index()
)
start_dates = spark.createDataFrame(first_flagged_days)
start_dates.createOrReplaceTempView("start_dates")

# LEFT JOIN keeps cheaters without any flagged day; their start_date is NULL.
estimation = spark.sql("""SELECT c.id, s.m_date AS start_date, ban_date
                          FROM cheaters c LEFT JOIN start_dates s ON c.id = s.id""")

estimation_df = estimation.toPandas()
estimation_df['ban_date'] = pd.to_datetime(estimation_df['ban_date'])
estimation_df['start_date'] = pd.to_datetime(estimation_df['start_date'])
# Number of days from the estimated start through the ban date, inclusive.
# .dt.days is used because .astype('timedelta64[D]') is rejected by pandas >= 2.0.
estimation_df['period'] = (estimation_df['ban_date'] - estimation_df['start_date']).dt.days + 1

# Copy the filtered rows so the dtype change below does not write into a view
# of estimation_df (avoids SettingWithCopyWarning).
complete_rows = estimation_df[estimation_df['period'].notnull()].copy()
complete_rows['period'] = complete_rows['period'].astype('int')

print(len(complete_rows))
complete_rows.to_csv('estimated_cheating_time.csv', index=False)

2980


For those who have complete information, the average duration of cheating before the ban is about four days.

In [8]:
# Mean estimated cheating period (in days) among cheaters with complete information.
mean_period = complete_rows['period'].mean()
print("Mean:", round(mean_period, 2))

Mean: 4.36


In [9]:
# For those who have at least one piece of missing information, assume that they cheated for two days.
estimation_df['period'] = estimation_df['period'].fillna(2)
estimation_df['start_date'] = estimation_df['start_date'].fillna(
    estimation_df['ban_date'] - pd.to_timedelta(estimation_df['period'] - 1, unit='d')
)

# Start dates earlier than March 1 are moved to March 1. The comparison is done
# on timestamps directly instead of round-tripping the column through strings.
earliest_start = pd.Timestamp('2019-03-01')
estimation_df.loc[estimation_df['start_date'] < earliest_start, 'start_date'] = earliest_start

# Recompute the period (inclusive of the ban date) after imputation and clamping.
# .dt.days is used because .astype('timedelta64[D]') is rejected by pandas >= 2.0.
estimation_df['period'] = (estimation_df['ban_date'] - estimation_df['start_date']).dt.days + 1

4,321 cheaters have an estimated cheating period of two days (the ban date and the day before it).

In [10]:
# Count the cheaters whose estimated cheating period is exactly two days.
two_day_rows = estimation_df[estimation_df['period'] == 2]
print(len(two_day_rows))

4321
