In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # os.walk yields bare file names; join with the directory for a usable path.
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# WarThunder Performance Analysis

In [None]:
import pandas as pd
import numpy as np
import pprint
import re

In [None]:
# Load the raw records from the Kaggle input directory.
df_data = pd.read_csv("../input/warthunder-manual-killdeath-record/records.csv")
# Preview the first rows to check that the file parsed as expected.
df_data.head()

In [None]:
# Inspect the dtypes pandas inferred for the raw data, before any cleaning.
df_data.info()

In [None]:
# Baseline summary statistics of the raw data, before any cleaning.
df_data.describe()

# Data Cleaning

- Columns that the documentation says are numeric should also be identified by Pandas as numeric

In [None]:
# Remove every row that has a missing value in any column.
# Done in place so that `df_data` stays the same frame object for later cells.
df_data.dropna(axis=0, how="any", inplace=True)

In [None]:
def regex_search_alpha(string):
    """Find the first run of ASCII letters or underscores in ``string``.

    Args:
        string: Text to scan.

    Returns:
        The ``re.Match`` for the first run of one or more characters from
        ``[a-zA-Z_]``, or ``None`` when ``string`` contains no such character.
    """
    alpha_run = re.compile("[a-zA-Z_]{1,}")
    return alpha_run.search(string)

In [None]:
# Characters that must not appear in a numeric field: ASCII letters, "_", "-",
# "~" and "?". Written as a raw string because "\-" is not a valid Python
# string escape and triggers an invalid-escape-sequence warning in a plain
# string literal. The resulting regex is unchanged.
pat_invalid_numeric = r"[a-zA-Z_\-~?]"

# One boolean mask per column that is converted to float later on.
bmask_invalid_distance = df_data["Distance"].str.contains(pat_invalid_numeric)
bmask_invalid_espeed = df_data["EnemySpeed"].str.contains(pat_invalid_numeric)
bmask_invalid_speed = df_data["SelfSpeed"].str.contains(pat_invalid_numeric)

# A row is invalid when any of the three fields is. Combining the masks first
# lists each offending row once, even when several of its fields are bad.
bmask_invalid_any = bmask_invalid_distance | bmask_invalid_espeed | bmask_invalid_speed
lst_invalids = list(df_data.loc[bmask_invalid_any].index)

# Dropped in place so that `df_data` stays the same frame object for later cells.
df_data.drop(inplace=True, index=lst_invalids)

# Show which row labels were discarded.
print(lst_invalids)

In [None]:
# The invalid entries are gone, so Distance can now be parsed as 64-bit floats.
df_data["Distance"] = df_data["Distance"].astype("float64")

In [None]:
# Lower-case the event labels so differently-cased entries compare equal.
df_data["Event"] = df_data["Event"].str.lower()

In [None]:
# Re-check the dtypes after converting Distance and lower-casing Event.
df_data.info()

In [None]:
# Summary statistics now that Distance is numeric.
df_data.describe()

### Dealing with Apparently Erroneous Distances

The area in which ground vehicles are allowed to travel within all WarThunder maps never exceeds 3 kilometers. The recommended solution is to remove these rows even if the cause of the typographical error can be guessed, e.g. double-digit non-decimal numbers or those with special characters corresponding to number keys on a standard keyboard.

In [None]:
# Flag rows whose recorded distance is greater than 3 (the limit discussed above).
bmask_overdistance = df_data["Distance"].gt(3)
# Display the flagged rows for inspection before they are removed.
df_data[bmask_overdistance]

In [None]:
# Collect the labels of the over-distance rows, then remove them in place.
idx_overdistance = df_data.loc[bmask_overdistance].index
df_data.drop(index=idx_overdistance, inplace=True)

In [None]:
# Summary statistics after removing the over-distance rows.
df_data.describe()

### Enforcing Data-types Based on Dataset Documentation

In [None]:
# Both speed columns get the same conversion to float.
for speed_col in ("SelfSpeed", "EnemySpeed"):
    df_data[speed_col] = df_data[speed_col].astype(dtype=float)

In [None]:
# Summary statistics after converting the speed columns to float.
df_data.describe()

In [None]:
# Final dtype check after converting the speed columns.
df_data.info()

# Exploratory Data Analysis

In [None]:
# Rows whose event label contains "kb" (the "killed-by" events).
bmask_kb = df_data["Event"].str.contains(pat="kb", flags=re.IGNORECASE)
# Every label that contains "kb" also contains "k", so a bare "k" search would
# count killed-by rows as kills. Exclude them so the two masks are disjoint.
bmask_kill = df_data["Event"].str.contains(pat="k", flags=re.IGNORECASE) & ~bmask_kb

**Average distance at which a "killed-by" event occurs.**

In [None]:
# Select the rows and the column in one .loc call, then summarise the distances.
df_data.loc[bmask_kb, "Distance"].describe()

**Average distance at which a "kill" is made by the player.**

In [None]:
# Select the rows and the column in one .loc call, then summarise the distances.
df_data.loc[bmask_kill, "Distance"].describe()

In [None]:
# Rows where the player was using the Centurion Mk. 10.
# - regex=False: the "." in the name is a literal dot, not a regex wildcard.
# - No leading space in the pattern, matching the Centurion-vs-T-54 filter in
#   the following cell; values that do start with a space still match.
bmask_using_centurion = df_data["VehicleUsed"].str.contains(pat="Centurion Mk. 10", regex=False)

# Average the numeric columns per (event, enemy vehicle) pair.
# - numeric_only=True: without it, pandas >= 2.0 raises TypeError on the
#   remaining text columns instead of silently skipping them.
# - The deprecated `axis` argument is omitted; grouping rows is the default.
df_data_by_vehicle = (
    df_data.loc[bmask_using_centurion]
    .groupby(by=["Event", "EnemyVehicle"])
    .mean(numeric_only=True)
)
df_data_by_vehicle

In [None]:
# Rows where the player used the Centurion Mk. 10 against any T-54 variant.
# regex=False makes both names literal substrings, so the "." in "Mk." is not
# treated as a regex wildcard.
bmask_subject_vehicle = (
    df_data["VehicleUsed"].str.contains(pat="Centurion Mk. 10", regex=False)
    & df_data["EnemyVehicle"].str.contains(pat="T-54", regex=False)
)

# Average the numeric columns per (event, enemy vehicle) pair.
# - numeric_only=True: without it, pandas >= 2.0 raises TypeError on the
#   remaining text columns instead of silently skipping them.
# - The deprecated `axis` argument is omitted; grouping rows is the default.
df_data_by_vehicle = (
    df_data.loc[bmask_subject_vehicle]
    .groupby(by=["Event", "EnemyVehicle"])
    .mean(numeric_only=True)
)
df_data_by_vehicle

### Longest Kills Made

In [None]:
# Show every record whose distance is at least 2.
df_data[df_data["Distance"].ge(2)]

## Remarks on Exploratory Analysis
- More data is needed. The average kill distance for the T-44-100 is .76 kilometers while the average killed-by distance is 1.08. Further, the kill and killed-by distances of the T-44-100 are greater than for the T-54 series (1949 and 1947) despite the 1947 having better armor and the 1949 having better shells.

- There is a presumed reason for the T-54 (1949) being killed at a longer range (0.75) than the T-54 (1947) despite the 1949 series having better ammunition for long range. Since the 1949 series is generally better than the 1947, it is spawned first. Once killed, the enemy players have an approximate idea of where the player is and manage to get close more often thanks to knowledge gained while in the 1949. This is an unproven explanation and may require an additional column/parameter to be verified.

In [None]:
# Marker that the notebook ran through to the last cell.
print("Done")