In [2]:
import pandas as pd
import numpy as np
import re
import logging
from better_profanity import profanity

logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s:%(message)s')

logging.info("Loading data...")
matches = pd.read_csv('data/match.csv')
players = pd.read_csv('data/players.csv')
chat = pd.read_csv('data/chat.csv')
heroes = pd.read_csv('data/hero_names.csv')
item_ids = pd.read_csv('data/item_ids.csv')
purchase_log = pd.read_csv('data/purchase_log.csv')

logging.info("Preparing profanity check...")
profanity.load_censor_words()
bad_words = profanity.CENSOR_WORDSET
pattern = '|'.join(re.escape(str(word)) for word in bad_words)
regex = re.compile(pattern, re.IGNORECASE)

logging.info("Classifying messages...")
profane_mask = chat['key'].str.contains(regex, na=False)
chat['message_type'] = np.where(profane_mask, 'Нецензурное', 'Корректное')

logging.info("Saving top heroes...")
hero_counts = players['hero_id'].value_counts().reset_index()
hero_counts.columns = ['hero_id', 'count']
hero_counts = hero_counts.merge(heroes, on='hero_id', how='left')
top_heroes = hero_counts.head(10)
top_heroes.to_csv('data/precomputed_top_heroes.csv', index=False)

logging.info("Calculating chat activity...")
chat_activity = []
match_ids = chat['match_id'].unique()
for match_id in match_ids:
    match_chat = chat[chat['match_id'] == match_id]
    unique_players_in_chat = match_chat['unit'].unique()
    num_players_in_chat = len(unique_players_in_chat)
    total_players = 10
    percent_players_in_chat = (num_players_in_chat / total_players) * 100
    chat_activity.append({
        'match_id': match_id,
        'players_in_chat': num_players_in_chat,
        'percent_players_in_chat': percent_players_in_chat
    })
chat_activity_df = pd.DataFrame(chat_activity)
chat_activity_df.to_csv('data/precomputed_chat_activity.csv', index=False)

logging.info("Calculating bad message percentage...")
message_counts = chat['message_type'].value_counts()
total_messages = len(chat)
bad_messages = message_counts.get('Нецензурное', 0)
bad_message_percentage = (bad_messages / total_messages) * 100
pd.DataFrame([{"bad_message_percentage": bad_message_percentage}]).to_csv('data/precomputed_bad_msg_percentage.csv', index=False)

logging.info("Calculating matches stats...")
total_matches = players['match_id'].nunique()
matches_with_chat = chat['match_id'].nunique()
matches_without_chat = total_matches - matches_with_chat
matches_stats = {
    "total_matches": total_matches,
    "matches_with_chat": matches_with_chat,
    "matches_without_chat": matches_without_chat
}
pd.DataFrame([matches_stats]).to_csv('data/precomputed_matches_stats.csv', index=False)

logging.info("Calculating boots purchase times...")
boots_row = item_ids[item_ids['item_name'] == 'boots_of_speed']
boots_id = boots_row['item_id'].iloc[0] if not boots_row.empty else 29
boots_purchases = purchase_log[purchase_log['item_id'] == boots_id].merge(matches[['match_id','radiant_win']], on='match_id', how='left')

def is_winner(row):
    return (row['radiant_win'] and row['player_slot'] < 128) or ((not row['radiant_win']) and row['player_slot'] >= 128)

def is_loser(row):
    return not is_winner(row)

boots_purchases['is_winner_player'] = boots_purchases.apply(is_winner, axis=1)
boots_purchases['is_loser_player'] = boots_purchases.apply(is_loser, axis=1)

df_winner_boots = boots_purchases[boots_purchases['is_winner_player']].groupby('match_id')['time'].min().reset_index()
df_winner_boots['category'] = 'Winner'
df_loser_boots = boots_purchases[boots_purchases['is_loser_player']].groupby('match_id')['time'].min().reset_index()
df_loser_boots['category'] = 'Loser'
df_boots_compare = pd.concat([df_winner_boots, df_loser_boots], ignore_index=True).fillna(0)
df_boots_compare.to_csv('data/precomputed_boots_compare.csv', index=False)

logging.info("Calculating hero GPM data...")
top_5_heroes = players['hero_id'].value_counts().head(5).index.tolist()
players_merged = players.merge(matches[['match_id','radiant_win']], on='match_id', how='left')

def is_winner2(row):
    return (row['radiant_win'] and row['player_slot'] < 128) or ((not row['radiant_win']) and row['player_slot'] >= 128)

players_merged['win'] = players_merged.apply(is_winner2, axis=1)
top_data = players_merged[players_merged['hero_id'].isin(top_5_heroes)].copy()
top_data['gpm_bin'] = pd.cut(top_data['gold_per_min'], bins=5)
group = top_data.groupby(['hero_id','gpm_bin'])['win'].mean().reset_index()
group['gpm_bin_center'] = group['gpm_bin'].apply(lambda x: x.mid if pd.notnull(x) else np.nan)
group = group.replace({np.nan: None})
group.to_csv('data/precomputed_hero_gpm.csv', index=False)

logging.info("Preprocessing completed.")


2024-12-08 22:26:33,106 INFO:Loading data...
2024-12-08 22:26:37,884 INFO:Preparing profanity check...
2024-12-08 22:26:37,900 INFO:Classifying messages...
2024-12-08 22:28:18,305 INFO:Saving top heroes...
2024-12-08 22:28:18,326 INFO:Calculating chat activity...
2024-12-08 22:29:00,228 INFO:Calculating bad message percentage...
2024-12-08 22:29:00,299 INFO:Calculating matches stats...
2024-12-08 22:29:00,315 INFO:Calculating boots purchase times...
2024-12-08 22:29:05,478 INFO:Calculating hero GPM data...
  group = top_data.groupby(['hero_id','gpm_bin'])['win'].mean().reset_index()
2024-12-08 22:29:08,671 INFO:Preprocessing completed.
