In [3]:
import numpy as np
import pandas as pd

# Read the three CSV exports of the Dota 2 matches dataset in one pass; the
# names on the left line up one-to-one with the paths below.
data_players, data_matches, data_teamfights = map(
    pd.read_csv,
    (
        'datasets/dota-2-matches/players.csv',
        'datasets/dota-2-matches/match.csv',
        'datasets/dota-2-matches/teamfights.csv',
    ),
)

**teamfights_players:** Additional information provided for each player in each teamfight. Use `player_slot` to link these rows back to `players.csv`.

**players:** Individual players are identified by `account_id`, but players can choose to play anonymously, and roughly one third of the `account_id`s are unavailable. Anonymous users have an `account_id` of `0`. The file contains totals for kills, deaths, denies, etc. Player action counts are also available, in columns whose names begin with `unit_order_`. Counts of the reasons for acquiring or losing gold and for gaining experience use the prefixes `gold_` and `xp_`, respectively.

In [8]:
# Group the player rows by match_id and collect every hero_id of a match into
# one list; the lists are split by team further down in this cell.
import time

start_time = time.time()

dire_data = (
    data_players
    .groupby('match_id')['hero_id']
    .apply(list)
)

dire_data.head()

def divide_by_team(df, new_df, team_size=5):
    """Split each per-match list into a Radiant part and a remainder.

    The first ``team_size`` entries of every list stored in ``df`` are moved
    into a ``Radiant_team`` row. They are removed from the stored list in
    place, so after the call ``df`` holds only the remaining entries (the
    callers in this notebook use that remainder as the Dire team).

    Args:
        df (Series): One Python list per row (e.g. the result of
            ``groupby(...).apply(list)``). Mutated in place.
        new_df (DataFrame): Frame the new rows are appended to; pass an empty
            ``pd.DataFrame()`` to start from scratch.
        team_size (int): Number of leading entries that form the Radiant
            team. Lists shorter than this contribute whatever they contain.

    Returns:
        DataFrame: ``new_df`` followed by one ``Radiant_team`` row per entry
        of ``df``, with the index renumbered from 0.
    """
    radiant_rows = []
    for position in range(df.shape[0]):
        # Positional access: the Series is indexed by the group key, which is
        # not guaranteed to be 0..n-1, so a label lookup could miss.
        members = df.iloc[position]
        radiant_rows.append(members[:team_size])
        # Slice deletion edits the stored list itself, leaving the remainder.
        del members[:team_size]

    # Build the frame once instead of concatenating row by row (quadratic).
    radiant = pd.DataFrame({"Radiant_team": radiant_rows})
    if new_df.shape == (0, 0):
        return radiant
    return pd.concat([new_df, radiant], ignore_index=True)


# Move the leading heroes of every match into radiant_data; dire_data keeps
# the remaining ones. The original author recorded 86.03843569755554 seconds
# for this call.
radiant_data = divide_by_team(dire_data, pd.DataFrame())

# Elapsed wall-clock time since start_time; the original author noted
# 136.3190200328827 sec "for split method".
print("--- %s seconds ---" % (time.time() - start_time))

match_id
0    [86, 51, 83, 11, 67, 106, 102, 46, 7, 73]
1      [7, 82, 71, 39, 21, 73, 22, 5, 67, 106]
2      [51, 109, 9, 41, 27, 38, 7, 10, 12, 85]
3     [50, 44, 32, 26, 39, 78, 19, 31, 40, 47]
4    [8, 39, 55, 87, 69, 101, 100, 22, 67, 21]
Name: hero_id, dtype: object

In [None]:
# Attach the per-match hero lists of both teams to data_matches, then drop the
# match columns that are not needed further.
# Select the single column explicitly rather than assigning a whole DataFrame.
data_matches['Radiant_team'] = radiant_data['Radiant_team']
data_matches['Dire_team'] = dire_data
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
data_matches = data_matches.drop(
    columns=[
        'start_time',
        'duration',
        'game_mode',
        'positive_votes',
        'negative_votes',
        'cluster',
    ],
    errors='ignore',
)
data_matches.columns

In [None]:
#Adding the mean_values of features which would be grouped by match_id
def adding_mean_values_of_diff_features_by_team(df, groupby, features, new_df, team_size=5):
    """Add per-team mean columns for each feature to ``new_df``.

    Within every group of ``df`` (one group per value of ``groupby``), the
    first ``team_size`` rows are treated as the Radiant team and the remaining
    rows as the Dire team. For each feature two columns are written:
    ``<feature>_radiant`` and ``<feature>_dire``.

    Args:
        df (DataFrame): Row-per-player data containing ``groupby`` and every
            name in ``features``.
        groupby (str | list): Grouping key(s) passed to ``DataFrame.groupby``.
        features (list[str]): Columns to average.
        new_df (DataFrame): Frame that receives the new columns. It is
            modified in place and also returned.
        team_size (int): Number of leading rows per group that belong to the
            Radiant team.

    Returns:
        DataFrame: ``new_df`` with two extra columns per feature.

    Notes:
        Means are computed with pandas' ``mean``, which skips NaN values.
        When ``groupby`` is a single column that also exists in ``new_df``,
        values are matched through that column; otherwise they are aligned on
        ``new_df``'s index.
    """
    # Rank of each row inside its group, in original row order.
    rank_in_group = df.groupby(groupby).cumcount()
    is_radiant = (rank_in_group < team_size).to_numpy()
    team_rows = (
        ('_radiant', df[is_radiant]),
        ('_dire', df[~is_radiant]),
    )

    # Join on the key column when possible so both teams are aligned the same
    # way, regardless of how new_df happens to be indexed.
    key_is_column = isinstance(groupby, str) and groupby in new_df.columns

    for feature in features:
        for suffix, rows in team_rows:
            team_mean = rows.groupby(groupby)[feature].mean()
            if key_is_column:
                new_df[feature + suffix] = new_df[groupby].map(team_mean)
            else:
                new_df[feature + suffix] = team_mean
    return new_df

In [None]:
# Player-level columns to average per team and attach to data_matches.
features = [
    'gold_spent',
    'gold_per_min',
    'xp_per_min',
    'kills',
    'deaths',
    'tower_damage',
    'hero_damage',
]
data_matches = adding_mean_values_of_diff_features_by_team(
    df=data_players,
    groupby='match_id',
    features=features,
    new_df=data_matches,
)
print(data_matches.head())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
def corr_heatmap(df, digits=3, cmap='coolwarm'):
    """
    Creates a correlation heatmap to easily visualize multicollinearity
    that might be present in the dataframe.

    Only numeric and boolean columns take part in the correlation; other
    columns (for example columns holding lists or strings) are left out.

    Args:
        df (DataFrame) : DataFrame with features to check multicollinearity on.
        digits (int) : Number of decimal places to display
        cmap (str) : Colormap to display correlation range.

    Returns:
        fig : Matplotlib Figure
        ax : Matplotlib Axis
    """
    # Restrict to columns a correlation can be computed for; recent pandas
    # versions raise on non-numeric columns instead of silently skipping them.
    numeric = df.select_dtypes(include=['number', 'bool'])
    correl = numeric.corr().round(digits)

    # Hide the upper triangle (diagonal included): it mirrors the lower one.
    mask = np.zeros(correl.shape, dtype=bool)
    mask[np.triu_indices_from(mask)] = True

    # Size the figure by the columns actually shown in the matrix.
    side = len(correl.columns)
    fig, ax = plt.subplots(figsize=(side, side))
    sns.heatmap(
        correl,
        annot=True,
        ax=ax,
        cmap=cmap,
        vmin=-1,
        vmax=1,
        mask=mask,
    )
    return fig, ax
# Correlation heatmap over the assembled match table.
corr_heatmap(data_matches)

# Bar chart of how many rows fall under each radiant_win value, drawn on a
# dedicated 8x6 figure.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(
    data=data_matches,
    x='radiant_win',
    hue='radiant_win',
    palette='cool_r',
    ax=ax,
)

# Tidy the layout (the original note here reads "Save as image").
plt.tight_layout()