# Data Understanding

The metrics that will be used to evaluate this stage are:

- **Diversity** of statistical methods and graphs (rich and justified set of methods);

- **Complexity** of statistical methods and graphs (complex and integrated 3+D methods with clear results);

- **Interpretation** of results of statistical methods and graphs;

- **Knowledge extraction** from results of statistical methods (interesting, new and non-trivial knowledge obtained).

In [225]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Load the raw CSV tables once, up front (paths are relative to the working
# directory the notebook is run from).
teams, teams_post, series_post = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/teams.csv",
        "data/teams_post.csv",
        "data/series_post.csv",
    )
)
coaches, awards_players, players_teams, players = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/coaches.csv",
        "data/awards_players.csv",
        "data/players_teams.csv",
        "data/players.csv",
    )
)

## 1 - Analysing teams.csv

### 1.0 - General Information

- There are two similar columns, `tmID` and `franchID`. They have the same value in most entries. We checked and, when they differ, it is because the name under which the team participated in the competition (`tmID`) is different from its current name (`franchID`).

- There are columns that have null values in all entries (`divID`).

- There are columns that have the same value in all entries (`lgID`, `seeded`, `tmORB`, `tmDRB`, `tmTRB`, `opptmORB`, `opptmDRB`,  `opptmTRB`).

- Only 8 teams make the playoffs each year, 4 from each conference.

In [None]:
# Report the columns of teams that carry no information: first those that are
# null in every row, then those with exactly one distinct value.
columns_null = teams.columns[teams.isnull().all()]
print(f"Columns where all the values are null:{list(columns_null)}")

columns_equal = teams.columns[teams.nunique() == 1]
print(f"Columns where all the values are the same:{list(columns_equal)}")

### 1.1 - Mean number of games won and lost, and playoff proportion, for each team
The teams that make it to the playoffs more often are also those with the highest average number of wins. The opposite is also true: the teams with the most losses and the fewest wins are also those that have made it to the playoffs the fewest times.

In [None]:
def teams_relation_won_lost_playoff_team(data):
    """Plot average wins/losses per franchise with its playoff proportion.

    For every ``franchID`` the mean of ``won`` and ``lost`` is drawn as a pair
    of bars, and the fraction of rows whose ``playoff`` value is ``'Y'`` is
    drawn as black markers on a secondary y-axis. Franchises are ordered by
    playoff proportion, highest first.

    Args:
        data: DataFrame with the columns ``franchID``, ``won``, ``lost`` and
            ``playoff`` (``'Y'`` marks a playoff appearance).

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Aggregate the frame that was passed in, not the module-level table, so
    # the function works on any subset the caller provides.
    summary = data.groupby('franchID').agg(
        won=('won', 'mean'),
        lost=('lost', 'mean'),
        playoff_proportion=('playoff', lambda x: (x == 'Y').sum() / len(x)),
    )
    summary = summary.reset_index()
    summary = summary.sort_values(by='playoff_proportion', ascending=False)

    # Long format (one row per franchise and result type) for grouped bars.
    # The statistic is a mean, so it is labelled "Average" rather than "Median".
    games_long = summary.melt(
        id_vars='franchID',
        value_vars=['won', 'lost'],
        var_name='Result',
        value_name='Average',
    )

    _, ax1 = plt.subplots(figsize=(14, 5))
    sns.barplot(x='franchID', y='Average', hue='Result', data=games_long, ax=ax1)
    ax1.set_xlabel("Team's ID")
    ax1.set_ylabel("Average Games Won/Lost")
    ax1.set_title("Average Games Won, Lost, and Playoff Proportion for Each Team")
    ax1.tick_params(axis='x', rotation=90)

    # Secondary axis: the proportion lives on a different scale than the bars.
    ax2 = ax1.twinx()
    sns.scatterplot(x='franchID', y='playoff_proportion', data=summary, ax=ax2, color='black', marker='o', s=100)
    ax2.set_ylabel("Proportion of Playoff Entries")

    ax1.legend(loc='upper left', title='Games')
    ax2.legend(['Playoff Proportion'], loc='upper right', title='Playoff Proportion')
    plt.show()


teams_relation_won_lost_playoff_team(teams.copy())

### 1.2 - Distribution of games won by team
The number of games won by each team does not seem to vary much from year to year.

In [None]:
def teams_relation_won_team(data):
    """Draw one box plot per franchise showing the spread of its games won.

    Args:
        data: DataFrame with the columns ``franchID`` and ``won``; every row
            contributes one observation to its franchise's box.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # The box plot only needs franchID and won, so it reads them directly
    # from the input; no per-franchise aggregate or reshaping is required.
    plt.figure(figsize=(6, 4))
    sns.boxplot(x='franchID', y='won', data=data)
    plt.title("Distribution of Games Won by Team")
    plt.xlabel("Team's ID")
    plt.ylabel("Games Won")
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show()


teams_relation_won_team(teams.copy())

### 1.3 - Average attendance by team with average ranking
Teams with a lower (better) ranking do not have higher attendance, nor do teams with a higher ranking have lower attendance. It is important to note that the higher the ranking number, the worse the team's results.

In [None]:
def teams_relation_attend_rank_team(data):
    """Show mean attendance per franchise as bars with mean rank overlaid.

    Args:
        data: DataFrame with the columns ``franchID``, ``attend`` and ``rank``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Mean attendance and mean rank per franchise, best-attended first.
    per_team = (
        data.groupby('franchID')[['attend', 'rank']]
        .mean()
        .reset_index()
        .sort_values(by='attend', ascending=False)
    )

    plt.figure(figsize=(8, 4))
    sns.barplot(data=per_team, x='franchID', y='attend', color='skyblue', edgecolor='black')
    plt.xlabel('Team')
    plt.ylabel('Average Attendance')
    plt.title('Average Attendance by Team with Average Ranking')
    plt.xticks(rotation=90)

    # Rank uses a different scale than attendance, hence the secondary y-axis.
    rank_axis = plt.gca().twinx()
    rank_axis.scatter(x=per_team['franchID'], y=per_team['rank'], color='black', label='Average Ranking', marker='o')
    rank_axis.set_ylim(0, 8)
    rank_axis.set_ylabel('Average Ranking', color='black')
    rank_axis.tick_params(axis='y', labelcolor='black')

    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.show()


teams_relation_attend_rank_team(teams.copy())

### 1.4 - Team’s ranking over the years
The ranking of each team varies little in most cases. The change is usually gradual, with few cases in which a team's ranking moves by more than 3 positions from one year to the next.

In [None]:
def teams_relation_year_rank_team(data):
    """Draw a small-multiples grid with each franchise's rank per year.

    One subplot is created per distinct ``franchID`` (six per row); unused
    grid cells are removed. The y-axis is reversed so that rank 1 is at the
    top of every panel.

    Args:
        data: DataFrame with the columns ``year``, ``franchID`` and ``rank``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    data = data[['year', 'franchID', 'rank']]
    data = data.sort_values(by=['franchID', 'year'])

    # Named `franchises` so the module-level `teams` table is not shadowed.
    franchises = data['franchID'].unique()
    num_teams = len(franchises)
    num_cols = 6
    num_rows = (num_teams + num_cols - 1) // num_cols  # ceiling division

    fig, axes = plt.subplots(num_rows, num_cols, figsize=(14, 2 * num_rows), squeeze=False)
    for idx, team in enumerate(franchises):
        data_team = data[data['franchID'] == team]
        ax = axes[idx // num_cols, idx % num_cols]
        sns.lineplot(x='year', y='rank', data=data_team, color='b', marker='o', linewidth=2, ax=ax)
        ax.set_title(f'{team}\'s')
        ax.set_xlabel('Year')
        ax.set_ylabel('Rank')
        ax.xaxis.set_major_locator(plt.MaxNLocator(integer=True))
        # Limits are given in descending order so rank 1 sits at the top.
        # Calling invert_yaxis() and then set_ylim(1, 8) would put the axis
        # back in ascending order and silently undo the inversion.
        # NOTE(review): the fixed 1-8 rank and 1-10 year ranges assume the
        # data stays inside them -- confirm against the dataset.
        ax.set_ylim(8, 1)
        ax.set_xlim(1, 10)

    # Remove the grid cells left over after the last franchise.
    for j in range(num_teams, num_rows * num_cols):
        fig.delaxes(axes[j // num_cols, j % num_cols])
    plt.tight_layout()
    plt.show()


teams_relation_year_rank_team(teams.copy())

### 1.5 - Team’s presence in playoffs over the years

The playoff presence of each team seems to vary little in most cases. The exceptions are WAS and NYL.

In [None]:
def teams_relation_year_playoff_team(data):
    """Draw a small-multiples grid with each franchise's playoff presence per year.

    The ``playoff`` column is encoded as 1 (``'Y'``) or 0 (``'N'``) and drawn
    as a line per franchise, six panels per row; unused grid cells are
    removed. The input frame is left untouched.

    Args:
        data: DataFrame with the columns ``year``, ``franchID`` and ``playoff``
            (values ``'Y'`` / ``'N'``; anything else maps to NaN).

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Work on a private copy so no helper column is added to the caller's frame.
    data = data[['year', 'franchID', 'playoff']].copy()
    data['playoff_numeric'] = data['playoff'].map({'Y': 1, 'N': 0})
    data = data[['year', 'franchID', 'playoff_numeric']]
    data = data.sort_values(by=['franchID', 'year'])

    # Named `franchises` so the module-level `teams` table is not shadowed.
    franchises = data['franchID'].unique()
    num_teams = len(franchises)
    num_cols = 6
    num_rows = (num_teams + num_cols - 1) // num_cols  # ceiling division

    fig, axes = plt.subplots(num_rows, num_cols, figsize=(14, 1.5 * num_rows), squeeze=False)
    for idx, team in enumerate(franchises):
        data_team = data[data['franchID'] == team]
        ax = axes[idx // num_cols, idx % num_cols]
        sns.lineplot(x='year', y='playoff_numeric', data=data_team, color='b', marker='o', linewidth=2, ax=ax)
        ax.set_title(f'{team}\'s')
        ax.set_xlabel('Year')
        ax.set_ylabel('Playoff')
        ax.xaxis.set_major_locator(plt.MaxNLocator(integer=True))
        # A little padding around 0/1 keeps the markers off the panel border.
        ax.set_ylim(-0.2, 1.2)
        ax.set_yticks([0, 1])
        # NOTE(review): the fixed 1-10 year range assumes the data stays
        # inside it -- confirm against the dataset.
        ax.set_xlim(1, 10)

    # Remove the grid cells left over after the last franchise.
    for j in range(num_teams, num_rows * num_cols):
        fig.delaxes(axes[j // num_cols, j % num_cols])
    plt.tight_layout()
    plt.show()


teams_relation_year_playoff_team(teams.copy())

### 1.6 - Relation between teams' playoff presence and number of awards
The number of player awards a team has doesn't seem to have much of a correlation with its playoff presence. For example, the New York Liberty have a lot of playoff appearances but few awards.

In [None]:
# NOTE: this cell rebinds the module-level tables (players, players_teams,
# coaches, teams_post, teams) with enriched versions, so it is not idempotent:
# running it twice merges the extra columns a second time.

# join players.csv with players_teams.csv
players.rename(columns={'bioID': 'playerID'}, inplace=True)
players_teams = pd.merge( players_teams, players, on='playerID', how='left')

# join awards_players.csv with players_teams.csv and coaches.csv
# NOTE(review): the inner merges below yield one row per (award row, matching
# players_teams/coaches row). If a person has several rows for the same year,
# each award is counted once per row -- confirm whether that is intended.
merged_players_awards = pd.merge(awards_players, players_teams, on=['playerID', 'year'], how='inner')
awards_count = merged_players_awards.groupby(['playerID', 'year']).size().reset_index(name='awards')
players_teams = pd.merge(players_teams, awards_count[['playerID', 'year', 'awards']], on=['playerID', 'year'], how='left')
players_teams['awards'] = players_teams['awards'].fillna(0).astype(int)
merged_coaches_awards = pd.merge(awards_players, coaches, left_on=['playerID', 'year'], right_on=['coachID', 'year'], how='inner')
awards_count = merged_coaches_awards.groupby(['coachID', 'year']).size().reset_index(name='awards')
coaches = pd.merge(coaches, awards_count[['coachID', 'year', 'awards']], on=['coachID', 'year'], how='left')
coaches['awards'] = coaches['awards'].fillna(0).astype(int)

# join teams_post.csv with teams.csv
teams_post.rename(columns={'W': 'wonPost', 'L': 'lostPost'}, inplace=True)
teams = pd.merge(teams, teams_post, on=['tmID', 'year'], how='left')
# Team/year pairs without a teams_post row get 0 postseason wins and losses.
teams['wonPost'] = teams['wonPost'].fillna(0).astype(int)
teams['lostPost'] = teams['lostPost'].fillna(0).astype(int)

# join awards column of players_teams.csv and coaches.csv with teams.csv
team_players_count = players_teams.groupby(['tmID', 'year'])['awards'].sum().reset_index()
team_players_count = team_players_count.rename(columns={'awards': 'awards_players'})
teams = teams.merge(team_players_count, on=['year','tmID'], how='left')
teams['awards_players'] = teams['awards_players'].fillna(0).astype(int)
team_coaches_count = coaches.groupby(['tmID', 'year'])['awards'].sum().reset_index()
team_coaches_count = team_coaches_count.rename(columns={'awards': 'awards_coaches'})
teams = teams.merge(team_coaches_count, on=['tmID', 'year'], how='left')
teams['awards_coaches'] = teams['awards_coaches'].fillna(0).astype(int)

# join teams.csv with players_teams.csv
players_teams = players_teams.merge(teams, on=['year', 'tmID'], how='left')

# Encode playoff as 1/0 so that its mean is the playoff proportion.
# From here on teams['playoff'] no longer holds 'Y'/'N', so the cells that
# compare it with 'Y' must run before this one.
teams['playoff'] = teams['playoff'].map({'Y': 1, 'N': 0})

# Per team: mean number of player awards and playoff proportion. The rate is
# taken from the encoded `playoff` column, not from `won`, so the line drawn
# below matches its "playoff proportion" label.
agg_data = (
    teams.groupby('tmID')
    .agg(avg_num=('awards_players', 'mean'), playoff_rate=('playoff', 'mean'))
    .reset_index()
    .sort_values(ascending=False, by='avg_num')
)
fig, ax1 = plt.subplots(figsize=(10, 6))
sns.barplot(x='tmID', y='avg_num', data=agg_data, alpha=0.7, label='Média de awards_players', ax=ax1)
# Secondary axis: the proportion (0-1) is on a different scale than the bars.
ax2 = ax1.twinx()
sns.lineplot(x='tmID', y='playoff_rate', data=agg_data, marker='o', label='Proporção de playoff (Y)', ax=ax2)
ax1.set_xlabel('tmID', fontsize=14)
ax1.set_ylabel('Média de awards_players', fontsize=14)
ax2.set_ylabel('Proporção de playoff', fontsize=14)
plt.title('Média de awards_players e Proporção de Playoff por tmID', fontsize=16)
ax1.legend(loc='upper left')
ax2.legend(loc='upper right')
plt.tight_layout()
plt.show()

### 1.7 - Difference between home and away wins by Team and home and away losses by Team
All teams have more wins and fewer losses when playing at home. However, some seem to be more influenced than others. For example, Atlanta doesn't show a big difference, but Seattle does.

In [None]:
# Home advantage in wins: total home wins minus total away wins, per team.
agg_data = teams.groupby('tmID')[['homeW', 'awayW']].sum()
agg_data['difference'] = agg_data['homeW'] - agg_data['awayW']

plt.figure(figsize=(8, 4))
plt.bar(
    x=agg_data.index,
    height=agg_data['difference'],
    # Bars above zero are green, all others red.
    color=['green' if diff > 0 else 'red' for diff in agg_data['difference']],
)
plt.xlabel("Team's ID", fontsize=8)
plt.ylabel("Difference (HomeW - AwayW)", fontsize=8)
plt.title("Difference between home and away wins by Team", fontsize=10)
plt.xticks(rotation=90)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:
# Same comparison for losses: total home losses minus total away losses.
agg_data = teams.groupby('tmID')[['homeL', 'awayL']].sum()
agg_data['difference'] = agg_data['homeL'] - agg_data['awayL']

plt.figure(figsize=(8, 4))
plt.bar(
    x=agg_data.index,
    height=agg_data['difference'],
    # Colour encodes only the sign: above zero is green, otherwise red.
    color=['green' if diff > 0 else 'red' for diff in agg_data['difference']],
)
plt.xlabel("Team's ID", fontsize=8)
plt.ylabel("Difference (HomeL - AwayL)", fontsize=8)
plt.title("Difference between home and away losses by Team", fontsize=10)
plt.xticks(rotation=90)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

## 2 - Analysing teams_post.csv

### 2.1 - Wins and losses in teams_post
From teams_post it is possible to see that some teams have a high number of wins, as well as losses, which leads us to believe that they are the ones that go to the playoffs more often.

In [None]:
# Total postseason wins and losses per team. `wonPost` / `lostPost` are the
# W / L columns of teams_post as renamed earlier in the notebook, so that
# cell has to run first.
summary = teams_post.groupby("tmID")[["wonPost", "lostPost"]].sum()

summary.plot.bar(figsize=(8, 6))
plt.xlabel("Team's ID")
plt.ylabel("Total")
plt.title("Sum of victories (W) and losses (L) by team in post")
# Replace the raw column names in the legend with readable labels.
plt.legend(["victories (W)", "losses (L)"])
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

### 2.2 - Distribution of post appearances
To confirm what the previous chart (section 2.1) suggested, the pie chart below shows which teams stand out in the number of appearances in teams_post, as is the case of LAS and SAC.

In [None]:
# Number of teams_post rows per team, shown as each team's share of the total.
tmID_counts = teams_post["tmID"].value_counts()

plt.figure(figsize=(10, 10))
tmID_counts.plot.pie(autopct="%1.0f%%", startangle=90)
plt.ylabel("")  # drop the automatic y-label next to the pie
plt.title("Distribution of post appearances")
plt.show()

## 3 - Analysing players_teams.csv

In [None]:
# One marker per players_teams row: the row's fgMade against its team.
plt.figure(figsize=(10, 6))
plt.scatter(
    x=players_teams["tmID"],
    y=players_teams["fgMade"],
    color="blue",
    alpha=0.7,  # translucent markers so overlapping points stay visible
)
plt.xlabel("Team's ID")
plt.ylabel("Number of Goals")
plt.title("Number of goals made by each player (with the team associated)")
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.tight_layout()
plt.show()

### 3.1 - Points made and minutes played
As expected, the more minutes played, the more points are scored. It will therefore be interesting to evaluate the number of points scored per minute to assess the player's performance.

In [None]:
# One marker per players_teams row: minutes played (x) against points (y).
plt.figure(figsize=(6, 6))
plt.scatter(players_teams["minutes"], players_teams["points"], color="blue", alpha=0.7)
plt.title("Points made and minutes played")
# Each label names the column actually drawn on that axis: minutes on x,
# points on y.
plt.xlabel("Number of Minutes Played")
plt.ylabel("Number of Points Made")
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.tight_layout()
plt.show()

## 4 - Analysing players.csv

### 4.1 - Distribution of the height and weight of the players
The height and weight of players mostly cluster around the same values, but there are cases where a player has a height of 0 and/or a weight of 0, which is impossible.

In [None]:
# Keep only the players for whom both measurements are present.
df_clean = players.dropna(subset=['height', 'weight'])

# Two side-by-side histograms: height on the left, weight on the right.
fig, axes = plt.subplots(1, 2, figsize=(8, 3))

sns.histplot(data=df_clean, x='height', bins=20, color='skyblue', ax=axes[0])
axes[0].set_xlabel('Height', fontsize=8)
axes[0].set_ylabel('Frequency', fontsize=8)
axes[0].set_title('Distribuition of Height', fontsize=10)

sns.histplot(data=df_clean, x='weight', bins=20, color='yellow', ax=axes[1])
axes[1].set_xlabel('Weight', fontsize=8)
axes[1].set_ylabel('Frequency', fontsize=8)
axes[1].set_title('Distribuition of Weight', fontsize=10)

plt.tight_layout()
plt.show()

### 4.2 - Number of players with N presences in the league
There are many players who played for only one year, and few who played for many years. The number of years a player has played can be an indication of their experience, and therefore of their quality.

In [None]:
# Presences = number of distinct years in which a player appears. Counting
# distinct years instead of rows means a player with several players_teams
# rows in the same year is counted once for that year.
player_counts = players_teams.groupby('playerID')['year'].nunique()
# For every N, how many players have exactly N presences.
count_of_counts = player_counts.value_counts().sort_index()
plt.figure(figsize=(5, 3))
plt.bar(count_of_counts.index, count_of_counts.values, color='forestgreen')
plt.title('Number of players with N presences in the league', fontsize=10)
plt.xlabel('Number of presences', fontsize=8)
plt.ylabel('Number of players', fontsize=8)
# One tick per observed value of N.
plt.xticks(count_of_counts.index)
plt.show()

### 4.3 - Distribution of players by position
The distribution of player positions is not uniform. The "G" and "F" positions are much more represented in our dataset.

In [None]:
# Number of players per value of `pos`. Despite its name, `college_counts`
# holds position counts, not college counts.
college_counts = players['pos'].value_counts()

plt.figure(figsize=(4, 4))
plt.pie(
    x=college_counts,
    labels=college_counts.index,
    autopct='%1.1f%%',
    colors=plt.cm.Paired.colors,
)
plt.axis('equal')  # equal aspect ratio keeps the pie circular
plt.title("Distribuition of players by position", fontsize=10)
plt.tight_layout()
plt.show()
