<a href="https://colab.research.google.com/github/rafabandoni/nfl-predict/blob/main/notebooks/01_nfl_predict_class.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image

In [None]:
# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

# 01. EDA (Exploratory data analysis)
Since we are building a **classification algorithm** (probably a **tree-based method**), the main goals of our EDA are:
  - Find and understand outliers
  - Understand the scale of the features and whether scaling is needed
  - Simplify our dataframe by removing complex features

For that, we will use the following techniques:
- Univariate analysis
  - Describe data
  - Barplot
  - Time series
  - Distribution

- Bivariate analysis
  - Dispersion

- Multivariate analysis
  - Boxplot
  - Dispersion
  - Correlation matrix

"EDA is never ready, only abandoned."


## Load data

In [None]:
# Base URL of the repository folder that holds the prepared output datasets.
PATH = (
    'https://raw.githubusercontent.com/rafabandoni/nfl-predict/'
    'refs/heads/main/data/output/'
)

In [None]:
# Read the per-game stats table from the remote output folder and preview it.
games_score_df = pd.read_parquet(f'{PATH}games_stats_nfl.parquet')
games_score_df.head()

### Enhancing dataframe

In [None]:
# Combined points scored by both teams in each game
games_score_df['total_score'] = games_score_df['score_home'].add(games_score_df['score_away'])

In [None]:
# Drop unnecessary columns. Rebinding (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent: re-running it after the columns
# are already gone no longer raises a KeyError.
games_score_df = games_score_df.drop(columns=['year_home_', 'year_away_'], errors='ignore')

In [None]:
# Labels of every numeric column; reused by the distribution plots below.
numeric_cols = games_score_df.select_dtypes(include=['number']).columns

## Univariate analysis

### Describing data

In [None]:
# Summary statistics of the frame (pandas' default describe() output).
games_score_df.describe()

### Barplot

In [None]:
# Count home and away wins per season.
# .copy() makes `df` an independent frame, so adding columns below does not
# trigger pandas' SettingWithCopyWarning on a slice of games_score_df.
df = games_score_df[['schedule_season',
                     'home_winner']].copy()
df['home_wins'] = df['home_winner'].map({True: 1, False: 0})
df['away_wins'] = df['home_winner'].map({True: 0, False: 1})
df = df.groupby('schedule_season', as_index=False).sum(numeric_only=True)

# Long format: one row per (season, home/away) so seaborn can use `hue`.
dfm = pd.melt(
    df[['schedule_season', 'home_wins', 'away_wins']],
    id_vars="schedule_season",
    var_name='home_or_away_wins',
    value_name='wins'
)

fig, ax = plt.subplots(figsize=(10, 7))
sns.barplot(
    data=dfm,
    x='schedule_season',
    y='wins',
    hue='home_or_away_wins',
    ax=ax
)
ax.set_title('Home or away winners per season')
ax.set_ylabel('Wins')
ax.set_xlabel('Season')
fig.savefig("home_away_winner_per_season.png")
# Close (not just clear) the figure so it is released and no empty figure
# is left behind for the notebook to render.
plt.close(fig)
Image(filename='home_away_winner_per_season.png')

### Time series

In [None]:
num_cols = 3
num_rows = 2

fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 5 * num_rows))
axes = axes.flatten()  # Flatten the axes array for easy indexing

# Per-date means split by outcome. The home/away scores are aggregated too so
# that every panel titled "Mean of ..." is drawn from aggregated values rather
# than repeating the raw-data panel.
df = games_score_df[['schedule_date',
                     'home_winner',
                     'total_score',
                     'score_home',
                     'score_away']].groupby(['schedule_date',
                                             'home_winner'], as_index=False).mean()

# (data source, y column, panel title) for each subplot, in display order.
panels = [
    (df, 'total_score', 'Mean of total score per date'),
    (games_score_df, 'total_score', 'Total score per date'),
    (games_score_df, 'score_home', 'Home score per date'),
    (df, 'score_home', 'Mean of home score per date'),
    (df, 'score_away', 'Mean of away score per date'),
]

for ax, (panel_data, y_col, panel_title) in zip(axes, panels):
    sns.lineplot(
        data=panel_data,
        x='schedule_date',
        y=y_col,
        hue='home_winner',
        ax=ax
    )
    ax.set_title(panel_title)

# The grid has more slots than panels; hide the unused ones.
for ax in axes[len(panels):]:
    ax.set_visible(False)

plt.tight_layout(pad=3.0)
fig.savefig("time_series.png")
# Close the figure so it does not linger in memory after being saved.
plt.close(fig)
Image(filename='time_series.png')

### Distribution

In [None]:
# Class balance of the target column.
ax = sns.countplot(data=games_score_df, x='home_winner')
ax.set_title('Distribuição de Home Winner')
plt.show()

In [None]:
df = games_score_df[numeric_cols]

num_cols = 3
# Ceiling of len(columns) / num_cols, with at least one row so plt.subplots
# stays valid even if there are no numeric columns.
num_rows = max((len(df.columns) - 1) // num_cols + 1, 1)

fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 5 * num_rows))
axes = axes.flatten()  # Flatten the axes array for easy indexing

# One histogram per numeric column. Iterating over the columns directly
# includes the last one, which the previous range(len - 1) loop skipped.
for ax, col in zip(axes, df.columns):
    sns.histplot(
        df[col],
        kde=True,
        ax=ax
    )
    ax.set_title(f'Histogram for {col}', fontsize=12)

# Hide the grid slots that did not receive a plot.
for ax in axes[len(df.columns):]:
    ax.set_visible(False)

plt.tight_layout(pad=3.0)
fig.savefig("histplot.png")
# Close the figure so it does not linger in memory after being saved.
plt.close(fig)
Image(filename='histplot.png')

In [None]:
df = games_score_df[numeric_cols]

num_cols = 3
# Ceiling of len(columns) / num_cols, with at least one row so plt.subplots
# stays valid even if there are no numeric columns.
num_rows = max((len(df.columns) - 1) // num_cols + 1, 1)

fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 5 * num_rows))
axes = axes.flatten()  # Flatten the axes array for easy indexing

# One violin plot per numeric column. Iterating over the columns directly
# includes the last one, which the previous range(len - 1) loop skipped.
for ax, col in zip(axes, df.columns):
    sns.violinplot(
        df[col],
        ax=ax
    )
    # Title now names the plot type actually drawn (was "Histogram for ...").
    ax.set_title(f'Violin plot for {col}', fontsize=12)

# Hide the grid slots that did not receive a plot.
for ax in axes[len(df.columns):]:
    ax.set_visible(False)

plt.tight_layout(pad=3.0)
fig.savefig("violinplot.png")
# Close the figure so it does not linger in memory after being saved.
plt.close(fig)
Image(filename='violinplot.png')

## Bivariate Analysis

### Dispersion

In [None]:
# Every column except the target gets its own panel. Building the list up
# front means no grid slot is left blank where 'home_winner' sits, and the
# last column is no longer skipped by a range(len - 1) loop.
col2 = 'home_winner'
feature_cols = [col for col in games_score_df.columns if col != col2]

num_cols = 3
num_rows = max((len(feature_cols) - 1) // num_cols + 1, 1)

fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 5 * num_rows))
axes = axes.flatten()  # Flatten the axes array for easy indexing

for ax, col1 in zip(axes, feature_cols):
    sns.scatterplot(
        data=games_score_df,
        x=col2,
        y=col1,
        ax=ax
    )
    ax.set_title(f'Relation between {col1} \nand home_winner', fontsize=12)

# Hide the grid slots that did not receive a plot.
for ax in axes[len(feature_cols):]:
    ax.set_visible(False)

plt.tight_layout(pad=3.0)
fig.savefig("dispersion_home_winner.png")
# Close the figure so it does not linger in memory after being saved.
plt.close(fig)
Image(filename='dispersion_home_winner.png')

## Multivariate Analysis

### Boxplot

In [None]:
# Adjacent column pairs (column i vs. column i + 1), leaving out any pair that
# involves the target, which is used as the hue instead. Filtering up front
# avoids blank-but-visible panels where a pair used to be skipped.
all_cols = list(games_score_df.columns)
column_pairs = [
    (col1, col2)
    for col1, col2 in zip(all_cols, all_cols[1:])
    if col1 != 'home_winner' and col2 != 'home_winner'
]

num_cols = 3
num_rows = max((len(column_pairs) - 1) // num_cols + 1, 1)

fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 5 * num_rows))
axes = axes.flatten()  # Flatten the axes array for easy indexing

for ax, (col1, col2) in zip(axes, column_pairs):
    sns.boxplot(
        data=games_score_df,
        x=col1,
        y=col2,
        hue='home_winner',
        ax=ax
    )
    ax.set_title(f'Dispersion for {col1} \nand {col2} by home_win', fontsize=12)

# Hide the grid slots that did not receive a plot.
for ax in axes[len(column_pairs):]:
    ax.set_visible(False)

plt.tight_layout(pad=3.0)
fig.savefig("boxplot_multi.png")
# Close the figure so it does not linger in memory after being saved.
plt.close(fig)
Image(filename='boxplot_multi.png')

### Dispersion

In [None]:
# Adjacent column pairs (column i vs. column i + 1), leaving out any pair that
# involves the target, which is used as the hue instead. Filtering up front
# avoids blank-but-visible panels where a pair used to be skipped.
all_cols = list(games_score_df.columns)
column_pairs = [
    (col1, col2)
    for col1, col2 in zip(all_cols, all_cols[1:])
    if col1 != 'home_winner' and col2 != 'home_winner'
]

num_cols = 3
num_rows = max((len(column_pairs) - 1) // num_cols + 1, 1)

fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 5 * num_rows))
axes = axes.flatten()  # Flatten the axes array for easy indexing

for ax, (col1, col2) in zip(axes, column_pairs):
    sns.scatterplot(
        data=games_score_df,
        x=col1,
        y=col2,
        hue='home_winner',
        ax=ax
    )
    ax.set_title(f'Dispersion for {col1} \nand {col2} by home_win', fontsize=12)

# Hide the grid slots that did not receive a plot.
for ax in axes[len(column_pairs):]:
    ax.set_visible(False)

plt.tight_layout(pad=3.0)
fig.savefig("dispersion_multi.png")
# Close the figure so it does not linger in memory after being saved.
plt.close(fig)
Image(filename='dispersion_multi.png')

### Correlation

In [None]:
def get_columns(theme, columns):
  """Return the base game columns plus every column whose name contains `theme`.

  Args:
    theme: Substring to look for in each column name (e.g. 'defense').
    columns: Iterable of column names to filter.

  Returns:
    A new list starting with the fixed base columns, followed by the matching
    names from `columns` in their original order. A name is never listed
    twice, even when `theme` also matches one of the base columns.
  """
  new_list = ['schedule_date', 'schedule_season', 'schedule_playoff', 'team_home',
              'score_home', 'score_away', 'team_away', 'stadium_neutral',
              'home_winner']
  # Track what is already selected so a theme such as 'score' or 'home' does
  # not append a base column a second time (which would duplicate it in any
  # frame selected with the returned list).
  seen = set(new_list)
  for column in columns:
    if theme in column and column not in seen:
      new_list.append(column)
      seen.add(column)
  return new_list

columns = games_score_df.columns

# One correlation heatmap per theme, each saved to its own PNG.
themes = ['defense', 'offense', 'special_teams']
for theme in themes:
  fig, ax = plt.subplots(figsize=(30, 25))
  sns.heatmap(
      data=games_score_df[get_columns(theme, columns)].corr(numeric_only=True),
      vmin=-1,
      vmax=1,
      linewidths=0.1,
      cmap="Reds",
      ax=ax
  )
  fig.savefig(f"correlation_{theme}.png")
  # Close each large figure once saved; plt.clf() only cleared it and left
  # all three figures open for the rest of the session.
  plt.close(fig)

In [None]:
# Display the saved defense correlation heatmap (plain string: no placeholders to format).
Image(filename='correlation_defense.png')

In [None]:
# Display the saved offense correlation heatmap (plain string: no placeholders to format).
Image(filename='correlation_offense.png')

In [None]:
# Display the saved special-teams correlation heatmap (plain string: no placeholders to format).
Image(filename='correlation_special_teams.png')

## Conclusion

Many features are strongly correlated with other features from the same theme (e.g., defense stats with other defense stats). We might want to clean the data by dropping redundant, highly correlated features.

In [None]:
# def high_corr_columns_per_theme(df, theme, threshold=0.7):
#   theme_columns = []

#   for col in df.columns:
#     if theme in col:
#       theme_columns.append(col)

#   new_df = df[theme_columns]
#   corr_matrix = new_df.corr().abs()
#   upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
#   high_corr_columns = [column for column in upper.columns if any(upper[column] > threshold)]
#   return high_corr_columns

In [None]:
# defense_high_corr_columns = high_corr_columns_per_theme(games_score_df, 'defense')
# offense_high_corr_columns = high_corr_columns_per_theme(games_score_df, 'offense')
# special_teams_high_corr_columns = high_corr_columns_per_theme(games_score_df, 'special_teams')

In [None]:
# defense_high_corr_columns.extend(offense_high_corr_columns)
# defense_high_corr_columns.extend(special_teams_high_corr_columns)

In [None]:
# sliced_df = games_score_df.drop(defense_high_corr_columns, axis=1)

In [None]:
# sliced_df.head()

## Creating treated DF

In [None]:
# treated_df = games_score_df.drop(columns=to_remove_list)
# treated_df.head()

In [None]:
# treated_df = sliced_df.copy()

In [None]:
# sliced_df.to_parquet('treated_df.parquet')