<a href="https://colab.research.google.com/github/sahilfatima/NFL-Big-Data-Bowl-2024/blob/main/NFL_Big_Data_Bowl_2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated '<directory>:<percent-encoded download URL>' entries consumed
# by the download loop below.
# NOTE(review): the URL is a signed link carrying X-Goog-Date=20240202T120005Z
# and X-Goog-Expires=259200, so it has presumably expired — regenerate it
# before re-running this cell.
DATA_SOURCE_MAPPING = 'nfl-big-data-bowl-2024:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F60305%2F6654553%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240202%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240202T120005Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D6b9e94733853ad4d6dcc60b856f6af44c57526aa27ba0ca228dbecca981ce2e96fbe55d44cc77c5f06df984970c9984ec8d5560ad08f08c1481e11408c031ec9b0833ea47287400a8a85c089e3819e4842a2567296e87e424a535cc0a6b702cd94db823cde68f2bade8dd55993f8b972dc3ed2fe4f22e0217c49bb36c67028a85bbd6e8e302eaf096f55a77074a3899193d42f0052109366cc18f7b30c303a7bf50e1d997b3c214e496d7901fa771c63b501721f26478f71d4fd2ac863e46c3cd0683cf167080cba77989be82f173714b6c257e8da6edcaca0c1c32d7365c4486e6ea97a1f1bcc27c98a065ff120cb1edf1609c82ded92574bff7de6b1b174b9'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every '<directory>:<percent-encoded url>' entry of
# DATA_SOURCE_MAPPING into KAGGLE_INPUT_PATH/<directory> and unpack it.
# Archives whose URL path ends in '.zip' are treated as zip files, anything
# else is opened as a tar archive. Failed downloads are reported and skipped.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only so a stray colon in the URL part cannot
    # break the unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # The header may be absent; in that case the progress bar stays
            # empty instead of crashing on int(None).
            total_length = fileres.headers['content-length']
            total_bytes = int(total_length) if total_length else 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by
            # name and would otherwise see a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # The alias must not be called `tarfile`, or it would rebind
                # the imported module for every later iteration.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
import pandas as pd
import seaborn as sns
import numpy as np

import matplotlib.pyplot as plt
import plotly.io as pio

import datetime
from colorama import Fore, Style, init
import lightgbm as lgb

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, recall_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Data Exploration and Preparation

# Summarizing Data Frame and Color printing
def PrintColor(text:str, color = Fore.BLUE, style = Style.BRIGHT):
    """Print ``text`` preceded by the given colorama style and colour codes.

    ``Style.RESET_ALL`` is appended after the text.
    """
    pieces = (style, color, text, Style.RESET_ALL)
    print("".join(pieces))

def summarize_dataframe(df):
    """Return a per-column summary of ``df``.

    The result is indexed by the column names of ``df`` and contains:

    * ``dtypes``      - dtype of the column
    * ``missing#``    - number of missing values
    * ``missing%``    - percentage of missing values (NaN for an empty frame)
    * ``uniques``     - number of distinct non-missing values
    * ``first_value`` / ``last_value`` - values of the first and last row
      (NaN for an empty frame)
    * ``count``       - number of non-missing values
    * ``min`` / ``max`` / ``mean`` - taken from ``df.describe()``; NaN for
      columns that ``describe()`` does not report these statistics for
    """
    n_rows = len(df)
    missing_counts = df.isna().sum().values

    summary_df = pd.DataFrame(df.dtypes, columns=['dtypes'])
    # Plain count of missing values (the percentage lives in 'missing%').
    summary_df['missing#'] = missing_counts
    summary_df['missing%'] = (missing_counts * 100) / n_rows if n_rows else np.nan
    summary_df['uniques'] = df.nunique().values
    # iloc[0] / iloc[-1] raise IndexError on a frame without rows.
    summary_df['first_value'] = df.iloc[0].values if n_rows else np.nan
    summary_df['last_value'] = df.iloc[n_rows - 1].values if n_rows else np.nan
    summary_df['count'] = df.count().values

    stat_names = ['min', 'max', 'mean']
    if df.shape[1]:
        # reindex() guarantees the three columns exist even when describe()
        # only produced count/unique/top/freq (no numeric columns).
        desc = df.describe().T.reindex(columns=stat_names)
    else:
        # describe() refuses a frame without columns.
        desc = pd.DataFrame(columns=stat_names)
    for stat in stat_names:
        # Index-aligned assignment: columns absent from describe() get NaN.
        summary_df[stat] = desc[stat]
    return summary_df

# Print the full path of every file found under the Kaggle input directory
import os
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for file_name in names_in_folder:
        full_path = os.path.join(folder, file_name)
        PrintColor(full_path)

In [None]:
# Games Data Inspection
data_path1 = '/kaggle/input/nfl-big-data-bowl-2024/games.csv'
games = pd.read_csv(data_path1, header=0)

# Styled preview of the first five rows.
# NOTE(review): as a bare mid-cell expression this Styler is presumably not
# rendered by the notebook — confirm, and wrap it in display(...) if wanted.
(
    games.head(5)
    .style.set_caption("Sample of the games data")
    .set_properties(**{'border': '1.3px solid blue', 'color': 'grey'})
)

# Report the share of missing values column by column
for col in games.columns:
    column_values = games[col]
    nan_percent = 100*(column_values.isnull().sum()/column_values.shape[0])
    msg = f'column: {col:>10}\t Percent of NaN value: {nan_percent:.2f}%'
    PrintColor(f"\n---> {msg}")

games.info()

summarize_dataframe(games).style.background_gradient(cmap='Reds')

In [None]:
# Players Data Inspection
data_path2 = '/kaggle/input/nfl-big-data-bowl-2024/players.csv'
players = pd.read_csv(data_path2, header=0)

# Styled preview of the first five rows.
# NOTE(review): as a bare mid-cell expression this Styler is presumably not
# rendered by the notebook — confirm, and wrap it in display(...) if wanted.
(
    players.head(5)
    .style.set_caption("Sample of the players data")
    .set_properties(**{'border': '1.3px solid blue', 'color': 'grey'})
)

# Report the share of missing values column by column
for col in players.columns:
    column_values = players[col]
    nan_percent = 100*(column_values.isnull().sum()/column_values.shape[0])
    msg = f'column: {col:>10}\t Percent of NaN value: {nan_percent:.2f}%'
    PrintColor(f"\n---> {msg}")

players.info()

summarize_dataframe(players).style.background_gradient(cmap='Reds')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Players Data Inspection (reloads players.csv, then charts missing values)
data_path2 = '/kaggle/input/nfl-big-data-bowl-2024/players.csv'
players = pd.read_csv(data_path2, header=0)

# Styled preview of the first five rows.
# NOTE(review): as a bare mid-cell expression this Styler is presumably not
# rendered by the notebook — confirm, and wrap it in display(...) if wanted.
(
    players.head(5)
    .style.set_caption("Sample of the players data")
    .set_properties(**{'border': '1.3px solid blue', 'color': 'grey'})
)

# Report the share of missing values column by column
for col in players.columns:
    column_values = players[col]
    nan_percent = 100 * (column_values.isnull().sum() / column_values.shape[0])
    msg = f'column: {col:>10}\t Percent of NaN value: {nan_percent:.2f}%'
    PrintColor(f"\n---> {msg}")

# General information about the players' data
players.info()

# Summary table with a red background gradient (pandas Styler); the same
# mid-cell caveat as for the preview above applies.
summarize_dataframe(players).style.background_gradient(cmap='Reds')

# Bar chart of the percentage of missing values in each column
missing_percent_by_column = players.isnull().mean() * 100
plt.figure(figsize=(12, 6))
sns.barplot(x=players.columns, y=missing_percent_by_column, palette='viridis')
plt.title('Percentage of Missing Values in Each Column')
plt.xlabel('Columns')
plt.ylabel('Percentage of Missing Values')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Plays Data Inspection
data_path3 = '/kaggle/input/nfl-big-data-bowl-2024/plays.csv'
plays = pd.read_csv(data_path3, header=0)

# Styled preview of the first five rows.
# NOTE(review): as a bare mid-cell expression this Styler is presumably not
# rendered by the notebook — confirm, and wrap it in display(...) if wanted.
(
    plays.head(5)
    .style.set_caption("Sample of the plays data")
    .set_properties(**{'border': '1.3px solid blue', 'color': 'grey'})
)

# Report the share of missing values column by column
for col in plays.columns:
    column_values = plays[col]
    nan_percent = 100*(column_values.isnull().sum()/column_values.shape[0])
    msg = f'column: {col:>10}\t Percent of NaN value: {nan_percent:.2f}%'
    PrintColor(f"\n---> {msg}")

plays.info()

summarize_dataframe(plays).style.background_gradient(cmap='Reds')

In [None]:
# Plays Data Inspection
# NOTE(review): this cell reloads plays.csv and prints the same report as the
# plays inspection cell before it — confirm whether both are needed.
data_path3 = '/kaggle/input/nfl-big-data-bowl-2024/plays.csv'
plays = pd.read_csv(data_path3, header=0)

# Styled preview of the first five rows (a bare mid-cell expression, so it is
# presumably not rendered by the notebook — confirm).
(
    plays.head(5)
    .style.set_caption("Sample of the plays data")
    .set_properties(**{'border': '1.3px solid blue', 'color': 'grey'})
)

# Report the share of missing values column by column
for col in plays.columns:
    column_values = plays[col]
    nan_percent = 100*(column_values.isnull().sum()/column_values.shape[0])
    msg = f'column: {col:>10}\t Percent of NaN value: {nan_percent:.2f}%'
    PrintColor(f"\n---> {msg}")

plays.info()

summarize_dataframe(plays).style.background_gradient(cmap='Reds')

In [None]:
# Tackles Data Inspection
data_path4 = '/kaggle/input/nfl-big-data-bowl-2024/tackles.csv'
tackles = pd.read_csv(data_path4, header=0)

# Styled preview of the first five rows.
# NOTE(review): as a bare mid-cell expression this Styler is presumably not
# rendered by the notebook — confirm, and wrap it in display(...) if wanted.
(
    tackles.head(5)
    .style.set_caption("Sample of the tackles data")
    .set_properties(**{'border': '1.3px solid blue', 'color': 'grey'})
)

# Report the share of missing values column by column
for col in tackles.columns:
    column_values = tackles[col]
    nan_percent = 100*(column_values.isnull().sum()/column_values.shape[0])
    msg = f'column: {col:>10}\t Percent of NaN value: {nan_percent:.2f}%'
    PrintColor(f"\n---> {msg}")

tackles.info()

summarize_dataframe(tackles).style.background_gradient(cmap='Reds')

In [None]:
# Target column: row-wise maximum of the three outcome columns of 'tackles'
outcome_columns = ['tackle', 'assist', 'forcedFumble']
tackles['attack'] = tackles[outcome_columns].max(axis=1)

# Columns pulled in from each of the other tables
player_columns = ['nflId', 'height', 'weight', 'birthDate', 'collegeName', 'position']
game_columns = ['gameId', 'week', 'homeTeamAbbr', 'visitorTeamAbbr']
play_columns = ['gameId', 'playId', 'quarter', 'down', 'yardsToGo', 'possessionTeam', 'defensiveTeam',
                'gameClock', 'yardlineNumber', 'preSnapHomeScore',
                'preSnapVisitorScore', 'defendersInTheBox', 'preSnapHomeTeamWinProbability',
                'preSnapVisitorTeamWinProbability']

# Left-join players (on nflId), games (on gameId) and plays (on gameId and
# playId) onto the tackles rows, then drop the raw outcome columns together
# with pff_missedTackle.
final_df = (
    tackles
    .merge(players[player_columns], on='nflId', how='left')
    .merge(games[game_columns], on='gameId', how='left')
    .merge(plays[play_columns], on=['gameId', 'playId'], how='left')
    .drop(columns=['tackle', 'assist', 'forcedFumble', 'pff_missedTackle'])
)

# Show the first rows of the merged frame
PrintColor("Sample of New Data Frame:")
print(final_df.head())

In [None]:
# Collect the names of all columns of 'final_df' whose dtype is not numeric
non_numeric_part = final_df.select_dtypes(exclude=[np.number])
non_numeric_columns_final_df = non_numeric_part.columns
# Print them under a coloured heading
PrintColor("Non-Numeric Columns in 'final_df' DataFrame:")
print(non_numeric_columns_final_df)

In [None]:
def convert_height_to_inches(height):
    """Convert a ``'feet-inches'`` height string to total inches.

    Args:
        height: A string such as ``'6-4'``, a value that is already numeric,
            or a missing value (``None`` / NaN).

    Returns:
        ``feet * 12 + inches`` for a string, ``np.nan`` for a missing value,
        and the input unchanged for any other value (e.g. an int or a numpy
        integer that is already expressed in inches).

    Raises:
        ValueError: If a string is not of the form ``'<int>-<int>'``.
    """
    if isinstance(height, str):
        feet, inches = map(int, height.split('-'))
        return feet * 12 + inches
    # Missing heights reach this function as NaN after a left merge; they have
    # no .split() and must not crash the conversion.
    if pd.isna(height):
        return np.nan
    return height

# Replace every entry of the 'height' column by its converted value
height_in_inches = final_df['height'].map(convert_height_to_inches)
final_df['height'] = height_in_inches

# Turn a 'minutes:seconds' game clock reading into a number of seconds
def convert_gameclock_to_seconds(gameclock):
    """Return ``minutes * 60 + seconds`` for a ``'MM:SS'`` string.

    Missing values (anything ``pd.isna`` reports as missing) yield ``np.nan``.
    """
    if not pd.isna(gameclock):
        minute_part, second_part = (int(piece) for piece in gameclock.split(':'))
        return 60 * minute_part + second_part
    return np.nan

# Apply the conversion function to the 'gameClock' column
final_df['gameClockSeconds'] = final_df['gameClock'].apply(convert_gameclock_to_seconds)
# Drop the original 'gameClock' column
final_df.drop('gameClock', axis=1, inplace=True)

# 'birthDate' is in string format, convert it to datetime
# (values that cannot be parsed become NaT)
final_df['birthDate'] = pd.to_datetime(final_df['birthDate'], errors='coerce')
# Age in whole 365-day years, measured against the moment this cell runs
final_df['age'] = (datetime.datetime.now() - final_df['birthDate']).dt.days // 365
# Replace missing ages with the average age. The result is assigned back
# instead of calling fillna(inplace=True) on the selected column: that chained
# in-place call does not update final_df under pandas Copy-on-Write.
average_age = final_df['age'].mean()
final_df['age'] = final_df['age'].fillna(average_age)
# Drop the original 'birthDate' column
final_df.drop('birthDate', axis=1, inplace=True)

In [None]:
# Encoding categorical values to numerical values
# List of columns to be encoded
columns_to_encode = ['collegeName', 'position', 'homeTeamAbbr', 'visitorTeamAbbr', 'possessionTeam', 'defensiveTeam']
# Dictionary to store mappings: column name -> {original value: integer code}
encoding_mappings = {}
# One encoder object is refitted for every column; its classes are copied into
# encoding_mappings before the next refit overwrites them.
label_encoder = LabelEncoder()
for column in columns_to_encode:
    # Fill NaN values with a placeholder before encoding. The result is
    # assigned back instead of calling fillna(inplace=True) on the selected
    # column: that chained in-place call does not update final_df under
    # pandas Copy-on-Write.
    final_df[column] = final_df[column].fillna('NaN')

    # Fit and transform with LabelEncoder
    final_df[column] = label_encoder.fit_transform(final_df[column])

    # Store the mappings
    encoding_mappings[column] = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

In [None]:
# Print each column's value -> code dictionary under its own heading
print("Encoding Mappings:")
for encoded_column in encoding_mappings:
    print(f"\n{encoded_column} Mapping:", encoding_mappings[encoded_column], sep="\n")

In [None]:
# Print the head of the prepared frame, followed by the whole frame
PrintColor("Sample of Final Data Frame:")
for frame_view in (final_df.head(), final_df):
    print(frame_view)

In [None]:
## Correlation Analysis
# Compute the correlation matrix once and reuse it for the 'attack' column
# and for the heatmap (it was previously computed twice).
correlation_matrix = final_df.corr()
# Correlation of every column with 'attack'
correlation_final_df = correlation_matrix['attack']
# Display the correlation values
PrintColor("Correlation with 'attack' in 'final_df' DataFrame:")
print(correlation_final_df)
# Set up the matplotlib figure
plt.figure(figsize=(12, 10))
# Draw the annotated heatmap of the full matrix (no mask is applied)
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=.5)
# Show the plot
plt.show()

In [None]:

## Modeling
# Feature Importance Analysis
# Rows containing any NaN are removed from final_df itself
final_df.dropna(inplace=True)
# Target variable (y) and all remaining columns as features (X)
y = final_df['attack']
X = final_df.drop(columns=['attack'])
# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Random forest fitted on the training part
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)
# Importance of every feature, as a table sorted from most to least important
feature_importances = rf_classifier.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)
# Horizontal bar chart of the importances
plt.figure(figsize=(10, 6))
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
plt.xlabel('Importance')
plt.title('Feature Importances')
plt.show()


# Feature selection:
# keep only features whose importance reaches the threshold
threshold = 0.05
is_important = feature_importance_df['Importance'] >= threshold
selected_features = feature_importance_df[is_important]['Feature']
# Restrict the feature matrix to the selected columns
X_selected = X[selected_features]

# Split again (same test size and seed), this time on the selected features
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)
# Print each part of the split under its name
for split_label, split_part in (('X_train:', X_train), ('X_test:', X_test),
                                ('y_train:', y_train), ('y_test:', y_test)):
    print(split_label)
    print(split_part)

In [None]:
# Logistic Regression Model
# Create the model and train it on the training split
model1 = LogisticRegression()
model1.fit(X_train, y_train)
# Predict the test split and score the predictions
predictions = model1.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy of the model using Logistic Regresssion Classifier: {accuracy}")

# Random Forest Model
# Create the model (100 trees, fixed seed) and train it on the training split
model2 = RandomForestClassifier(n_estimators=100, random_state=42)
model2.fit(X_train, y_train)
# Predict the test split and score the predictions
predictions = model2.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy of the model using Random Forest Classifier : {accuracy}")

# SVM Model
# Scale the features (important for SVM). The scaler is fitted on the training
# split only and then applied to the test split, so no test-set statistics
# leak into training. Previously the scaled data was computed but never used
# and the SVMs were trained on the unscaled features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Scaled copy of the complete selected feature matrix, kept under its original
# name for any later cell that refers to it.
X_scaled = scaler.transform(X_selected)
# Try different kernels
kernels = ['linear', 'rbf', 'poly', 'sigmoid']
for kernel in kernels:
    # Initialize the SVM model
    model3 = SVC(kernel=kernel, random_state=42)
    # Train the model on the scaled training data
    model3.fit(X_train_scaled, y_train)
    # Make predictions on the scaled test set
    predictions = model3.predict(X_test_scaled)
    # Evaluate accuracy
    accuracy = accuracy_score(y_test, predictions)
    print(f"Accuracy of the model using SVM Claissifier with {kernel} kernel: {accuracy}")

# Adaboost Model
# Create the model (50 estimators, fixed seed) and train it on the training split
model4 = AdaBoostClassifier(n_estimators=50, random_state=42)
model4.fit(X_train, y_train)
# Predict the test split and score the predictions
predictions = model4.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy of the model using Adaboost Classifier: {accuracy}")