# Getting Started

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import re
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import tensorflow as tf

In [None]:
# Load the attendance CSV; the file is read with the latin-1 codec.
data = pd.read_csv(
    '../input/college-football-attendance-2000-to-2018/CFBeattendance.csv',
    encoding='latin-1',
)

In [None]:
# Display the DataFrame exactly as loaded from the CSV.
data

# Preprocessing

In [None]:
# Columns removed from the frame before any encoding takes place.
features_to_drop = ['Date', 'Site', 'Team', 'Opponent']
data.drop(columns=features_to_drop, inplace=True)

## Missing Values

In [None]:
# Count the missing values in each remaining column.
data.isna().sum()

## Encoding

In [None]:
# Show each column's dtype to see which ones still need encoding.
data.dtypes

In [None]:
# Columns whose distinct values are inspected before choosing an encoding.
categorical_features = [
    'Time',
    'Rank',
    'TV',
    'Opponent_Rank',
    'Conference',
]

In [None]:
def get_uniques(df, columns):
    """Return a dict mapping each name in *columns* to its distinct values.

    The values for a column are listed in the order ``df[column].unique()``
    yields them (order of first appearance).
    """
    uniques = {}
    for name in columns:
        uniques[name] = list(df[name].unique())
    return uniques

In [None]:
# List the distinct values of every categorical column.
get_uniques(data, categorical_features)

In [None]:
# Feature names grouped by the kind of encoding each one receives.
# NOTE(review): these lists are not referenced again in the visible cells;
# they appear to document the plan rather than drive the code.
binary_features = [
    'TV',
    'New Coach',
    'Tailgating',
]
ordinal_features = [
    'Time',
    'Rank',
    'Opponent_Rank',
]
nominal_features = [
    'Conference',
]

### Binary Encoding

In [None]:
# Frequency of each value in the 'TV' column.
data['TV'].value_counts()

In [None]:
# Collapse 'TV' to a flag: 0 for 'Not on TV', 1 for every other value.
data['TV'] = (data['TV'] != 'Not on TV').astype('int64')

In [None]:
# Cast the two flag columns to integers.
# The builtin int is used because the np.int alias was deprecated in
# NumPy 1.20 and removed in 1.24; np.int was only ever an alias for int,
# so the resulting dtype is unchanged.
data['New Coach'] = data['New Coach'].astype(int)
data['Tailgating'] = data['Tailgating'].astype(int)

In [None]:
# Display the frame after the binary columns have been encoded.
data

### Ordinal Encoding

In [None]:
# Distinct values in 'Rank' (the encoding below treats 'NR' specially).
data['Rank'].unique()

In [None]:
# Convert rankings to integers, mapping the 'NR' marker to 26.
# NOTE(review): 26 presumably places unranked teams just below a top-25 poll — confirm.
# The builtin int replaces np.int, which was removed in NumPy 1.24.
data['Rank'] = data['Rank'].apply(lambda x: 26 if x == 'NR' else int(x))
data['Opponent_Rank'] = data['Opponent_Rank'].apply(lambda x: 26 if x == 'NR' else int(x))

In [None]:
# Sorted list of the distinct 'Time' values; a value's position in this list
# becomes its ordinal code.
# NOTE(review): if the values are strings, sorted() orders them
# lexicographically, which need not be chronological — confirm that is intended.
time_ordering = sorted(data['Time'].unique())

In [None]:
# Replace each 'Time' value with its position in time_ordering.
# The lookup table is built once so each row costs a dict access instead of a
# linear list.index() scan. An unknown value still fails loudly (KeyError)
# rather than silently producing NaN.
time_positions = {value: position for position, value in enumerate(time_ordering)}
data['Time'] = data['Time'].apply(lambda x: time_positions[x])

In [None]:
# Display the frame after the ordinal columns have been encoded.
data

### Nominal Encoding

In [None]:
# Distinct values in 'Conference', the column that is one-hot encoded next.
data['Conference'].unique()

In [None]:
def onehot_encode(df, column):
    """Return a copy of *df* with *column* replaced by indicator columns.

    One indicator column per distinct value of ``df[column]`` is appended
    after the existing columns, then the source column is removed. The frame
    passed in is not modified.
    """
    indicators = pd.get_dummies(df[column])
    widened = pd.concat([df, indicators], axis=1)
    return widened.drop(columns=[column])

In [None]:
# Replace 'Conference' with one indicator column per distinct conference.
data = onehot_encode(data, 'Conference')

### Label Encoding

In [None]:
# Remove four rows by index label before the target column is extracted.
# NOTE(review): presumably these are the rows whose 'Result' is not a plain
# win/loss (cf. the 'NC' / 'White' / 'Blue' check further down) — confirm.
# Hard-coded labels are brittle if the CSV or any earlier filtering changes.
data.drop([4355, 5442, 5449, 5456], axis=0, inplace=True)

In [None]:
# Separate the features (everything except 'Result') from the target column.
X = data.drop(columns='Result')
y = data['Result']

In [None]:
# Display the raw 'Result' values before they are reduced to labels.
y

In [None]:
# Keep only the leading run of non-whitespace characters of each result string.
y = y.map(lambda result: re.search(r'^[^\s]*', result)[0])

In [None]:
# Distinct labels left after trimming each result to its first token.
y.unique()

In [None]:
# Show any remaining rows labelled 'NC', 'White' or 'Blue'.
y[y.isin(['NC', 'White', 'Blue'])]

In [None]:
# Fit the encoder on the labels, then replace them with their integer codes.
label_encoder = LabelEncoder().fit(y)
y = label_encoder.transform(y)

# Integer code -> original label, shown for reference.
y_mappings = dict(enumerate(label_encoder.classes_))
y_mappings

In [None]:
# Display the integer-encoded target.
y

## Scaling

In [None]:
# Display the feature matrix before scaling.
X

In [None]:
# Rescale every feature with a MinMaxScaler and rebuild the frame with the
# original column names. No index is passed, so the new frame gets a default
# positional index.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# below, so test-set ranges influence the scaling — confirm this is acceptable.
scaler = MinMaxScaler().fit(X)
X = pd.DataFrame(scaler.transform(X), columns=X.columns)
X

# Training

In [None]:
# Hold out 30% of the rows for the final evaluation.
# A fixed random_state keeps the same rows in the test set on every run, so
# reported test metrics are comparable between runs of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
)

In [None]:
# Small fully connected binary classifier: two 16-unit ReLU layers followed by
# a single sigmoid output.
# The input width is taken from the training data rather than hard-coded, so
# the model still builds if the number of one-hot columns changes.
inputs = tf.keras.Input(shape=(X_train.shape[1],))
x = tf.keras.layers.Dense(16, activation='relu')(inputs)
x = tf.keras.layers.Dense(16, activation='relu')(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)


# Adam optimizer, tracking binary accuracy and ROC AUC alongside the loss.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
metrics = [
    tf.keras.metrics.BinaryAccuracy(name='acc'),
    tf.keras.metrics.AUC(name='auc')
]

model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=metrics
)


batch_size = 32
epochs = 10

# 20% of the training rows are set aside for validation; verbose=0 silences
# the per-epoch log, and the metrics are kept in `history` for plotting.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=batch_size,
    epochs=epochs,
    verbose=0
)

# Results

In [None]:
# Plot training and validation loss against the 1-based epoch number.
plt.figure(figsize=(14, 10))

train_loss = history.history['loss']
val_loss = history.history['val_loss']
epochs_range = range(1, epochs + 1)

for losses, label in ((train_loss, "Training Loss"), (val_loss, "Validation Loss")):
    plt.plot(epochs_range, losses, label=label)

plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training and Validation Loss")
plt.legend()

plt.show()

In [None]:
# Zero-based position of the lowest validation loss; add 1 to match the
# 1-based epoch axis used in the loss plot.
np.argmin(val_loss)

In [None]:
# Evaluate the trained model on the held-out test rows; this reports the loss
# and the metrics passed to compile().
model.evaluate(X_test, y_test)

In [None]:
# Mean of the encoded target, i.e. the share of rows labelled 1 when the
# labels are 0/1 — a baseline to compare the model's accuracy against.
y.mean()