In [1]:
import pandas as pd

# Combine the per-season CSV files into one DataFrame
def read_clean(years=(2020,)):
    """Load the per-season game CSVs and keep only regular-season rows.

    For every year in ``years`` the file ``NFL_<year>.csv`` is read from the
    current working directory. Rows with no ``Week`` value and rows whose
    ``Week`` is a post-season round are dropped. The per-season frames are then
    concatenated, the ``Unnamed: 5`` column is renamed to ``Home/Away`` and the
    index is reset to 0..n-1.

    The result is not returned. It is stored on the function attribute
    ``read_clean.NFL_2000_2016``, which is where the rest of the notebook
    reads it from.

    Parameters
    ----------
    years : iterable of int, optional
        Seasons to load. Defaults to the 2020 season only.
    """
    post_season = ['WildCard', 'Division', 'ConfChamp', 'SuperBowl']
    seasons = []
    for year in years:
        season = pd.read_csv('NFL_{}.csv'.format(year))
        season = season.dropna(subset=['Week'])
        # Filter with a boolean mask. The earlier concat + drop_duplicates(keep=False)
        # approach also removed regular-season rows that were exact duplicates.
        season = season[~season['Week'].isin(post_season)]
        seasons.append(season)
    # Concatenate once instead of growing a frame inside a loop.
    combined = pd.concat(seasons) if seasons else pd.DataFrame()
    read_clean.NFL_2000_2016 = (
        combined.rename(columns={'Unnamed: 5': 'Home/Away'}).reset_index(drop=True)
    )
#     print(read_clean.NFL_2000_2016)
# Populate read_clean.NFL_2000_2016, which the following cells read from.
read_clean()

In [20]:
# Show the rows whose winner is listed as 'San Diego Chargers'.
read_clean.NFL_2000_2016.loc[lambda games: games['Winner/tie'] == 'San Diego Chargers']

Unnamed: 0,Week,Day,Date,Time,Winner/tie,Home/Away,Loser/tie,Unnamed: 7,Pts,Pts.1,...,Home Team,Away Team,HomePts,VisitorPts,HomeWin,HomeLastWin,VisitorLastWin,HomeWinStreak,VisitorWinStreak,HomeTeamWonLast


In [21]:
# Every team name in the data: all winners first, then all losers (duplicates included).
unique_teams = (
    list(read_clean.NFL_2000_2016['Winner/tie'])
    + list(read_clean.NFL_2000_2016['Loser/tie'])
)
# Print each distinct name once, in order of first appearance.
for x in pd.unique(pd.Series(unique_teams)):
    print(x)

Kansas City Chiefs
Seattle Seahawks
Buffalo Bills
Washington Football Team
Green Bay Packers
Baltimore Ravens
New England Patriots
Jacksonville Jaguars
Las Vegas Raiders
Chicago Bears
Los Angeles Chargers
New Orleans Saints
Arizona Cardinals
Los Angeles Rams
Pittsburgh Steelers
Tennessee Titans
Cleveland Browns
San Francisco 49ers
Tampa Bay Buccaneers
Dallas Cowboys
Indianapolis Colts
Miami Dolphins
Philadelphia Eagles
Carolina Panthers
Detroit Lions
Denver Broncos
Minnesota Vikings
Cincinnati Bengals
Houston Texans
Atlanta Falcons
New York Giants
New York Jets


# Feature Engineering

In [4]:
# Feature Engineering: Which team was the home team and which team was the away team?
def home_away():
    """Add ``Home Team`` and ``Away Team`` columns to ``read_clean.NFL_2000_2016``.

    A ``Home/Away`` value of ``'@'`` means the winner played away, so the loser
    is taken as the home team. For any other value (including a missing one)
    the winner is taken as the home team.
    """
    games = read_clean.NFL_2000_2016

    def home_team(row):
        return row['Loser/tie'] if row['Home/Away'] == '@' else row['Winner/tie']

    def away_team(row):
        return row['Winner/tie'] if row['Home/Away'] == '@' else row['Loser/tie']

    games['Home Team'] = games.apply(home_team, axis=1)
    games['Away Team'] = games.apply(away_team, axis=1)
home_away()

In [5]:
# Feature Engineering: HomePts vs VisitorPts
def points():
    """Add ``HomePts`` and ``VisitorPts`` columns to ``read_clean.NFL_2000_2016``.

    The code treats ``Pts`` as the winner's score and ``Pts.1`` as the loser's.
    When ``Home/Away`` is ``'@'`` the winner was the visitor, so the two source
    columns swap roles.
    """
    games = read_clean.NFL_2000_2016

    def pick(row, if_winner_home, if_winner_away):
        # Choose which source column to read for this row.
        column = if_winner_away if row['Home/Away'] == '@' else if_winner_home
        return row[column]

    games['HomePts'] = games.apply(pick, axis=1, args=('Pts', 'Pts.1'))
    games['VisitorPts'] = games.apply(pick, axis=1, args=('Pts.1', 'Pts'))
points()

# Baseline?

In most sports the home team wins more often than the visiting team, so "always predict a home win" is a sensible baseline that the models need to beat.

In [6]:
# Create target class
def target_class():
    """Create the boolean ``HomeWin`` target column and expose it as an array.

    ``HomeWin`` is True only when the home side scored strictly more points
    than the visitor. The column's values are also stored on
    ``target_class.y_true`` for the modelling cells.
    """
    games = read_clean.NFL_2000_2016
    home_won = games['HomePts'] > games['VisitorPts']
    games['HomeWin'] = home_won
    # Class values
    target_class.y_true = games['HomeWin'].to_numpy()
target_class()

In [7]:
# n_games = NFL_2000_2016['HomeWin'].count()
# n_homewins = NFL_2000_2016['HomeWin'].sum()
# win_percentage = n_homewins/n_games *100
# print(f"Home Win Percentage: {win_percentage}%")

In [8]:
# # Convert to an f1 measure; simple classifier that predicts home team wins more frequently
# # This will be the score we need to beat with the model
# from sklearn.metrics import f1_score
# y_pred = [1] * len(y_true)
# print("F1: {:.4f}".format(f1_score(y_true, y_pred, pos_label=None, average='weighted')))

# More Feature Engineering

In [9]:
# Feature Engineering: Did teams win their previous game?
def prev_win():
    """Add ``HomeLastWin`` / ``VisitorLastWin``: the result of each team's previous game.

    Rows of ``read_clean.NFL_2000_2016`` are walked in frame order. For every
    game the stored result of each team's most recent game is written into the
    row, and then the current game's outcome is recorded for both teams.
    """
    from collections import defaultdict

    games = read_clean.NFL_2000_2016
    games['HomeLastWin'] = False
    games['VisitorLastWin'] = False

    # team name -> outcome of its latest game; a team not seen yet reads as 0
    last_result = defaultdict(int)

    for pos, game in games.iterrows():
        home, visitor = game['Home Team'], game['Away Team']
        game['HomeLastWin'] = last_result[home]
        game['VisitorLastWin'] = last_result[visitor]
        # NOTE(review): the whole row is written back by position, using the label
        # yielded by iterrows(). That only lines up while the index is 0..n-1.
        # NOTE(review): writing an entire mixed-type row may change column dtypes - confirm.
        games.iloc[pos] = game
        # Record this game's outcome for each team's next appearance.
        # NOTE(review): whenever HomeWin is False the visitor is recorded as the winner.
        last_result[home] = game['HomeWin']
        last_result[visitor] = not game['HomeWin']
prev_win()

# Basic Classification with a Decision Tree

In [10]:
# from sklearn.tree import DecisionTreeClassifier
# clf = DecisionTreeClassifier(random_state=14)
# y_true

In [11]:
# from sklearn.model_selection import cross_val_score
# import numpy as np

# # F1 is loosely correlated with accuracy but is less likely to be biased by certain factors
# from sklearn.metrics import f1_score, make_scorer, classification_report
# scorer = make_scorer(f1_score, pos_label=None, average='weighted')

# # Create a dataset with the necessary info
# X_previouswins = NFL_2000_2016[["HomeLastWin","VisitorLastWin"]].values
# clf = DecisionTreeClassifier(random_state=14)
# # Use X_previouswins to predict the target value (y_true, i.e. whether the home team won)
# scores = cross_val_score(clf, X_previouswins, y_true, scoring = scorer)
# print("Using just the result of whether or not the home team or visitor team won their last games we get an F1 score of:")
# print("F1: {0:.4f}".format(np.mean(scores)))

# More Feature Engineering

In [12]:
# Feature Engineering: Win streaks.
def win_streaks():
    """Add ``HomeWinStreak`` / ``VisitorWinStreak``: wins in a row before each game.

    Rows of ``read_clean.NFL_2000_2016`` are walked in frame order. Each game
    first receives both teams' current streaks. Then the winner's streak grows
    by one and the other team's streak resets to zero. The full frame is
    printed at the end.
    """
    from collections import defaultdict

    games = read_clean.NFL_2000_2016
    games["HomeWinStreak"] = 0
    games["VisitorWinStreak"] = 0

    # team name -> consecutive wins going into its next game (0 for unseen teams)
    streaks = defaultdict(int)

    for pos, game in games.iterrows():
        home, visitor = game["Home Team"], game["Away Team"]
        game["HomeWinStreak"] = streaks[home]
        game["VisitorWinStreak"] = streaks[visitor]
        # NOTE(review): positional write-back of the whole row; relies on the index
        # labels from iterrows() being 0..n-1.
        games.iloc[pos] = game
        # Update the streaks with this game's outcome.
        if game["HomeWin"]:
            streaks[home] += 1
            streaks[visitor] = 0
        else:
            streaks[home] = 0
            streaks[visitor] += 1
    print(games)
win_streaks()

    Week  Day       Date    Time                Winner/tie Home/Away  \
0      1  Thu  9/10/2020  8:20PM        Kansas City Chiefs       NaN   
1      1  Sun  9/13/2020  1:00PM          Seattle Seahawks         @   
2      1  Sun  9/13/2020  1:00PM             Buffalo Bills       NaN   
3      1  Sun  9/13/2020  1:00PM  Washington Football Team       NaN   
4      1  Sun  9/13/2020  1:00PM         Green Bay Packers         @   
..   ...  ...        ...     ...                       ...       ...   
251   17  Sun   1/3/2021  4:25PM         Green Bay Packers         @   
252   17  Sun   1/3/2021  4:25PM      Los Angeles Chargers         @   
253   17  Sun   1/3/2021  4:25PM         Las Vegas Raiders         @   
254   17  Sun   1/3/2021  4:25PM          Tennessee Titans         @   
255   17  Sun   1/3/2021  8:20PM  Washington Football Team         @   

               Loser/tie Unnamed: 7   Pts  Pts.1  ...  TOL  \
0         Houston Texans   boxscore  34.0   20.0  ...  1.0   
1        At

In [13]:
# Feature Engineering: Did the home team win the last game between the two teams?
def home_team_win_last():
    """Add ``HomeTeamWonLast``: 1 if the home team won the previous meeting of the pair.

    Meetings are tracked per unordered pair of teams while the rows of
    ``read_clean.NFL_2000_2016`` are processed through ``DataFrame.apply``.
    A pair's first meeting in the data gets 0.
    """
    from collections import defaultdict

    # sorted (team, team) pair -> winner of their latest meeting; 0 until they have met
    previous_winner = defaultdict(int)

    def home_team_won_last(row):
        home, visitor = row["Home Team"], row["Away Team"]
        # Sort so that A-vs-B and B-vs-A share one key.
        pairing = tuple(sorted((home, visitor)))
        flag = int(previous_winner[pairing] == home)
        # Remember this result for the pair's next encounter.
        previous_winner[pairing] = home if row["HomeWin"] else visitor
        return flag

    games = read_clean.NFL_2000_2016
    games["HomeTeamWonLast"] = games.apply(home_team_won_last, axis=1)
home_team_win_last()

In [14]:
# OneHotEncoding
def OHE():
    """Select the model features and one-hot encode them into ``OHE.NFL_model``.

    The encoding is whatever ``pd.get_dummies`` produces for the seven selected
    columns of ``read_clean.NFL_2000_2016``.
    """
    feature_columns = [
        "Home Team",
        "Away Team",
        "HomeLastWin",
        "VisitorLastWin",
        "HomeWinStreak",
        "VisitorWinStreak",
        "HomeTeamWonLast",
    ]
    selected = read_clean.NFL_2000_2016[feature_columns]
    OHE.NFL_model = pd.get_dummies(selected)
OHE()

# Select features we want to feed into the models

In [15]:
def create_X_all():
    """Store the encoded feature table ``OHE.NFL_model`` as a NumPy array on ``create_X_all.X_all``."""
    import numpy as np
    encoded = np.asanyarray(OHE.NFL_model)
    # Concatenating a single 2-D input along axis 1 is what hstack does for one array.
    create_X_all.X_all = np.concatenate((encoded,), axis=1)
create_X_all()

In [16]:
# def test_models():
#     import pandas as pd
#     from sklearn.linear_model import LogisticRegression
#     from sklearn import tree
#     from sklearn.naive_bayes import GaussianNB
#     from sklearn.neighbors import KNeighborsClassifier
#     from sklearn.ensemble import RandomForestClassifier
#     from xgboost import XGBClassifier
#     from sklearn.datasets import make_classification
#     from sklearn.model_selection import train_test_split
#     from sklearn.metrics import accuracy_score
#     import numpy as np

#     # features = NFL_2000_2016[["HomeLastWin","VisitorLastWin","HomeWinStreak","VisitorWinStreak","HomeTeamWonLast"]]
#     test_models.X_all = np.hstack([OHE.NFL_model])

#     class Classifiers():
#         def __init__(self, x_train, x_test, y_train):
#             self.x_train = x_train
#             self.y_train = y_train
#             self.x_test = x_test
#         def logistic_regression(self):
#             model = LogisticRegression()
#             model.fit(self.x_train, self.y_train)
#             predicted = model.predict(self.x_test)
#             return predicted
#         def decision_tree(self):
#             model = tree.DecisionTreeClassifier(criterion='gini')
#             model.fit(self.x_train, self.y_train)
#             predicted = model.predict(self.x_test)
#             return predicted
#         def naive_bayes(self):
#             model = GaussianNB()
#             model.fit(self.x_train, self.y_train)
#             predicted = model.predict(self.x_test)
#             return predicted
#         def knn(self):
#             model = KNeighborsClassifier(n_neighbors=6)
#             model.fit(self.x_train, self.y_train)
#             predicted= model.predict(self.x_test)
#             return predicted
#         def random_forest(self):
#             model= RandomForestClassifier()
#             model.fit(self.x_train, self.y_train)
#             predicted = model.predict(self.x_test)
#             return predicted
#         def xgboost(self):
#             model = XGBClassifier()
#             model.fit(self.x_train, self.y_train)
#             predicted = model.predict(self.x_test)
#             return predicted
#     # def test():
#     #     X, y = make_classification(n_samples=1000, n_features=2, n_redundant=0, n_informative=2, random_state=1, n_clusters_per_class=1)
#     #     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
#     #     data_to_classify = Classifiers(X_train, X_test, y_train)
#     #     d = {'original_result': y_test, 'predicted_result': data_to_classify.random_forest()}
#     #     df = pd.DataFrame(data=d)
#     #     df["correct_prediction"] = (df.original_result == df.predicted_result)
#     #     print(df, f"Accuracy Score: \n{df.correct_prediction.value_counts(normalize=True)}")
#     # test()

#     def compare_Classifiers_accuracy(X_train, X_test, y_train, y_test):
#         outcome_accuracy = {
#             "logistic_regression": accuracy_score(y_test, Classifiers(X_train, X_test, y_train).logistic_regression()),
#             "decision_tree": accuracy_score(y_test, Classifiers(X_train, X_test, y_train).decision_tree()),
#             "naive_bayes": accuracy_score(y_test, Classifiers(X_train, X_test, y_train).naive_bayes()),
#             "knn": accuracy_score(y_test, Classifiers(X_train, X_test, y_train).knn()),
#             "random_forest": accuracy_score(y_test, Classifiers(X_train, X_test, y_train).random_forest()),
#             "xgboost": accuracy_score(y_test, Classifiers(X_train, X_test, y_train).xgboost()),
#         }
#         return outcome_accuracy
#     def main():
#         seed = 1
#     #     X_all, y_true = make_classification(n_samples=10000, n_features=2, n_redundant=0, n_informative=2, random_state=1, n_clusters_per_class=1)
#         X_train, X_test, y_train, y_test = train_test_split(create_X_all.X_all, target_class.y_true, test_size=0.33, random_state=seed)
#         # Validation
#         # X_validation_train, X_validation, y_validation_train, y_validation = train_test_split(X_train, y_train, test_size=0.33, random_state=seed) 
#         print(compare_Classifiers_accuracy(X_train, X_test, y_train, y_test))
#         # print(compare_Classifiers_accuracy(X_validation_train,X_validation))
#     main()
# test_models()

In [17]:
# Building Chosen Model: Random Forest
def build_RF():
    """Fit a random forest on the training split and keep it on ``build_RF.model``.

    The features (``create_X_all.X_all``) and target (``target_class.y_true``)
    are split 67/33 with a fixed seed, and the forest is fitted on the training
    part only. Nothing is returned, so running the cell shows no output. The
    fitted estimator is available afterwards as ``build_RF.model``.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    seed = 1
    X_train, _X_test, y_train, _y_test = train_test_split(
        create_X_all.X_all, target_class.y_true, test_size=0.33, random_state=seed
    )

    # Seed the forest as well as the split so repeated runs build the same model.
    model = RandomForestClassifier(random_state=seed)
    model.fit(X_train, y_train)
    # The fitted model used to be a local that was lost when the function returned.
    build_RF.model = model
build_RF()

In [18]:
# Create user input from front end
# Example input for one game; every value is a one-element list.
HT = ['New England Patriots']   # home team
AT = ['Kansas City Chiefs']     # away team
HLW = [0]    # HomeLastWin
VLW = [0]    # VisitorLastWin
HWS = [0]    # HomeWinStreak
VWS = [0]    # VisitorWinStreak
HTWL = [0]   # HomeTeamWonLast


def user_input(HT, AT, HLW, VLW, HWS, VWS, HTWL):
    """Encode one user-supplied game as a single feature row.

    The input row is appended to the seven feature columns of
    ``read_clean.NFL_2000_2016``, the combined frame is passed through
    ``pd.get_dummies``, and only the last (user) row is kept.

    Results are stored on the function: ``user_input.user_input`` holds the raw
    one-row frame and ``user_input.NFL_model_ui`` holds the encoded one-row frame.

    NOTE(review): the encoded columns presumably have to match the columns the
    model was trained on (``OHE.NFL_model``) - confirm.
    """
    feature_columns = [
        "Home Team",
        "Away Team",
        "HomeLastWin",
        "VisitorLastWin",
        "HomeWinStreak",
        "VisitorWinStreak",
        "HomeTeamWonLast",
    ]
    supplied = (HT, AT, HLW, VLW, HWS, VWS, HTWL)
    user_input.user_input = pd.DataFrame(dict(zip(feature_columns, supplied)))

    stacked = pd.concat(
        [read_clean.NFL_2000_2016[feature_columns], user_input.user_input], axis=0
    )
    encoded = pd.get_dummies(stacked)
    # Keep only the final row (the user's game) as a one-row frame.
    user_input.NFL_model_ui = encoded.iloc[-1].to_frame().T
user_input(HT, AT, HLW, VLW, HWS, VWS, HTWL)

In [19]:
# Test Prediction
def predict():
    """Fit a random forest on the training split and print the predicted winner.

    The model is trained on the same seeded 67/33 split used elsewhere in the
    notebook and then applied to the single encoded row in
    ``user_input.NFL_model_ui``. A True prediction means the home team (global
    ``HT``) wins; otherwise the away team (global ``AT``) wins.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier

    def team_name(team):
        # HT / AT are one-element lists in this notebook. Unwrap them so the
        # message shows the name, not the list repr ("['...']").
        if isinstance(team, (list, tuple)) and len(team) == 1:
            return team[0]
        return team

    seed = 1
    X_train, _X_test, y_train, _y_test = train_test_split(
        create_X_all.X_all, target_class.y_true, test_size=0.33, random_state=seed
    )

    # Seed the forest too; otherwise the predicted winner can change between runs.
    model = RandomForestClassifier(random_state=seed)
    model.fit(X_train, y_train)
    predicted = model.predict(user_input.NFL_model_ui)
    # One input row gives one label; read it directly rather than comparing
    # the result array against a list.
    home_wins = bool(predicted[0])
    winner = HT if home_wins else AT
    print(f'{team_name(winner)} will win.')
predict()

['New England Patriots'] will win.
