# This is where we trained a deep learning model to try and classify headlines with positive or negative sentiment. 



Heavily inspired by this article:
https://towardsdatascience.com/sentiment-analysis-on-news-headlines-classic-supervised-learning-vs-deep-learning-approach-831ac698e276

In [1]:
import io
import pandas as pd
#NLP packages
import spacy
from nltk.corpus import stopwords
import string
import nltk
import nltk.tokenize
punc = string.punctuation
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
#Supervised learning
from tqdm import tqdm_notebook as tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
##Deep learning libraries and APIs
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

ModuleNotFoundError: No module named 'spacy'

In [None]:
#load labeled headline data

df = pd.read_csv('/content/just_labels_and_lines.csv')
df = df.rename(columns = {'headlines': 'text'})
df['text'] = df['text'].astype(str)
# Remove leading and trailing whitespace from the 'headlines' column
df['text'] = df['text'].str.strip()


df.tail(n=5)

In [None]:
#Filtering out unlabeled data points
df= df.loc[df.labels.isin([0,1]), :]
#find the label counts
df['labels'].value_counts()

In [None]:
#make a copy of the dataframe
data = df.copy()


In [None]:
#define a function which handles the text preprocessing
def preparation_text_data(data):
    """
    This pipeline prepares the text data, conducting the following steps:
    1) Tokenization
    2) Lemmatization
    4) Removal of stopwords
    5) Removal of punctuation
    """
    # initialize spacy object
    nlp = spacy.load('en_core_web_sm')
    # select raw text
    raw_text = data.text.values.tolist()
    # tokenize
    tokenized_text = [[nlp(i.lower().strip())] for i in tqdm(raw_text)]
    #define the punctuations and stop words
    punc = string.punctuation
    stop_words = set(stopwords.words('english'))
    #lemmatize, remove stopwords and punctuationd
    corpus = []
    for doc in tqdm(tokenized_text):
        corpus.append([word.lemma_ for word in doc[0] if (word.lemma_ not in stop_words and word.lemma_ not in punc)])
    # add prepared data to df
    data["text"] = corpus
    return data

In [None]:
processed_data = preparation_text_data(data)

In [None]:
#copies
data1 = processed_data.copy()
data2 = processed_data.copy()

Now train the deep learning model now that the text is in an appropriate format

In [None]:
##store headlines and labels in respective lists
text = list(data2['text'])
labels = list(data2['labels'])





total_samples = len(text)
train_samples = int(total_samples * 0.75) #75% for training
test_samples = total_samples - train_samples #25% testing

#training and testing sets
training_text = text[:train_samples]
testing_text = text[train_samples:]
training_labels = labels[:train_samples]
testing_labels = labels[train_samples:]

In [None]:
#preprocess
tokenizer = Tokenizer(num_words=10000, oov_token= "<OOV>")
tokenizer.fit_on_texts(training_text)
word_index = tokenizer.word_index
training_sequences = tokenizer.texts_to_sequences(training_text)
training_padded = pad_sequences(training_sequences, maxlen=120, padding='post', truncating='post')
testing_sequences = tokenizer.texts_to_sequences(testing_text)
testing_padded = pad_sequences(testing_sequences, maxlen=120, padding='post', truncating='post')
# convert lists into numpy arrays to make it work with TensorFlow
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

In [None]:
import keras
from keras import layers

In [None]:
model = keras.Sequential([
    keras.layers.Embedding(10000, 16, input_length=120),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(24, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])
##compile the model
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

model.summary()

In [None]:
num_epochs = 10
history = model.fit(training_padded,
                    training_labels,
                    epochs=num_epochs,
                    validation_data=(testing_padded, testing_labels),
                    verbose=2)

In [None]:

#test on new headline
new_headline = ["Chargers' Jim Harbaugh calls J.J. McCarthy best QB in NFL draft"]
sequences = tokenizer.texts_to_sequences(new_headline)
padded_seqs = pad_sequences(sequences, maxlen=120, padding='post', truncating='post')
print(model.predict(padded_seqs, verbose=0 ))

# Adding weekly sentiment score to each game in the odds/game database

## (this part is in R)

In [2]:
# install.packages('readxl')
library(readxl)
library(dplyr)
library(lubridate)


#load first odds/game dataset
df4 <- read_excel('/content/nfl (2).xlsx')

#load second odds/game dataset
df3 <-read.csv('/content/NFL_Odds_Data_2017-2022.csv')

df2 <-read.csv('/content/df_scored_9.csv')

head(df2)

#select columns of interest
selected_columns <- c("Date", "Home Team", "Away Team", "Home Odds Close", "Home Odds Open", "Away Odds Open", "Away Odds Close", "Home Score", "Away Score")
df4 <- df4[, selected_columns]

#rename
names(df4) <- c("Date", "Home_Team", "Away_Team", "Home_Odds_Close", "Home_Odds_Open", "Away_Odds_Open", "Away_Odds_Close", "Home_Score", "Away_Score")


#set date range
start_date <- as.Date("2017-09-01")
end_date <- as.Date("2022-02-20")

#edit df4 to fit date range
df4 <- df4[df4$Date >= start_date & df4$Date <= end_date, ]

#convert to date format
df3$Date <- as.POSIXct(df3$Date, format = "%Y-%m-%d")

SyntaxError: invalid syntax (3632415048.py, line 19)

In [None]:
#load first odds/game dataset
df4 <- read_excel('/content/nfl (2).xlsx')

In [None]:
#select columns of interest
selected_columns <- c("Date", "Home Team", "Away Team", "Home Odds Close", "Home Odds Open", "Away Odds Open", "Away Odds Close", "Home Score", "Away Score")
df4 <- df4[, selected_columns]

#rename
names(df4) <- c("Date", "Home_Team", "Away_Team", "Home_Odds_Close", "Home_Odds_Open", "Away_Odds_Open", "Away_Odds_Close", "Home_Score", "Away_Score")



In [None]:
# Create a new column "winner"
df4$winner <- ifelse(df4$Home_Score > df4$Away_Score, df4$Home_Team,
                    ifelse(df4$Away_Score > df4$Home_Score, df4$Away_Team, "Tie"))



In [None]:
# Function to count wins and losses for a team among the last 8 games
count_wins_losses <- function(team, game_date) {
  # Subset the dataframe for the last 8 games where the team is either the home or away team
  recent_games <- df4[(df4$Home_Team == team | df4$Away_Team == team) & df4$Date < game_date, ]
  recent_games <- recent_games[order(recent_games$Date, decreasing = TRUE), ]  # Order by Date descending

  # Select the most recent 8 games or all available games if fewer than 8
  recent_games <- head(recent_games, 8)

  # Count the number of wins and losses for the team among the selected games
  wins <- sum(recent_games$winner == team, na.rm = TRUE)
  losses <- sum(recent_games$winner != team & !is.na(recent_games$winner), na.rm = TRUE)

  return(c(wins, losses))
}

df4$Home_Wins_Last_8 <- apply(df4, 1, function(x) count_wins_losses(x["Home_Team"], x["Date"])[1])
df4$Home_Losses_Last_8 <- apply(df4, 1, function(x) count_wins_losses(x["Home_Team"], x["Date"])[2])



df4$Away_Wins_Last_8 <- apply(df4, 1, function(x) count_wins_losses(x["Away_Team"], x["Date"])[1])
df4$Away_Losses_Last_8 <- apply(df4, 1, function(x) count_wins_losses(x["Away_Team"], x["Date"])[2])




In [None]:
#select columns of interest
selected_columns <- c("Date", "Home_Team", "Away_Team", "Home_Odds_Close", "Home_Odds_Open", "Away_Odds_Open", "Away_Odds_Close", "Home_Score", "Away_Score","winner","Home_Wins_Last_8","Home_Losses_Last_8","Away_Wins_Last_8","Away_Losses_Last_8")
df4 <- df4[, selected_columns]

#rename
names(df4) <- c("Date", "Home_Team", "Away_Team", "Home_Odds_Close", "Home_Odds_Open", "Away_Odds_Open", "Away_Odds_Close", "Home_Score", "Away_Score","winner","Home_Wins_Last_8","Home_Losses_Last_8","Away_Wins_Last_8","Away_Losses_Last_8")


#edit df4 to fit date range
df4 <- df4[df4$Date >= start_date & df4$Date <= end_date, ]

In [None]:
# dictionary to standardize team names

nfl_teams <- c(
  "Arizona Cardinals" = c("Cardinals", "Arizona", "Redbirds", "arizona", "Cards", "AZ", "ARI", "Cardinal"),
  "Atlanta Falcons" = c("Falcons", "Atlanta", "Falcs", " ATL", "Dirty Birds", "Falcon"),
  "Baltimore Ravens" = c("Ravens", "Baltimore", "BAL", "Flock", "Raven"),
  "Buffalo Bills" = c("Bills", "Buffalo", "Mafia", "BUF", "BUFFS", "Bill"),
  "Carolina Panthers" = c("Panthers", "Carolina", "Cats", "CAR", "Panthers"),
  "Chicago Bears" = c("Bears", "Chicago", "CHI", "Bear", "Midway"),
  "Cincinnati Bengals" = c("Bengals", "Cincinnati", "Bungles", "Bengal", "Stripes", "Tiger"),
  "Cleveland Browns" = c("Browns", "Cleveland", "Brown", "Dawgs", "Dawg", "CLE"),
  "Dallas Cowboys" = c("Cowboys", "Dallas", "DAL", "America's Team", "Dem Boy"),
  "Denver Broncos" = c("Broncos", "Denver", "Orange Crush", "DEN"),
  "Detroit Lions" = c("Lions", "Detroit", "Motor City", "Kitties", "DET"),
  "Green Bay Packers" = c("Packers", "Green Bay", "GB", "Pack", "Cheeseheads", "Cheese Heads", "Packer","GreenBay"),
  "Houston Texans" = c("Texans", "Houston", "Hou", "H-Town", "Texan"),
  "Indianapolis Colts" = c("Colts", "Indianapolis", "IND", "Colt"),
  "Jacksonville Jaguars" = c("Jaguars", "Jacksonville", "JAX", "Jags", "Jaguar", "Jag"),
  "Kansas City Chiefs" = c("Chiefs", "Kansas City", "KC", "Chief", "Kansas","KansasCity","KCChiefs"),
  "Los Angeles Chargers" = c("Chargers", "LA Chargers", "Charger", "LAC", "SD", "San Diego", "Bolts","LAChargers"),
  "Las Vegas Raiders" = c("Raiders", "Las Vegas", "LV", "Oakland", "OAK", "Silver and Black", "Raider","LasVegas","LVRaiders"),
  "Los Angeles Rams" = c("Rams", "LA Rams", "LAR", "St. Louis", "STL", "LAR", "Saint Louis", "Ram","LARams"),
  "Miami Dolphins" = c("Dolphins", "Miami", "MIA", "Dolphin", "Fins"),
  "Minnesota Vikings" = c("Vikings", "Minnesota", "Purple", "MIN", "Vikes", "Viking"),
  "New England Patriots" = c("Patriots", "New England", "Pats", "NE", "Patriot", "NewEngland"),
  "New Orleans Saints" = c("Saints", "New Orleans", "NO", "Saint","NewOrleans"),
  "New York Giants" = c("Giants", "NY Giants", "NYG", "Giant", "G-Men","NYGiants"),
  "New York Jets" = c("Jets", "NY Jets", "NYJ", "Jet", "NYJets"),
  "Philadelphia Eagles" = c("Eagles", "Philadelphia", "Philly", "Birds", "PHI", "Eagle"),
  "Pittsburgh Steelers" = c("Steelers", "Pittsburgh", "PIT", "Black and Gold", "Steeler"),
  "San Francisco 49ers" = c("49ers", "San Francisco", "9ers", "Niners", "SF", "Niner","SanFrancisco"),
  "Seattle Seahawks" = c("Seahawks", "Seattle", "Legion of Boom", "SEA", "Seahawk"),
  "Tampa Bay Buccaneers" = c("Buccaneers", "Tampa Bay", "TB", "Bucs", "Tampa", "Buccaneer","TampaBay"),
  "Tennessee Titans" = c("Titans", "Tennessee", "Music City", "Titan", "TEN"),
  "Washington Football Team" = c("Washington", "Football Team", "Redskins", "Skins", "Washington Football Team", "WFT", "Commanders", "Commies","Washingtom")
)


# Function to replace team names
replace_team <- function(team_name) {
  for (team in names(nfl_teams)) {
    if (team_name %in% nfl_teams[[team]]) {
      return(team)
    }
  }
  return(team_name)  # Return original name if no match found
}





In [None]:

df3$Visitor <- sapply(df3$Visitor, replace_team)
df3$Home <- sapply(df3$Home, replace_team)


In [None]:
remove_numbers <- function(team_name) {
  # Use regular expression to remove numbers at the end of the team names
  clean_name <- gsub("\\d+$", "", team_name)
  return(clean_name)
}

df3$Visitor <-remove_numbers(df3$Visitor)
df3$Home <-remove_numbers(df3$Home)

df3$Date <- as.POSIXct(df3$Date, format = "%Y-%m-%d")
head(df3)

In [None]:
#check to make sure the 32 teams are present only under one name each

unique_home_values <- unique(df3$Home)
print(unique_home_values)

In [None]:
#select columns of interest
selected_columns <- c("Date", "Away_final_score", "Home_final_score", "Visitor", "Home", "Moneyline_away", "Moneyline_home")
df3 <- df3[, selected_columns]

In [None]:
#merge the two odds dataframes
merged_df <- merge(df4, df3, by.x = c("Date", "Home_Team", "Away_Team"), by.y = c("Date", "Home", "Visitor"), all.x = TRUE)


In [None]:
#standardize ml odds to one format (decimal)

american_to_decimal <- function(moneyline) {
  decimal_odds <- ifelse(moneyline > 0, (moneyline / 100) + 1, (100 / abs(moneyline)) + 1)
  return(decimal_odds)
}
temp_scores <- merged_df$Moneyline_away

merged_df$Moneyline_away <- merged_df$Moneyline_home
merged_df$Moneyline_home <- temp_scores

#calculate an average ml value
merged_df$Decimal_Odds_Away <- american_to_decimal(merged_df$Moneyline_away)
merged_df$Decimal_Odds_Home <- american_to_decimal(merged_df$Moneyline_home)

merged_df$Home_ML_AVG <- (merged_df$Home_Odds_Close + merged_df$Decimal_Odds_Home) / 2
merged_df$Away_ML_AVG <- (merged_df$Away_Odds_Close + merged_df$Decimal_Odds_Away) / 2


In [None]:
merged_df$Difference <- merged_df$Home_Odds_Close - merged_df$Decimal_Odds_Home

# Create a line plot for the difference between two odds sources
ggplot(merged_df, aes(x = 1:nrow(merged_df), y = Difference)) +
  geom_line(color = "blue") +
  labs(x = "Index", y = "Difference (Home_Odds_Close - Decimal_Odds_Home)") +
  theme_minimal()

In [None]:

#some outliers seem too big to not be errors so default to the dataset that seems more accurate that we didn't have to completely reformat in those cases.

# Calculate the difference between Home_Odds_Close and Decimal_Odds_Home
merged_df$Difference <- abs(merged_df$Home_Odds_Close - merged_df$Decimal_Odds_Home)

# Update Home_ML_AVG based on the condition
merged_df$Home_ML_AVG <- ifelse(merged_df$Difference > 1.5, merged_df$Home_Odds_Close, merged_df$Home_ML_AVG)

# Calculate the difference between Away_Odds_Close and Decimal_Odds_Away
merged_df$DifferenceA <- abs(merged_df$Away_Odds_Close - merged_df$Decimal_Odds_Away)

# Update Away_ML_AVG based on the condition
merged_df$Away_ML_AVG <- ifelse(merged_df$DifferenceA > 1.5, merged_df$Away_Odds_Close, merged_df$Away_ML_AVG)

# Replace NA values in Home_ML_AVG and Away_ML_AVG with respective odds close values
merged_df$Away_ML_AVG[is.na(merged_df$Away_ML_AVG)] <- merged_df$Away_Odds_Close[is.na(merged_df$Away_ML_AVG)]
merged_df$Home_ML_AVG[is.na(merged_df$Home_ML_AVG)] <- merged_df$Home_Odds_Close[is.na(merged_df$Home_ML_AVG)]




In [3]:
library(ggplot2)



# Calculate and plot the difference between Home_Odds_Close and Home_ML_AVG to see if the average makes sense
merged_df$Difference_Home <- merged_df$Home_Odds_Close - merged_df$Home_ML_AVG

ggplot(merged_df, aes(x = 1:nrow(merged_df), y = Difference_Home)) +
  geom_line(color = "blue") +
  labs(x = "Index", y = "Difference (Home_Odds_Close - Home_ML_AVG)") +
  theme_minimal()

SyntaxError: invalid syntax (2183194485.py, line 6)

In [None]:
# #condense the data down to just the variables we need

condensed_df <- merged_df[, c("Date", "Home_Team", "Away_Team", "Home_Score", "Away_Score",'winner','Home_Wins_Last_8','Home_Losses_Last_8','Away_Wins_Last_8','Away_Losses_Last_8', "Home_ML_AVG", "Away_ML_AVG")]





head(condensed_df)

# Adding the sentiment score to the new odds dataframe

In [None]:
# Step 1: Calculate the date range for the past week
date_range <- function(date) {
  start_date <- as.Date(date) - 7
  end_date <- as.Date(date)
  return(list(start_date, end_date))
}

# Step 2: Function to calculate sentiment_past_week
calculate_sentiment <- function(team, date, df2) {
  date_range <- date_range(date)
  start_date <- date_range[[1]]
  end_date <- date_range[[2]]
  sentiment <- sum(df2$total_score[df2$team == team & df2$archiveDate >= start_date & df2$archiveDate <= end_date], na.rm = TRUE)
  return(sentiment)
}

In [None]:
# Step 3: Apply the function to each row in
condensed_df$sentiment_past_week_away <- mapply(calculate_sentiment, condensed_df$Away_Team, condensed_df$Date, list(df2))
condensed_df$sentiment_past_week_home <- mapply(calculate_sentiment, condensed_df$Home_Team, condensed_df$Date, list(df2))

In [None]:

# sort by date descending, also swap home and away for continuity
condensed_df <- condensed_df[order(condensed_df$Date, decreasing = TRUE), ]

col_names <- colnames(condensed_df)

index_away <- which(col_names == "sentiment_past_week_away")
index_home <- which(col_names == "sentiment_past_week_home")

col_names[index_away] <- "sentiment_past_week_home"
col_names[index_home] <- "sentiment_past_week_away"

condensed_df <- condensed_df[, col_names]



head(condensed_df,n=100 )

In [None]:
library(dplyr)

# Remove rows where all columns are NA or 0
condensed_df <- condensed_df %>%
  filter(rowSums(is.na(.) | . == 0, na.rm = TRUE) < ncol(.))

In [None]:
head(condensed_df)

In [8]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score


In [14]:
overall = pd.read_csv('condensed_df_sentiment2.csv')

In [15]:
overall

Unnamed: 0.1,Unnamed: 0,Date,Home_Team,Away_Team,Home_Score,Away_Score,winner,Home_Wins_Last_8,Home_Losses_Last_8,Away_Wins_Last_8,Away_Losses_Last_8,Home_ML_AVG,Away_ML_AVG,sentiment_past_week_home,sentiment_past_week_away
0,1,2022-02-13,Cincinnati Bengals,Los Angeles Rams,20,23,Los Angeles Rams,6,2,7,1,2.800000,1.450000,1.233828,0.075111
1,2,2022-01-30,Kansas City Chiefs,Cincinnati Bengals,24,27,Cincinnati Bengals,7,1,5,3,1.306290,3.700000,-0.640934,0.727999
2,3,2022-01-30,Los Angeles Rams,San Francisco 49ers,20,17,Los Angeles Rams,7,1,6,2,1.562778,2.625000,0.939055,1.002302
3,4,2022-01-23,Kansas City Chiefs,Buffalo Bills,42,36,Kansas City Chiefs,7,1,6,2,1.740370,2.150000,-0.836742,0.862375
4,5,2022-01-23,Tampa Bay Buccaneers,Los Angeles Rams,27,30,Los Angeles Rams,7,1,6,2,1.663333,2.300000,0.452804,0.987093
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1350,1351,2017-09-10,Los Angeles Rams,Indianapolis Colts,46,9,Los Angeles Rams,1,7,5,3,1.540658,2.625000,0.790335,0.000000
1351,1352,2017-09-10,San Francisco 49ers,Carolina Panthers,3,23,Carolina Panthers,1,7,3,5,2.835000,1.473391,0.000000,0.000000
1352,1353,2017-09-10,Tennessee Titans,Oakland Raiders,16,26,Oakland Raiders,5,3,5,3,1.729000,2.240000,1.630709,0.000000
1353,1354,2017-09-10,Washington Redskins,Philadelphia Eagles,17,30,Philadelphia Eagles,4,4,3,5,1.980000,1.925000,0.000000,0.000000


In [18]:
overall = overall.replace(0, pd.NA).dropna()
overall

RecursionError: maximum recursion depth exceeded while calling a Python object

In [None]:
sentiment_home = overall['sentiment_past_week_home'].values
sentiment_away = overall['sentiment_past_week_away'].values

moneyline_home = overall['Home_ML_AVG'].values
moneyline_away = overall['Away_ML_AVG'].values

sentiment_home1 = sentiment_home.reshape(-1,1)
sentiment_away1 = sentiment_away.reshape(-1,1)

moneyline_home1 = moneyline_home.reshape(-1,1)
moneyline_away1 = moneyline_away.reshape(-1,1)

In [None]:
## train and test sets
## home games
X_train, X_test, y_train, y_test = train_test_split(sentiment_home1, moneyline_home1, test_size = 0.2, random_state = 42)

## away games
X_train1, X_test1, y_train1, y_test1 = train_test_split(sentiment_away1, moneyline_away1, test_size = 0.2, random_state = 42)

## linear model
model = LinearRegression()
model1 = LinearRegression()
model.fit(X_train, y_train)

model1.fit(X_train1, y_train1)

## predictions
predict = model.predict(X_test)
predict1 = model1.predict(X_test1)


# Evaluate the model
score = r2_score(y_test, predict)
print("R-squared score:", score)

score1 = r2_score(y_test1, predict1)
print("R-squared score:", score1)

In [None]:
## use a random forest 

model = RandomForestRegressor(n_estimators = 100, random_state = 42)
model1 = RandomForestRegressor(n_estimators = 100, random_state = 42)

model.fit(X_train, y_train)
model1.fit(X_train1, y_train1)

predictions = model.predict(X_test)
predictions1 = model1.predict(X_test1)

# Evaluate the model
score = r2_score(y_test, predictions)
print("R-squared score:", score)

score1 = r2_score(y_test1, predictions)
print("R-squared score:", score1)