In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Read the two input CSVs: the scores/betting table into `data`
# and the team table into `teams`.
scores_csv = "../input/nfl-scores-and-betting-data/spreadspoke_scores.csv"
teams_csv = "../input/nfl-scores-and-betting-data/nfl_teams.csv"
data = pd.read_csv(scores_csv)
teams = pd.read_csv(teams_csv)

In [None]:
# Treat empty / whitespace-only cells as missing values.
data = data.replace(r'^\s*$', np.nan, regex=True)

# Keep only rows that have a home score, a favorite id and an over/under line,
# restricted to the 1980 season onward, then renumber the rows from 0.
required_present = data[["score_home", "team_favorite_id", "over_under_line"]].notnull().all(axis=1)
data = data[required_present & (data.schedule_season >= 1980)].reset_index(drop=True)

In [None]:
# Recode schedule_week so that every value is a numeric string.
# The order of the entries matters: week '18' is folded into '17' first, so the
# '18' that wildcard games receive afterwards is not rewritten again.
week_recoding = [
    (['18'], '17'),
    (['Wildcard', 'WildCard'], '18'),
    (['Division'], '19'),
    (['Conference'], '20'),
    (['Superbowl', 'SuperBowl'], '21'),
]
for raw_labels, week_code in week_recoding:
    data.loc[data.schedule_week.isin(raw_labels), 'schedule_week'] = week_code

In [None]:
# Lookup from full team name to team id, built once from the teams table.
team_id_by_name = teams.set_index('team_name')['team_id'].to_dict()
for side in ('team_home', 'team_away'):
    # names that are missing from the lookup become NaN
    data[side] = data[side].map(team_id_by_name)

data['over_under_line'] = data['over_under_line'].astype(float)
data['schedule_week'] = data['schedule_week'].astype(int)

# 1/0 flags: is the favorite the away side / the home side?
# (compared against the team ids mapped just above)
data["team_favorite_away"] = data["team_favorite_id"].eq(data["team_away"]).astype(int)
data["team_favorite_home"] = data["team_favorite_id"].eq(data["team_home"]).astype(int)

for flag in ("schedule_playoff", "stadium_neutral"):
    data[flag] = data[flag].astype(int)

# Express the season as an offset from the year 2000.
data["schedule_season"] = data["schedule_season"] - 2000

In [None]:
# Target: 1 when the home side scored at least as many points as the away side
# (a tie therefore counts as 1), otherwise 0.
data["result"] = data["score_home"].ge(data["score_away"]).astype(int)

# Remove the raw score columns (they define the target) and the other columns
# that are not used as features, then replace remaining missing values with 0.
unused_columns = ["score_away","score_home","team_favorite_id","schedule_date","stadium_neutral","weather_detail"]
data = data.drop(columns=unused_columns).fillna(0)

# Integer-encode the categorical columns in order of first appearance.
for column in ("stadium", "team_away", "team_home"):
    data[column] = pd.factorize(data[column])[0]

data['weather_humidity'] = pd.to_numeric(data['weather_humidity'])

In [None]:
from sklearn.model_selection import train_test_split
# Optional feature reduction (top 10 features from previous part), currently disabled:
#data = data.drop(["weather_temperature","weather_wind_mph","weather_humidity","team_favorite_away","team_favorite_home"],1)

# Separate the target column from the feature table.
train_data = data.drop(columns='result')
labels = data['result'].to_numpy(copy=True)
feature_list = train_data.columns.tolist()

# Random 80/20 hold-out split (no random_state is set, so it differs per run).
train_features, test_features, train_labels, test_labels = train_test_split(
    train_data, labels, test_size=0.20
)

In [None]:
# Model, preprocessing, ensemble and validation utilities used by the code below.
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import LeaveOneOut
import warnings
# No category or module filter is given, so this silences every warning for the
# rest of the kernel session, including any raised by scikit-learn.
warnings.filterwarnings("ignore")
from sklearn.model_selection import cross_val_score
# Baseline: a single decision tree fitted on the current training split.
model = DecisionTreeClassifier().fit(train_features,train_labels)
predictions = model.predict(test_features)
# Labels are 0/1, so the mean absolute difference is the misclassification rate.
errors = np.abs(predictions - test_labels).mean()
print('Accuracy ',1 - errors)

# 5-fold cross-validation over the full feature table
model = DecisionTreeClassifier()
scores = cross_val_score(model, train_data, labels, cv=5)
print('Cross-Validation Accuracy Scores', scores)
print('Accuracy: %.3f (%.3f)' % (scores.mean(), scores.std()))


# Bagging: 100 scaled decision trees, each fitted on 100 sampled rows and
# 10 sampled features. The score printed is measured on the training data itself.
# NOTE(review): `base_estimator` was renamed `estimator` in newer scikit-learn
# releases - confirm against the installed version.
pipeline = make_pipeline(StandardScaler(),DecisionTreeClassifier())
bgclassifier = BaggingClassifier(
    base_estimator=pipeline,
    n_estimators=100,
    max_features=10,
    max_samples=100,
    random_state=1,
    n_jobs=5,
)
bgclassifier.fit(train_data,labels)
print()
print("Bagging")
print('Model training Score: %.3f' % bgclassifier.score(train_data,labels))

# Leave-one-out cross-validation: one fit per row of the feature table
cv = LeaveOneOut()
model = DecisionTreeClassifier()
scores = cross_val_score(model, train_data, labels, cv=cv)
print()
print("LOOCV")
print('Accuracy: %.3f (%.3f)' % (scores.mean(), scores.std()))

# Test-set accuracy as the training share shrinks from 90% to 10%.
# Note: every iteration overwrites the global train/test variables, so after
# the loop they hold the last split (test_size = 0.9).
tree_stats = {}  # training-set percentage -> test accuracy
for i in range(1,10):
    train_features, test_features, train_labels, test_labels = train_test_split(
        train_data, labels, test_size = 0.10*i
    )
    model = DecisionTreeClassifier().fit(train_features,train_labels)
    predictions = model.predict(test_features)
    # labels are 0/1, so the mean absolute difference is the error rate
    Accuracy = 1 - np.abs(predictions - test_labels).mean()
    tree_stats[100 - 10*i]  = Accuracy
print()
plt.title("Decision Tree, % training set vs Test Set Accuracy")
plt.ylim(0.5,0.6)
plt.bar(list(tree_stats.keys()), list(tree_stats.values()), width = 8)

In [None]:
# Logistic regression: hold-out evaluation
from sklearn.linear_model import LogisticRegression

# Draw a fresh 80/20 split for this evaluation. Without it, this cell silently
# reuses the train/test variables left behind by the last iteration of the
# preceding learning-curve loop (test_size = 0.9), i.e. it would fit on only
# about 10% of the rows.
train_features, test_features, train_labels, test_labels = train_test_split(train_data, labels, test_size = 0.20)

model = LogisticRegression()
model.fit(train_features,train_labels)
predictions = model.predict(test_features)
# Labels are 0/1, so the mean absolute difference is the misclassification rate.
errors = np.sum(abs(predictions - test_labels))/len(predictions)
print('Accuracy ',1 - errors)

# 5-fold cross-validation over the full feature table
model = LogisticRegression()
scores = cross_val_score(model, train_data, labels, cv=5)
print('Cross-Validation Accuracy Scores', scores)
print('Accuracy: %.3f (%.3f)' % (scores.mean(), scores.std()))

# Leave-one-out cross-validation: one fit per row of the feature table
cv = LeaveOneOut()
model = LogisticRegression()
scores = cross_val_score(model, train_data, labels, cv=cv)
print()
print("LOOCV")
print('Accuracy: %.3f (%.3f)' % (scores.mean(), scores.std()))

# Bagging: 100 scaled logistic regressions, each fitted on 100 sampled rows and
# 10 sampled features. The score printed is measured on the training data itself.
# NOTE(review): `base_estimator` was renamed `estimator` in newer scikit-learn
# releases - confirm against the installed version.
pipeline = make_pipeline(StandardScaler(),LogisticRegression())
bgclassifier = BaggingClassifier(
    base_estimator=pipeline,
    n_estimators=100,
    max_features=10,
    max_samples=100,
    random_state=1,
    n_jobs=5,
)
bgclassifier.fit(train_data,labels)
print()
print("Bagging")
print('Model training Score: %.3f' % bgclassifier.score(train_data,labels))

# Test-set accuracy as the training share shrinks from 90% to 10%.
# Every iteration overwrites the global train/test variables.
logreg_stats = {}  # training-set percentage -> test accuracy
for i in range(1,10):
    train_features, test_features, train_labels, test_labels = train_test_split(
        train_data, labels, test_size = 0.10*i
    )
    model = LogisticRegression().fit(train_features,train_labels)
    predictions = model.predict(test_features)
    # labels are 0/1, so the mean absolute difference is the error rate
    Accuracy = 1 - np.abs(predictions - test_labels).mean()
    logreg_stats[100 - 10*i]  = Accuracy
print()
plt.title("Logistic Regression, % training set vs Test Set Accuracy")
plt.ylim(0.6,0.7)
plt.bar(list(logreg_stats.keys()), list(logreg_stats.values()), width = 8)

In [None]:
# Gaussian Naive Bayes: hold-out evaluation
from sklearn.naive_bayes import GaussianNB

# Draw a fresh 80/20 split for this evaluation. Without it, this cell silently
# reuses the train/test variables left behind by the last iteration of the
# preceding learning-curve loop (test_size = 0.9), i.e. it would fit on only
# about 10% of the rows.
train_features, test_features, train_labels, test_labels = train_test_split(train_data, labels, test_size = 0.20)

model = GaussianNB()
model.fit(train_features,train_labels)
predictions = model.predict(test_features)
# Labels are 0/1, so the mean absolute difference is the misclassification rate.
errors = np.sum(abs(predictions - test_labels))/len(predictions)
print('Accuracy ',1 - errors)

# 5-fold cross-validation over the full feature table
model = GaussianNB()
scores = cross_val_score(model, train_data, labels, cv=5)
print('Cross-Validation Accuracy Scores', scores)
print('Accuracy: %.3f (%.3f)' % (scores.mean(), scores.std()))

# Leave-one-out cross-validation: one fit per row of the feature table
cv = LeaveOneOut()
model = GaussianNB()
scores = cross_val_score(model, train_data, labels, cv=cv)
print()
print("LOOCV")
print('Accuracy: %.3f (%.3f)' % (scores.mean(), scores.std()))

# Bagging: 100 scaled Gaussian NB models, each fitted on 100 sampled rows and
# 10 sampled features. The score printed is measured on the training data itself.
# NOTE(review): `base_estimator` was renamed `estimator` in newer scikit-learn
# releases - confirm against the installed version.
pipeline = make_pipeline(StandardScaler(),GaussianNB())
bgclassifier = BaggingClassifier(
    base_estimator=pipeline,
    n_estimators=100,
    max_features=10,
    max_samples=100,
    random_state=1,
    n_jobs=5,
)
bgclassifier.fit(train_data,labels)
print()
print("Bagging")
print('Model training Score: %.3f' % bgclassifier.score(train_data,labels))

# Test-set accuracy of Gaussian Naive Bayes as the training share shrinks
# from 90% to 10%. Every iteration overwrites the global train/test variables.
nb_stats = {}  #dict from %train data to accuracy
for i in range(1,10):
    train_features, test_features, train_labels, test_labels = train_test_split(train_data, labels, test_size = 0.10*i)
    # Fit the Naive Bayes model here; the previous version instantiated
    # LogisticRegression, so nb_stats actually held logistic-regression results.
    model = GaussianNB()
    model.fit(train_features,train_labels)
    predictions = model.predict(test_features)
    # labels are 0/1, so the mean absolute difference is the error rate
    Accuracy = 1 - np.sum(abs(predictions - test_labels))/len(predictions)
    nb_stats[100 - 10*i]  = Accuracy
print()
# Put the heading on the figure itself, as the decision-tree and
# logistic-regression charts do, instead of printing it as text.
plt.title("Gaussian NB, % training set vs Test Set Accuracy")
plt.ylim(0.6,0.7)
plt.bar(nb_stats.keys(),nb_stats.values(),width = 8)

In [None]:
# Compare four tree-ensemble classifiers on a single shared hold-out split.
#
# Results are collected in two dicts keyed by model name:
#   acc  - 1 - mean absolute error (equals accuracy because labels are 0/1)
#   RMSE - root mean squared error of the predicted labels
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

acc = {}
RMSE = {}

# (printed heading, key used in acc/RMSE, unfitted estimator)
candidates = [
    ("Bagging Classifier", "BaggingClassifier", BaggingClassifier(base_estimator = DecisionTreeClassifier())),
    ("Random Forest Classifier", "RandomForestClassifier", RandomForestClassifier(n_estimators=150)),
    ("Gradient Boosting Classifier", "GradientBoostingClassifier", GradientBoostingClassifier()),
    ("XGBClassifier", "XGBClassifier", XGBClassifier()),
]

# One 85/15 split shared by every model. Previously each model drew its own
# random split, so differences between models were confounded with
# differences between test sets.
train_features, test_features, train_labels, test_labels = train_test_split(train_data, labels, test_size = 0.15)

for heading, key, model in candidates:
    model.fit(train_features,train_labels)
    predictions = model.predict(test_features)

    # Compute each metric once and reuse it for printing and for the summary dicts.
    mae = metrics.mean_absolute_error(test_labels,predictions)
    mse = metrics.mean_squared_error(test_labels,predictions)
    rmse = np.sqrt(mse)

    print(heading)
    print('Mean Absolute Error:', mae)
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', rmse)
    print()

    acc[key] = 1 - mae
    RMSE[key] = rmse

In [None]:
# Display the per-model dictionary of 1 - mean absolute error on the hold-out labels.
acc

In [None]:
# Display the per-model dictionary of root mean squared error on the hold-out labels.
RMSE

In [None]:
# Bar chart of the per-model accuracy collected in `acc`.
plt.title("Accuracy of different models")
plt.ylim(0.3,0.7)
plt.xticks(rotation=90)
plt.bar(list(acc.keys()), list(acc.values()))
plt.show()

# Bar chart of the per-model RMSE collected in `RMSE`.
# The bar heights must come from RMSE, not acc: the previous version plotted
# the accuracy values a second time under the RMSE title.
plt.title("RMSE of different models")
plt.ylim(0.3,0.7)
plt.xticks(rotation=90)
plt.bar(list(RMSE.keys()), list(RMSE.values()))
plt.show()