In [1]:
import pandas as pd

In [2]:
# Load the team-level table; the relative path is resolved against the working directory.
teams_csv_path = "teams.csv"
olympicTeams = pd.read_csv(teams_csv_path)

In [3]:
# Plain-text dump of the frame; pandas elides the middle rows and wraps wide columns.
print(olympicTeams)

     team      country  year  events  athletes   age  height  weight  medals  \
0     AFG  Afghanistan  1964       8         8  22.0   161.0    64.2       0   
1     AFG  Afghanistan  1968       5         5  23.2   170.2    70.0       0   
2     AFG  Afghanistan  1972       8         8  29.0   168.3    63.8       0   
3     AFG  Afghanistan  1980      11        11  23.6   168.4    63.2       0   
4     AFG  Afghanistan  2004       5         5  18.6   170.8    64.8       0   
...   ...          ...   ...     ...       ...   ...     ...     ...     ...   
2139  ZIM     Zimbabwe  2000      19        26  25.0   179.0    71.1       0   
2140  ZIM     Zimbabwe  2004      11        14  25.1   177.8    70.5       3   
2141  ZIM     Zimbabwe  2008      15        16  26.1   171.9    63.7       4   
2142  ZIM     Zimbabwe  2012       8         9  27.3   174.4    65.2       0   
2143  ZIM     Zimbabwe  2016      13        31  27.5   167.8    62.2       0   

      prev_medals  prev_3_medals  
0   

In [4]:
# Correlation of every remaining column with the medal count.
# "team" and "country" are text labels, so they are removed before correlating.
numeric_columns = olympicTeams.drop(columns=["team", "country"])
correlation = numeric_columns.corr().loc[:, "medals"]
print(correlation)

year            -0.021603
events           0.771330
athletes         0.840817
age              0.025096
height           0.141055
weight           0.090577
medals           1.000000
prev_medals      0.920048
prev_3_medals    0.918438
Name: medals, dtype: float64


In [None]:
import seaborn as sns

In [None]:
# Examples of good linear relationships: scatter each candidate predictor
# against the medal count with a fitted regression line.
# A single loop replaces three copy-pasted calls, so plot options only need to
# be changed in one place. ci=None skips the confidence band.
for feature in ("athletes", "prev_medals", "prev_3_medals"):
    sns.lmplot(x=feature, y="medals", data=olympicTeams, fit_reg=True, ci=None)

In [None]:
# Example of a weak (poor) linear relationship

In [None]:
# Age versus medals with a fitted regression line.
# ci=None (matching the other regression plots in this notebook) disables the
# confidence band entirely; ci=0 would still make seaborn bootstrap the fit
# just to draw a zero-width band.
sns.lmplot(x="age", y="medals", data=olympicTeams, fit_reg=True, ci=None)


In [None]:
# Histogram of the target column.
# Most countries have earned few medals, while only a few have earned more than 100.
# This may affect how accurate the model is for countries with few medals won.
olympicTeams.plot(kind="hist", y="medals")

In [None]:
# Preview up to 20 rows that contain at least one missing value.
rows_with_missing = olympicTeams.isna().any(axis="columns")
olympicTeams.loc[rows_with_missing].head(n=20)

In [None]:
# Keep only complete rows: any row with a missing value in any column is discarded.
olympicTeams = olympicTeams.dropna(axis="index", how="any")

In [None]:
# Display the frame as it stands after rows with missing values were dropped.
olympicTeams

In [None]:
# Chronological train/test split (described by the author as an 80-20 split):
# rows before the split year are used for fitting, the rest for evaluation.
# .copy() gives each split its own data so later column assignments are safe.
split_year = 2012
is_train_row = olympicTeams["year"] < split_year
is_test_row = olympicTeams["year"] >= split_year
train = olympicTeams.loc[is_train_row].copy()
test = olympicTeams.loc[is_test_row].copy()

In [None]:
# (rows, columns) of the training split.
train.shape

In [None]:
# (rows, columns) of the test split.
test.shape

In [None]:
from sklearn.linear_model import LinearRegression
# Unfitted linear regression estimator created with its default settings.
regression = LinearRegression()

In [None]:
# Feature columns for the baseline model. Each predictor must appear only once:
# a repeated column is perfectly collinear with itself, adds no information and
# makes the individual fitted coefficients ambiguous.
predictors = ["athletes", "prev_medals"]
# Name of the column the model is trained to predict.
target = "medals"

In [None]:
# Fit the model on the training split. The label column is taken from `target`
# (defined alongside `predictors`) instead of repeating the column name, so the
# features and label stay configured in one place.
regression.fit(train[predictors], train[target])

In [None]:
# Predict medal counts for the test rows from the same feature columns used in fitting.
test_features = test[predictors]
predictions = regression.predict(test_features)

In [None]:
# Raw model outputs for the test rows (not yet clipped or rounded).
predictions

In [None]:
# Store the model outputs next to the actual values in the test frame.
test = test.assign(predictions=predictions)

In [None]:
# Test rows with the raw `predictions` column attached.
test

In [None]:
# A medal count cannot be negative, so negative predictions are raised to 0.
test["predictions"] = test["predictions"].clip(lower=0)

In [None]:
# Medal counts are whole numbers, so predictions are rounded to zero decimal places.
test["predictions"] = test["predictions"].round(decimals=0)

In [None]:
# Test rows after predictions were floored at 0 and rounded.
test

In [None]:
from sklearn.metrics import mean_absolute_error

In [None]:
# Mean absolute difference between actual and predicted medal counts on the test split.
error = mean_absolute_error(y_true=test["medals"], y_pred=test["predictions"])

In [None]:
# Mean absolute error of the clipped, rounded predictions on the test split.
error

In [None]:
# Summary statistics of the medal column, useful for judging the size of the error.
olympicTeams["medals"].describe()

In [None]:
# Spot check: test rows for the team code "USA".
test.loc[test["team"].eq("USA")]

In [None]:
# Spot check: test rows for the team code "IRI".
test.loc[test["team"].eq("IRI")]

In [None]:
# Absolute prediction error for every test row.
errors = test["medals"].sub(test["predictions"]).abs()

In [None]:
# Per-row absolute errors, indexed like the test frame.
errors

In [None]:
# Average the per-row absolute errors within each team code.
error_by_team = errors.groupby(by=test["team"]).mean()

In [None]:
# Mean absolute error per team code over the test rows.
error_by_team

In [None]:
# Average actual medal count per team code over the test rows.
medals_by_team = test.groupby("team")["medals"].mean()

In [None]:
# Relative error per team: mean absolute error divided by mean medals won.
errorRatio = error_by_team.div(medals_by_team)

In [None]:
# Unfiltered ratios; entries can be NaN or infinite where a team's mean medal count is 0.
errorRatio

In [None]:
# View only the ratios that are not missing (the result is displayed, not stored).
errorRatio.dropna()

In [None]:
import numpy as np
# Keep only finite ratios: np.isfinite is False for NaN and for +/-inf,
# both of which can come out of the division above.
is_finite_ratio = np.isfinite(errorRatio)
errorRatio = errorRatio.loc[is_finite_ratio]

In [None]:
# Ratios remaining after non-finite values were filtered out.
errorRatio

In [None]:
# Distribution of the per-team relative error.
errorRatio.plot(kind="hist")

In [None]:
# Teams ordered from the smallest to the largest relative error.
errorRatio.sort_values(ascending=True)

In [None]:
# We can conclude that this model is effective at predicting medals for countries that
# participate in a high number of events and have a high medal count.

In [None]:
##Linear Regression

In [None]:
# Second experiment: restrict both splits to rows with fewer than 10 medals.
# NOTE(review): training uses years before 2012 while testing starts at 2016,
# so rows with 2012 <= year < 2016 land in neither split - confirm this gap is intended.
few_medals = olympicTeams["medals"] < 10
train = olympicTeams.loc[(olympicTeams["year"] < 2012) & few_medals]
test = olympicTeams.loc[(olympicTeams["year"] >= 2016) & few_medals]

In [None]:
# Fresh, unfitted linear regression estimator for the low-medal experiment.
model = LinearRegression()

In [None]:
# Features are every column left after removing the label ("medals"), the two
# identifier columns and "age"; the label is the medal count itself.
excluded_columns = ["medals", "country", "team", "age"]
x_train, y_train = train.drop(columns=excluded_columns), train["medals"]
x_test, y_test = test.drop(columns=excluded_columns), test["medals"]

In [None]:
# Show both shapes. A notebook cell only echoes its last expression, so the two
# values are combined into one tuple instead of leaving the first one discarded.
x_train.shape, y_train.shape

In [None]:
# Fit the low-medal model on the training features and labels.
model.fit(X=x_train, y=y_train)

In [None]:
# Predicted medal counts for the held-out feature rows.
y_pred = model.predict(X=x_test)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the held-out predictions with mean squared error and R-squared.
mse, r2 = (
    mean_squared_error(y_true=y_test, y_pred=y_pred),
    r2_score(y_true=y_test, y_pred=y_pred),
)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

In [None]:
'''The model demonstrates relatively low error, with an MSE of about 1.88 (in squared medals; MSE is not a percentage), indicating that predictions are generally close to the actual values. 
However, with an R-squared of 0.58 (58% of the variance explained), there is still significant unexplained variance (42%), 
suggesting that additional factors or features might improve the model's ability to predict Olympic medal outcomes more accurately.'''

In [None]:
# Pair each actual medal count with its prediction, keeping y_test's row index.
results = y_test.to_frame(name="Actual").assign(Predicted=y_pred)

In [None]:
# Predicted versus actual medal counts with a fitted line and no confidence band.
sns.lmplot(data=results, x="Predicted", y="Actual", fit_reg=True, ci=None)