## Dependencies

In [1]:
import numpy as np
import pandas as pd
import tensorflow

# Olympic Data
We pared down a Kaggle data set to the Summer data. We included only the data necessary for our searching.
## The Dataset
The following information about each participant in each event are included within the CSV:

* ID: unique identifier for each participant
* Name: each participant's name
* Sex: M for male, F for female
* Age: in years (11 to 71)
* Height: in cm
* Weight: in kilograms
* Team: Group the participant is competing with
* NOC: Three letter country abbreviation
* Year: Year of Olympic event (1896-2016)
* Sport: Category of competetion
* Medal: NA(no medal), Bronze, Silver, Gold

In [2]:
olympics_unfiltered = pd.read_csv('Summer_Olympics.csv')
olympics_unfiltered.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Year,Sport,Medal_Type,Medals
0,1,A Dijiang,M,24,180,80.0,China,CHN,1992,Basketball,No Medal,No Medal
1,2,A Lamusi,M,23,170,60.0,China,CHN,2012,Judo,No Medal,No Medal
2,12,Jyri Tapani Aalto,M,31,172,70.0,Finland,FIN,2000,Badminton,No Medal,No Medal
3,13,Minna Maarit Aalto,F,30,159,55.5,Finland,FIN,1996,Sailing,No Medal,No Medal
4,13,Minna Maarit Aalto,F,34,159,55.5,Finland,FIN,2000,Sailing,No Medal,No Medal


# Country Data
From the world bank we collected each country's population and GDP after 1960. Due to some limitations of the data, not all countries had available information for each year. 

* Country
* Population
* GDP


In [3]:
country_df = pd.read_csv("Population_&_GDP_by_Year.csv")
country_df.head()

country_df = country_df[country_df.GDP != ".."].reset_index()
country_df.head()

Unnamed: 0.1,index,Unnamed: 0,Year,Country,Population,GDP
0,0,0,1960,CHN,667070000.0,59716467625.0
1,1,1,1960,FIN,4429634.0,5224102196.0
2,3,3,1960,NOR,3581239.0,5163271598.0
3,5,5,1960,FRA,46621669.0,62651474947.0
4,6,6,1960,ESP,30455000.0,12072126075.0


# Merging the Datasets


We merged the dataframes using the country and year as the identifying keys.

Because we do not have data before 1960 for GDP and Population we filtered the Olympic data to the Games held in 1960 and after.


In [4]:
olympics_filtered = olympics_unfiltered[olympics_unfiltered["Year"] > 1960].reset_index()
olympics_filtered = olympics_unfiltered[olympics_unfiltered["Sport"] != "Art Competitions"].reset_index()

In [7]:
olympics_df = pd.merge(country_df, olympics_filtered, how='right', left_on=["Year", "Country"], right_on= ["Year", "NOC"])
olympics_df.head()

Unnamed: 0.1,index_x,Unnamed: 0,Year,Country,Population,GDP,index_y,ID,Name,Sex,Age,Height,Weight,Team,NOC,Sport,Medal_Type,Medals
0,1.0,1.0,1960,FIN,4429634.0,5224102196.0,1618,1427,Matti Olavi Aho,M,26,181,82.5,Finland,FIN,Boxing,No Medal,No Medal
1,1.0,1.0,1960,FIN,4429634.0,5224102196.0,1623,1431,Viljo Johannes Aho,M,27,174,68.5,Finland,FIN,Boxing,No Medal,No Medal
2,1.0,1.0,1960,FIN,4429634.0,5224102196.0,1654,1461,Viktor Ahven,M,31,187,103.0,Finland,FIN,Wrestling,No Medal,No Medal
3,1.0,1.0,1960,FIN,4429634.0,5224102196.0,1867,1614,Rainer Robert kerfelt,M,25,177,74.0,Finland,FIN,Canoeing,No Medal,No Medal
4,1.0,1.0,1960,FIN,4429634.0,5224102196.0,1868,1615,Rolf Rafael kerfelt,M,19,180,73.0,Finland,FIN,Canoeing,No Medal,No Medal


In [8]:
physical_event_country_df = olympics_df[["Sex", "Age", "Height", "Weight", "Year", "Sport", "Medals", "NOC", "GDP", "Population"]]
physical_event_country_df.head()

Unnamed: 0,Sex,Age,Height,Weight,Year,Sport,Medals,NOC,GDP,Population
0,M,26,181,82.5,1960,Boxing,No Medal,FIN,5224102196.0,4429634.0
1,M,27,174,68.5,1960,Boxing,No Medal,FIN,5224102196.0,4429634.0
2,M,31,187,103.0,1960,Wrestling,No Medal,FIN,5224102196.0,4429634.0
3,M,25,177,74.0,1960,Canoeing,No Medal,FIN,5224102196.0,4429634.0
4,M,19,180,73.0,1960,Canoeing,No Medal,FIN,5224102196.0,4429634.0


## Data Pre-Processing Medal

In [9]:
X = physical_event_country_df.drop("Medals", axis=1)
y = physical_event_country_df["Medals"]
print(X.shape, y.shape)

(166677, 9) (166677,)


In [10]:
X = pd.get_dummies(X)
X.head()
X.shape

(166677, 2559)

## Dependencies

In [11]:
import tensorflow.keras
from sklearn.preprocessing import *
from keras.models import Sequential
from keras.utils import to_categorical
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

Using TensorFlow backend.


In [34]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, stratify=y)

## One-hot encode the labels

In [35]:
## y_train Encoder
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y = label_encoder.transform(y_train)
len(encoded_y)

125007

In [36]:
one_hot_y = to_categorical(encoded_y)
len(one_hot_y)

125007

In [37]:
## y_test Encoder
label_encoder = LabelEncoder()
label_encoder.fit(y_test)
encoded_y_test = label_encoder.transform(y_test)
len(encoded_y_test)

41670

In [38]:
one_hot_y_test = to_categorical(encoded_y_test)
len(one_hot_y_test)

41670

# Create a Deep Learning Model

In [39]:
#Scale the Dataset
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

  return self.partial_fit(X, y)


In [18]:
#Create the Model
model = tensorflow.keras.Sequential()
model.add(Dense(units=300, activation='relu', input_dim=2559))
model.add(Dense(units=200, activation='relu'))
model.add(Dense(units=2, activation='softmax'))

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [19]:
# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 300)               768000    
_________________________________________________________________
dense_1 (Dense)              (None, 200)               60200     
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 402       
Total params: 828,602
Trainable params: 828,602
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Fit the model
model.fit(
    X_train_scaled,
    one_hot_y,
    epochs=10,
    shuffle=True,
    verbose=2,)

Epoch 1/10
125007/125007 - 28s - loss: 0.3293 - acc: 0.8714
Epoch 2/10
125007/125007 - 29s - loss: 0.2731 - acc: 0.8895
Epoch 3/10
125007/125007 - 30s - loss: 0.2579 - acc: 0.8925
Epoch 4/10
125007/125007 - 29s - loss: 0.2497 - acc: 0.8938
Epoch 5/10
125007/125007 - 29s - loss: 0.2437 - acc: 0.8952
Epoch 6/10
125007/125007 - 29s - loss: 0.2395 - acc: 0.8961
Epoch 7/10
125007/125007 - 28s - loss: 0.2361 - acc: 0.8964
Epoch 8/10
125007/125007 - 29s - loss: 0.2340 - acc: 0.8971
Epoch 9/10
125007/125007 - 29s - loss: 0.2313 - acc: 0.8978
Epoch 10/10
125007/125007 - 29s - loss: 0.2296 - acc: 0.8981


<tensorflow.python.keras.callbacks.History at 0xb37274a58>

In [22]:
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, one_hot_y_test, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

41670/41670 - 5s - loss: 0.2827 - acc: 0.8919
Normal Neural Network - Loss: 0.2827485149001686, Accuracy: 0.8918886780738831


In [24]:
## https://www.tensorflow.org/model_optimization/guide/pruning/pruning_with_keras
import tensorflow_model_optimization as tfmot

In [40]:
pruning_schedule = tfmot.sparsity.keras.PolynomialDecay(
                        initial_sparsity=0.5, final_sparsity=.9,
                        begin_step=1000, end_step=5000)

model_for_pruning = tfmot.sparsity.keras.prune_low_magnitude(model, pruning_schedule=pruning_schedule)



In [41]:
model_for_pruning.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
model_for_pruning.fit(
    X_train_scaled,
    one_hot_y,
    batch_size=4000,
    epochs=2,
    shuffle=True,
    verbose=1,
    validation_data=(X_test_scaled, one_hot_y_test)
)