## Turkey recognition

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb              # convenient plotting functionality

## Import data

In [None]:
# Read the training set and the test set from their JSON files in one pass.
data_train, data_test = map(
    pd.read_json,
    ('../input/train.json', '../input/test.json'),
)

In [None]:
# Report the number of rows in each dataframe.
for label, frame in (
    ("Number of training samples: \t", data_train),
    ("Number of test samples: \t", data_test),
):
    print(label, len(frame))

## What does the data look like?

In [None]:
# Preview the first five rows of the training dataframe.
data_train.head(n=5)

## How many are turkeys and how many aren't turkeys?

In [None]:
# Bar chart of the class balance (count of rows per 'is_turkey' value).
# The column is named via x=... because newer seaborn releases interpret the
# first positional argument as `data`, not as the variable to count.
sb.countplot(x='is_turkey', data=data_train)

## Let's get all of the audio files the same size. 
## What are their current sizes?

In [None]:
# The 'audio_embedding' column of the dataframe holds the per-sample data.
# Convert the first sample to an array and draw it as an image with a colour scale.
image = np.array(data_train['audio_embedding'][0])
first_plot = plt.imshow(image)
plt.colorbar(first_plot)

In [None]:
# Same view for another sample (label 100 of the 'audio_embedding' column).
image = np.array(data_train['audio_embedding'][100])
other_plot = plt.imshow(image)
plt.colorbar(other_plot)

In [None]:
# Each audio_embedding was displayed above as a 2-D array.
# Record how many rows every sample has, then check whether all samples share
# the same length by counting samples per length, split by label.

data_train['length'] = data_train['audio_embedding'].apply(len)
plt.yscale('log')  # log scale on the count axis
# Pass the column as x=...: newer seaborn releases treat the first positional
# argument as `data`, which would clash with the data= keyword below.
sb.countplot(x='length', hue='is_turkey', data=data_train)
plt.show()

## Let's make each spectrogram sized 10 by 128

In [None]:
# Target sample size: number of rows per sample, and values per row.
new_length, feature_size = 10, 128

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [None]:
# Bring every sample to new_length rows; padding is appended at the end ('post').
# Train and test share the same options so they end up with identical shapes.
pad_options = dict(maxlen=new_length, padding='post')
X = pad_sequences(data_train['audio_embedding'], **pad_options)
X_test = pad_sequences(data_test['audio_embedding'], **pad_options)

In [None]:
# Visual check of the first padded test sample.
first_test_sample = X_test[0]
plt.imshow(first_test_sample)

In [None]:
# Target variable: the 'is_turkey' label column as a plain array.
y = data_train.loc[:, 'is_turkey'].values

In [None]:
# Show the dimensions of the padded training array.
np.shape(X)

## Let's build our model

In [None]:
from keras.models import Model, Sequential
from keras.layers import Dense, Dropout, Bidirectional, LSTM, Reshape, GlobalMaxPooling1D, GlobalAveragePooling1D, Input, concatenate, BatchNormalization, Dense, Conv2D, MaxPooling2D, Flatten,Activation,Embedding
from keras.optimizers import Adam

In [None]:
# Small 2-D convolutional network for binary (turkey / not turkey) classification.
# Every sample is fed as a single-channel image, so the input shape is derived
# from the padding constants rather than repeating the literals 10 and 128.
model = Sequential()

# Convolution stage 1: 256 filters, 3x3 kernel, ReLU, then 2x2 max pooling.
model.add(Conv2D(256, (3, 3), input_shape=(new_length, feature_size, 1)))
model.add(Activation("relu"))
model.add(MaxPooling2D(pool_size=(2, 2)))

# Convolution stage 2: 128 filters, 3x3 kernel, ReLU, then 2x2 max pooling.
model.add(Conv2D(128, (3, 3)))
model.add(Activation("relu"))
model.add(MaxPooling2D(pool_size=(2, 2)))

# Classifier head: two fully connected layers followed by light dropout.
model.add(Flatten())
model.add(Dense(128, activation="relu"))
model.add(Dense(128, activation="relu"))

model.add(Dropout(0.1))

# Single sigmoid unit: the output is a score in [0, 1] for the positive class.
model.add(Dense(1))
model.add(Activation('sigmoid'))

# The learning rate is passed positionally because its keyword name differs
# between Keras releases (`lr` in older ones, `learning_rate` in newer ones).
model.compile(loss='binary_crossentropy', optimizer=Adam(0.0001), metrics=['accuracy'])

In [None]:
# Reshape the data for the 2-D conv net: add a trailing channel axis so each
# sample has shape (new_length, feature_size, 1). Using the constants keeps
# this step in sync with the padding step instead of repeating 10 and 128.
n_images = X.shape[0]
X_reshaped = X.reshape(n_images, new_length, feature_size, 1)

n_images_test = X_test.shape[0]
X_test_reshaped = X_test.reshape(n_images_test, new_length, feature_size, 1)

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
# Keep a copy of the untrained (randomly initialised) weights on disk so a
# later cell can reset the model to this starting point.
initial_weights_path = 'model.h5'
model.save_weights(initial_weights_path)

## Run the model

In [None]:
from keras.callbacks import ReduceLROnPlateau

In [None]:
# Learning-rate schedule: multiply the rate by 0.1 (never going below 1e-8)
# when the monitored 'acc' metric stops improving, with a patience of 2 epochs.
# NOTE(review): the name under which accuracy is logged differs between Keras
# releases — confirm that 'acc' is the name used in this environment.
reduce_lr = ReduceLROnPlateau(
    monitor='acc',
    factor=0.1,
    patience=2,
    verbose=1,
    min_lr=1e-8,
)

In [None]:
# Train on the complete training set (no validation data) for 20 epochs.
model.fit(
    X_reshaped,
    y,
    epochs=20,
    batch_size=256,
    verbose=2,
    callbacks=[reduce_lr],
)

## Now use the model on test data

In [None]:
# Score every reshaped test sample with the trained model.
y_test = model.predict(x=X_test_reshaped, verbose=1)

In [None]:
# Build the submission table: one row per test video with its predicted score.
# NOTE(review): the prediction column is named 'is_turkey (pred)' while the
# training label column is 'is_turkey' — confirm the header the submission expects.
video_ids = data_test['vid_id'].values
predicted_scores = list(y_test.flatten())  # flatten the model output to one score per row
submission = pd.DataFrame({
    'vid_id': video_ids,
    'is_turkey (pred)': predicted_scores,
})

In [None]:
# Preview the first five rows of the submission table.
submission.head(n=5)

In [None]:
# Write the submission table to disk without the dataframe index column.
submission.to_csv(path_or_buf="submission.csv", index=False)

## Since the test data doesn't include labels to check against, let's run the model again, this time splitting the training data into training and validation subsets.

In [None]:
from sklearn.model_selection import train_test_split 
# Hold out 20% of the training data for validation.
# stratify=y keeps the label ratio the same in both subsets, and the fixed
# random_state makes the split (and therefore the reported score) reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_reshaped,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Restore the randomly initialised weights saved earlier so this run starts
# from the same point as the first one.
model.load_weights('model.h5')
# Re-compile to obtain a fresh optimizer. Loading weights does not reset the
# optimizer, so its internal state and any learning-rate reduction applied by
# reduce_lr during the previous fit would otherwise carry over into this run.
model.compile(loss='binary_crossentropy', optimizer=Adam(0.0001), metrics=['accuracy'])
# Train on the training subset and evaluate on the held-out subset each epoch.
history = model.fit(
    X_train,
    y_train,
    batch_size=256,
    epochs=20,
    validation_data=(X_val, y_val),  # passed as a tuple, the documented form
    callbacks=[reduce_lr],
    verbose=2,
)

## How well did we do with the validation set?

In [None]:
from sklearn.metrics import accuracy_score

# evaluate() returns the loss followed by the metrics configured at compile
# time, so index 1 is the accuracy.
y_pred_val = model.evaluate(X_val, y_val, verbose=1)
# The score is computed on the held-out validation subset, so label it as such
# (the original message called it train accuracy).
print("Validation accuracy : ", y_pred_val[1])