In [None]:
import numpy as np
np.random.seed(42)
import pandas as pd
import bs4
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from keras.models import model_from_json
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

import warnings
# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

import os
# Ask OpenMP-backed libraries to use 4 threads.
# NOTE(review): this is assigned after numpy/sklearn/keras are imported
# above; confirm those libraries still pick the value up at this point.
os.environ['OMP_NUM_THREADS'] = '4'

In [None]:
# Pretrained word vectors (300-dimensional according to the file name).
EMBEDDING_FILE = '../input/crawl300d2m/crawl-300d-2M.vec'

# Competition inputs: labelled training rows, unlabelled test rows and the
# submission template whose columns define the target classes.
train = pd.read_csv(
    '../input/sdg-train-test-dataset/Devex_train.csv',
    encoding='latin-1',
)
test = pd.read_csv(
    '../input/sdg-train-test-dataset/Devex_test_questions.csv',
    encoding='latin-1',
)
submission = pd.read_csv(
    '../input/sdg-train-test-dataset/Devex_submission_format.csv'
)

In [None]:
def _strip_html(value):
    """Return the visible text of an HTML snippet.

    Non-string cells (e.g. NaN for a missing document) are returned
    unchanged so the later ``fillna`` steps can deal with them instead of
    BeautifulSoup raising on a float.
    """
    if not isinstance(value, str):
        return value
    return bs4.BeautifulSoup(value, 'lxml').get_text()


# Apply the same cleaning to BOTH splits: the tokenizer and the model must see
# test text in the same form as the training text, otherwise markup tokens
# that were removed from train would still be present at prediction time.
train['Text'] = train['Text'].apply(_strip_html)
test['Text'] = test['Text'].apply(_strip_html)

In [None]:
# Target classes are all submission columns except the first one.
class_names = submission.columns[1:].tolist()

# Append one (initially empty) column per class to the training frame.
train = train.reindex(columns=[*train.columns, *class_names])

# Keep only object/float columns and replace every missing value with the
# literal string 'None'.
train = (
    train
    .select_dtypes(include=['object', 'float'])
    .fillna('None')
)

In [None]:
#Transforming the target variables into a useful format
def _label_prefix(value):
    """Return the first five characters of a label, or "None" for non-strings."""
    return value[:5] if isinstance(value, str) else "None"


# Start from the one-hot encoding of "Label 1" ...
train["targets"] = train["Label 1"].apply(_label_prefix)
targets = pd.get_dummies(train["targets"]).astype(int)

# ... then fold in the remaining label columns, selected by position.
# NOTE(review): columns 4..14 are presumably "Label 2".."Label 12" — confirm
# against the CSV schema; the explicit break below stops at "Label 12".
for column in train.columns[4:15]:
    train["targets"] = train[column].apply(_label_prefix)
    new_targets = pd.get_dummies(train["targets"]).astype(int)
    # A plain `targets + new_targets` aligns on the union of columns and
    # yields NaN for any label present in only one of the two frames, so a
    # label first seen in a later column would stay NaN forever.
    # `add(..., fill_value=0)` treats the missing side as 0 instead.
    targets = targets.add(new_targets, fill_value=0)
    if column == "Label 12":
        break

# Collapse counts to a 0/1 indicator: two labels of one row can share the same
# five-character prefix, and the sigmoid/binary-crossentropy head expects
# targets in {0, 1}.
targets = targets.clip(upper=1).astype(int)

In [None]:
# Vectorisation settings.
max_features = 30000  # passed to the tokenizer as num_words
maxlen = 200          # every sequence is padded/truncated to this length
embed_size = 300      # width of one embedding vector

# Raw documents of both splits.
train_text = train['Text']
test_text = test['Text']

# Missing documents are replaced by the placeholder string "fillna".
X_train = train_text.fillna("fillna").values
X_test = test_text.fillna("fillna").values
y_train = targets[class_names].values

# Convert the text into integer sequences. The vocabulary is fitted on the
# training and test documents together.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts([*X_train, *X_test])

# X_* are rebound from raw strings to lists of word ids; x_* are the
# fixed-length padded arrays.
X_train = tokenizer.texts_to_sequences(X_train)
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = tokenizer.texts_to_sequences(X_test)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

# Helper used while reading the pretrained embedding file.
def get_coefs(word, *arr):
    """Pair a word with its coefficients converted to a float32 array.

    Args:
        word: the token (first field of an embedding line).
        *arr: the remaining fields, convertible to float.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a float32 ndarray.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Read the vectors inside a context manager so the file handle is closed, and
# decode explicitly as UTF-8 rather than relying on the platform default.
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    embeddings_index = dict(
        get_coefs(*line.rstrip().rsplit(' ')) for line in embedding_file
    )

word_index = tokenizer.word_index
# Tokenizer word ids start at 1 (0 is the padding id), so the number of rows
# actually addressed is one more than the vocabulary size, capped at
# max_features.
nb_words = min(max_features, len(word_index) + 1)
# Always allocate max_features rows: the Embedding layer is declared with
# max_features as its input dimension, so the weight matrix must have exactly
# that many rows even when the vocabulary is smaller. Unused rows stay zero.
embedding_matrix = np.zeros((max_features, embed_size))
for word, i in word_index.items():
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    # Ignore entries whose length differs from embed_size (such as a
    # "count dim" header row, if the file has one); assigning them would
    # silently broadcast a wrong value across the whole matrix row.
    if embedding_vector is not None and embedding_vector.shape == (embed_size,):
        embedding_matrix[i] = embedding_vector



In [None]:
# Defining the deep neural network model(Bidirectional)
def get_model(num_classes=None):
    """Build and compile the bidirectional-GRU multi-label classifier.

    Layers, in order: an embedding initialised from ``embedding_matrix``,
    spatial dropout (0.2), a bidirectional GRU with 80 units returning the
    full sequence, global average and global max pooling concatenated, and a
    sigmoid dense output with one unit per class.

    Args:
        num_classes: width of the output layer. Defaults to
            ``len(class_names)`` so the head always matches the target matrix
            built from ``class_names`` instead of a hard-coded count.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    if num_classes is None:
        num_classes = len(class_names)

    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(80, return_sequences=True))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(num_classes, activation="sigmoid")(conc)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

model = get_model()

# Training hyper-parameters.
batch_size = 32
epochs = 5

# Keep 95% of the rows for fitting; the remainder is used for validation.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train,
    y_train,
    train_size=0.95,
    random_state=233,
)

# Per-sample weights computed with the 'balanced' strategy on the training
# targets.
sample_weights = class_weight.compute_sample_weight('balanced', y_tra)

# Fit the model, reporting validation metrics after every epoch.
hist = model.fit(
    X_tra,
    y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    sample_weight=sample_weights,
    verbose=2,
)

In [None]:
# Score the padded test sequences and store the per-class sigmoid outputs in
# the submission frame before writing it to disk.
y_pred = model.predict(x_test, batch_size=1024)
submission[class_names] = y_pred
submission.to_csv(path_or_buf='submission_gru.csv', index=False)

In [None]:
# Reload the probabilities that were just written so they can be binarised.
submitted = pd.read_csv(filepath_or_buffer='submission_gru.csv')

In [None]:
# Turn the model outputs into 0/1 flags for submission: a value of at least
# 0.98 becomes 1, anything else 0. The first column is left untouched.
for column in submitted.columns[1:]:
    submitted[column] = submitted[column].ge(.98).astype(int)

In [None]:
# Write the thresholded predictions without the index column.
submitted.to_csv(path_or_buf='./submission_gru_98.csv', index=False)

In [None]:
# Persist the trained model in two parts: the architecture as JSON text and
# the weights as an HDF5 file.
model_json = model.to_json()
with open("model.json", "w") as architecture_file:
    architecture_file.write(model_json)

model.save_weights("model.h5")
print("Saved model to disk")
 
