In [9]:
### The purpose of this section is to evaluate model performance on trained models 
## first we will generate Y predictions using all trained models 
# including all NN iterations. 
import numpy as np
import pandas as pd
from scipy.sparse import save_npz, load_npz
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from sklearn.metrics import roc_auc_score

In [6]:
# Load Data

train_features = load_npz('inputs/train_cv_sparse.npz')
test_features = load_npz('inputs/val_sparse.npz')

train_y = pd.read_csv('inputs/train_cv.csv')
test_y = pd.read_csv('inputs/val_data.csv')

In [13]:
# Prediction using Log_Reg Models
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


for class_name in class_names:
    filename = 'Model_W/'+class_name+'_LR.sav'
    loaded_model = joblib.load(filename)
    mp_colname = class_name+'LR_P'
    train_y[mp_colname] = loaded_model.predict(train_features)
    test_y[mp_colname] = loaded_model.predict(test_features)

dtype('O')

In [25]:
Metrics_Names = ['roc_auc','Accuracy', 'Precision', 'recall', 'F1', 'log_loss']
rows = ['train_LR', 'test_LR','GRU_Train','GRU_Test']
Metrics = pd.DataFrame(data=0.00,columns=Metrics_Names,index=rows)
Metrics.head()


print(roc_auc_score(train_y[3:9], train_y[9:16],average='macro'))

TypeError: '>' not supported between instances of 'int' and 'str'

In [3]:
# Load data for NN predictions

X_train = train_y["comment_text"].fillna("fillna").values
y_train = train_y[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
X_test = test_y["comment_text"].fillna("fillna").values

max_features = 30000
maxlen = 150
embed_size = 300

tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

embedding_path = 'inputs/glove.840B.300d.txt'
embeddings = pd.read_table(embedding_path, sep=" ", index_col=0, header=None,
                           quoting=csv.QUOTE_NONE)
emb_mean, emb_std = np.mean(embeddings.values), np.std(embeddings.values)

print(emb_mean)

KeyboardInterrupt: 

In [None]:
from datetime import datetime
now = datetime.now()

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_size))

p = 0
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = None
    if (word in embeddings.index) == True: embedding_vector = embeddings.loc[word]
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector.as_matrix()
    p = p + 1
    if p % 3000 == 0:
        print(word)

In [None]:
# Define Models and load weights

def get_model():
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(60, return_sequences=True))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(6, activation="sigmoid")(conc)
    
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.summary()

    return model

model = get_model()

model.load_weights(filepath=file_path)

In [None]:
# Predictions using NN to get log loss