In [None]:
import sys
from preprocess import MyTokenizer
from preprocess import get_vocab,get_corpus
from preprocess import get_uncommon_words
from preprocess import remove_uncommon_words_from_df
from preprocess import preprocess_df

from utilities import load_df
from utilities import get_vocab_encoder
from utilities import get_embedding_encoder

from shape_data import prep_data
from shape_data import random_embedding_array,embedding_array

from train import Train
from models import cnn_gru,cnn_gru_star,channel_model

from utilities import word_has_vec
import time
import pickle as pkl
import pandas as pd
import os

from keras import backend as K
from keras.wrappers.scikit_learn import KerasClassifier
from keras.preprocessing import text, sequence
from keras.optimizers import SGD
from keras.models import Model, Sequential
from keras.models import load_model
from keras.layers import Input, Embedding
from keras.layers import SpatialDropout1D, Dropout, Activation
from keras.layers import MaxPooling1D,GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers import Dense, Conv1D, GRU, Bidirectional,LSTM
from keras.layers import Flatten, concatenate
#from keras.layers import Sequential
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from keras.layers import average

In [4]:
# ---------------------------------------------------------------------------
# Run configuration: every value a reader might tune lives in this cell.
# Each constant is assigned exactly once.
# ---------------------------------------------------------------------------

# --- Data -------------------------------------------------------------------
DATA_PATH = '../cdata.csv'
# Text columns fed to the model. Alternatives used previously:
#   ['original', 'german', 'spanish']  or  ['comment_text']
X_COLS = ['original', 'german', 'french']
# Label column. Alternative used previously: ['identity_hate']
Y_COL = ['label']
# Label-name -> class-id mapping; use None for the toxic embedding dataset.
Y_MAP = {'none': 0, 'racism': 1, 'sexism': 2}
NUM_CLASSES = 3  # 2 for the binary alternative
MAX_LEN = 33     # passed as max_len / pad_len to prep_data and the model builder

# --- Embeddings -------------------------------------------------------------
# The default is a machine-specific absolute path; set the EMBEDDINGS_FP
# environment variable to point somewhere else without editing the notebook.
EMBEDDINGS_FP = os.environ.get(
    'EMBEDDINGS_FP',
    '/Users/ellie/Downloads/embeddings/glove.840B.300d.txt',
)
DIM = 300  # embedding dimensionality

# --- Model / training -------------------------------------------------------
MODELDIR = '../models'
MODEL_FUNC = cnn_gru_star  # alternatively cnn_gru or channel_model
MODEL_NAME = 'mymodel'
BATCH_SIZE = 1000
EPOCHS = 50
MON_SCORE = 'val_acc'
MON_MODE = 'max'
TRACKER_FN = 'tracker.csv'

# --- Saving / loading learned embeddings -------------------------------------
# WEIGHTS_FILEP is joined onto MODELDIR by the load/save cells. It must be set
# to a file name before switching either flag to True: the empty default would
# make those cells try to open the MODELDIR directory itself.
SAVE_MODEL_WEIGHTS = False
LOAD_MODEL_WEIGHTS = False
WEIGHTS_FILEP = ''

# --- Train/test data structure -----------------------------------------------
TRAIN_CHANNELS = False  # change to True for the channel model
TEST_CHANNELS = True
VAL_SIZE = 0.1  # if zero, no validation set
CROSS_VAL = 5

# --- Prediction method -------------------------------------------------------
TRAIN_TEST_AUG = True  # passed to evaluate() as channel_probs
# Passed to evaluate() as predict_english_only; presumably set this to True to
# predict on the original tweets only -- confirm against Train.evaluate.
NO_TEST_AUG = False


In [None]:
# Build the tokenizer whose preprocess_text method is applied to every text
# column in the data-preparation cell: lower-casing on, stop-word and marker
# removal off.
tokenizer = MyTokenizer(
    lower=True,
    remove_stopwords=False,
    remove_markers=False,
)

In [None]:
# Optionally restore the {word: vector} dict written by the save cell at the
# end of this notebook, so it can be passed to embedding_array().
if LOAD_MODEL_WEIGHTS:
    if not WEIGHTS_FILEP:
        raise ValueError(
            'WEIGHTS_FILEP must name a file inside MODELDIR when '
            'LOAD_MODEL_WEIGHTS is True'
        )
    # NOTE(security): unpickling executes arbitrary code from the file; only
    # load pickles that this notebook (or another trusted source) produced.
    with open(os.path.join(MODELDIR, WEIGHTS_FILEP), 'rb') as fp:
        embeddings = pkl.load(fp)

In [None]:
# Read the dataset, run the tokenizer over each text column, and collect the
# corpus from which the vocabulary is built.
df = load_df(DATA_PATH, x_cols=X_COLS, y_col=Y_COL, y_map=Y_MAP)
ndf = preprocess_df(df, x_cols=X_COLS, tokenizer=tokenizer.preprocess_text)
corp = get_corpus(ndf, x_cols=X_COLS)

# Optional: uncomment the three lines below to remove low-frequency words.
# NOTE(review): prep_data below is still given `ndf`, not `udf` -- confirm that
# is intended before enabling this path.
# uncommon = get_uncommon_words(corp, thresh=4)
# udf = remove_uncommon_words_from_df(ndf, x_cols=X_COLS, uncommon_list=uncommon)
# corp = get_corpus(udf, x_cols=X_COLS)

vocab = get_vocab(corp)
encoder, decoder = get_vocab_encoder(vocab)

# Embedding array sized by the vocabulary encoder, DIM wide.
vec_array = random_embedding_array(len(encoder), dim=DIM)
# When using pretrained embeddings, use this line instead (requires the
# `embeddings` dict from the load cell):
# vec_array = embedding_array(encoder, embeddings)

# Encode the preprocessed frame and split it into CROSS_VAL folds.
folds = prep_data(
    df=ndf,
    x_cols=X_COLS,
    y_col=Y_COL,
    encoder=encoder,
    cv=CROSS_VAL,
    val_size=VAL_SIZE,
    max_len=MAX_LEN,
    train_channels=TRAIN_CHANNELS,
    test_channels=TEST_CHANNELS,
)


In [None]:
# Train and evaluate one freshly built model per cross-validation fold.
#
# The run timestamp is taken once, before the loop, so that every fold's
# output path shares the same '<name>_<day>-<HHMM>' prefix.
model_name = MODEL_NAME
path_datetime = time.strftime('%d-%H%M')

for fold_num, fold in enumerate(folds, start=1):  # folds are numbered from 1
    path = model_name + '_' + path_datetime + 'fold-' + str(fold_num)

    print('----------------')
    print(path)
    print('----------------')

    # Build a new model for this fold so no weights carry over between folds.
    model, summary = MODEL_FUNC(
        pad_len=MAX_LEN,
        vec_array=vec_array,
        num_classes=NUM_CLASSES,
    )
    train_model = Train(model, model_name, summary, x_cols=X_COLS, modeldir=MODELDIR)

    train_model.train(
        fold,
        model,
        path,
        total_epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        monitor_score=MON_SCORE,
        monitor_mode=MON_MODE,
        patience=5,
    )
    train_model.get_best_model()

    # Prediction mode is controlled from the config cell: TRAIN_TEST_AUG is
    # passed as channel_probs (predict on all languages by averaging
    # probabilities) and NO_TEST_AUG as predict_english_only.
    train_model.evaluate(
        fold,
        y_map=Y_MAP,
        tracker_fn=TRACKER_FN,
        channel_probs=TRAIN_TEST_AUG,
        predict_english_only=NO_TEST_AUG,
    )

In [None]:
# Optionally persist the learned embeddings as a {word: vector} pickle.
#
# `model` is whatever the training loop assigned last, i.e. the model built
# for the final fold.
# NOTE(review): it is not visible here whether train_model.get_best_model()
# restores the best checkpoint into `model` -- confirm before relying on it.
if SAVE_MODEL_WEIGHTS:
    if not WEIGHTS_FILEP:
        raise ValueError(
            'WEIGHTS_FILEP must name a file inside MODELDIR when '
            'SAVE_MODEL_WEIGHTS is True'
        )
    # Assumes the first weight array is the embedding matrix and that its rows
    # are indexed by the encoder's ids -- TODO confirm against the model builder.
    weights = model.get_weights()[0]
    embeddings = {word: weights[ix] for word, ix in encoder.items()}
    with open(os.path.join(MODELDIR, WEIGHTS_FILEP), 'wb') as fp:
        pkl.dump(embeddings, fp)
