In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import pandas as pd
import os
import random
import string
import matplotlib.pyplot as plt
import argparse
import time
import tensorflow as tf

from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.preprocessing import text
from tensorflow.python.keras import models
from tensorflow.python.keras import initializers
from tensorflow.python.keras import regularizers

from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.layers import Dropout
from tensorflow.python.keras.layers import Embedding
from tensorflow.python.keras.layers import SeparableConv1D
from tensorflow.python.keras.layers import MaxPooling1D
from tensorflow.python.keras.layers import GlobalAveragePooling1D

# Root of the competition dataset; every other path below is derived from it.
input_path = '../input/feedback-prize-effectiveness/'

# train/ and test/ sub-directories under the dataset root.
train_path = os.path.join(input_path, 'train/')
test_path = os.path.join(input_path, 'test/')

# Load the labelled training table and the test table.
train_csv = os.path.join(input_path, 'train.csv')
test_csv = os.path.join(input_path, 'test.csv')
train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)


In [None]:
# Number of rows in the training table.
train_len = len(train_df)

# Count every effectiveness rating once, then look up the three classes.
rating_counts = train_df["discourse_effectiveness"].value_counts()
adq = rating_counts['Adequate']
eff = rating_counts['Effective']
ineff = rating_counts['Ineffective']

# Fraction of the training rows that carry each rating.
pct_adq, pct_eff, pct_ineff = (count / train_len for count in (adq, eff, ineff))

print(pct_adq, pct_eff, pct_ineff)


In [None]:
#Inspect the training data: the bare DataFrame is the cell's last expression,
#so the notebook renders it as the cell output.
train_df

In [None]:
#Inspect the distinct values of the discourse_type column.
#Note: this basic model ignores the discourse type; only discourse_text is
#fed to the model in the cells below.
train_df.discourse_type.unique()

In [None]:
#Adapted from
#https://developers.google.com/machine-learning/guides/text-classification/step-2

# Maps each discourse_effectiveness rating string to its integer class id.
# The ids serve as training targets and, later, as column indices into the
# model's prediction array when the submission is assembled.
lbl_dict={ 'Adequate':0,
            'Effective':1,
            'Ineffective':2 }

# Small stop-word set. It is only mentioned in a comment inside fix_text
# (the filter itself is not applied), so it currently has no effect.
stoplist = set('for a of the and to in'.split(' ')) #optional: Note, this particular list didn't improve the model

def fix_text(txt):
    """Normalises one piece of text for the tokenizer.

    The text is lowercased and split on whitespace; the words are then glued
    back together with a single space after every word, so a non-empty result
    always ends with a trailing space.

    # Arguments
        txt: str, raw text.

    # Returns
        str, the normalised text ('' when txt contains no words).
    """
    # Stripping punctuation and filtering stop words were both tried and are
    # left disabled here because neither improved the model.
    return ''.join(token + ' ' for token in txt.lower().split())

def load_train_val(df,seed=123):
    """Builds shuffled training and validation sets from the labelled data.

    Every row's discourse_text is normalised with fix_text and its
    discourse_effectiveness rating is encoded as an integer through lbl_dict.
    The (text, label) pairs are shuffled with a seeded RNG and split 80/20.
    The discourse type is not used.

    # Arguments
        df: DataFrame with discourse_text and discourse_effectiveness columns.
        seed: int, seed for the shuffle so that the split is reproducible.

    # Returns
        ((train_texts, train_labels), (val_texts, val_labels)): the texts are
        lists of str, the labels are numpy arrays of class ids.

    # Raises
        KeyError: If a rating is not a key of lbl_dict.
    """
    # Read each column once. Indexing df.iloc[i] builds a whole row Series
    # per sample, which is needlessly slow for tens of thousands of rows.
    texts = [fix_text(raw) for raw in df["discourse_text"]]
    labels = [lbl_dict[rating] for rating in df["discourse_effectiveness"]]

    # Shuffle text and label together as pairs so they cannot drift apart.
    # random.shuffle's permutation depends only on the seed and the list
    # length, so this yields the same order as shuffling the two lists
    # separately after reseeding.
    pairs = list(zip(texts, labels))
    random.seed(seed)
    random.shuffle(pairs)

    # First 80% of the shuffled pairs train the model, the rest validate it.
    split_at = int(len(pairs) * .8)
    train_pairs, val_pairs = pairs[:split_at], pairs[split_at:]

    train_txt = [txt for txt, _ in train_pairs]
    train_lbl = [lbl for _, lbl in train_pairs]
    val_txt = [txt for txt, _ in val_pairs]
    val_lbl = [lbl for _, lbl in val_pairs]
    return ((train_txt, np.array(train_lbl)), (val_txt, np.array(val_lbl)))


In [None]:
#Build the train/validation split (default seed) so it can be inspected below
(tr_txt,tr_lbl),(val_txt,val_lbl)=load_train_val(train_df)

In [None]:
#Inspect the first sample (text and label) and the size of each split,
#training first, then validation
for split_txt, split_lbl in ((tr_txt, tr_lbl), (val_txt, val_lbl)):
    print(split_txt[0], split_lbl[0])
    print(len(split_txt))

In [None]:
#Adapted From 
#https://developers.google.com/machine-learning/guides/text-classification/step-2

def get_num_words_per_sample(sample_texts):
    """Computes the median word count over a corpus.

    A word is any whitespace-separated token, as produced by str.split().

    # Arguments
        sample_texts: list, sample texts.

    # Returns
        float, median number of words per sample (the value np.median returns).
    """
    words_per_sample = list(map(lambda sample: len(sample.split()), sample_texts))
    return np.median(words_per_sample)

def plot_sample_length_distribution(sample_texts):
    """Draws a 50-bin histogram of the word count of every sample and shows it.

    # Arguments
        sample_texts: list, sample texts.
    """
    # Word count of each sample (whitespace-separated tokens).
    word_counts = []
    for sample in sample_texts:
        word_counts.append(len(sample.split()))

    plt.hist(word_counts, 50)
    plt.title('Sample length distribution')
    plt.xlabel('Length of a sample in words')
    plt.ylabel('Number of samples')
    plt.show()

In [None]:
#Inspect the median number of words per training sample and the
#distribution of sample lengths
print(get_num_words_per_sample(tr_txt))
plot_sample_length_distribution(tr_txt)

With ~37,000 training samples and a median of 30 words per sample, the ratio (number of samples) / (number of words per sample) is greater than 1,500 but less than 15,000.  

According to Google's analysis posted here, 
https://developers.google.com/machine-learning/guides/text-classification/step-2-5
we will likely get the best results by tokenizing the text as sequences and classifying them with a sepCNN model that uses a fine-tuned pre-trained embedding. 

For this notebook, I have not included a pre-trained embedding.


In [None]:
#Adapted from:
#https://developers.google.com/machine-learning/guides/text-classification/step-3

# Vectorization parameters
# Limit on the number of features: passed to the tokenizer as num_words and
# used as the upper bound on the embedding input dimension.
TOP_K = 10000 #can try different values here

# Limit on the length of text sequences. Sequences longer than this
# will be truncated. Passed to pad_sequences as maxlen for the training,
# validation and test texts alike.
MAX_SEQUENCE_LENGTH = 100 #can try different values here

def sequence_vectorize(train_texts, val_texts):
    """Turns texts into fixed-length integer sequences.

    The vocabulary is built from the training texts only (tokenizer created
    with num_words=TOP_K); both text sets are then encoded with it and forced
    to MAX_SEQUENCE_LENGTH.

    # Arguments
        train_texts: list, training text strings.
        val_texts: list, validation text strings.

    # Returns
        x_train, x_val, tokenizer: vectorized training and validation
            texts and the fitted tokenizer.
    """
    # Create vocabulary with training texts.
    tokenizer = text.Tokenizer(num_words=TOP_K)
    tokenizer.fit_on_texts(train_texts)

    def _encode(texts):
        # Map words to ids, then fix the sequence length. Sequences shorter
        # than the length are padded in the beginning and sequences longer
        # are truncated at the beginning.
        token_ids = tokenizer.texts_to_sequences(texts)
        return sequence.pad_sequences(token_ids, maxlen=MAX_SEQUENCE_LENGTH)

    x_train = _encode(train_texts)
    x_val = _encode(val_texts)
    return x_train, x_val, tokenizer

In [None]:
#From:
#https://developers.google.com/machine-learning/guides/text-classification/step-4

def _get_last_layer_units_and_activation(num_classes):
    """Gets the # units and activation function for the last network layer.

    # Arguments
        num_classes: int, number of classes.

    # Returns
        units, activation values.
    """
    if num_classes == 2:
        activation = 'sigmoid'
        units = 1
    else:
        activation = 'softmax'
        units = num_classes
    return units, activation


def sepcnn_model(blocks,
                 filters,
                 kernel_size,
                 embedding_dim,
                 dropout_rate,
                 pool_size,
                 input_shape,
                 num_classes,
                 num_features,
                 use_pretrained_embedding=False,
                 is_embedding_trainable=False,
                 embedding_matrix=None):
    """Builds a separable-convolution (sepCNN) text classifier.

    Layer stack: embedding, then (blocks - 1) repetitions of
    [dropout, sepconv, sepconv, max-pool], then two sepconvs with twice the
    filters, global average pooling, dropout and the dense output layer.

    # Arguments
        blocks: int, number of pairs of sepCNN and pooling blocks in the model.
        filters: int, output dimension of the layers.
        kernel_size: int, length of the convolution window.
        embedding_dim: int, dimension of the embedding vectors.
        dropout_rate: float, percentage of input to drop at Dropout layers.
        pool_size: int, factor by which to downscale input at MaxPooling layer.
        input_shape: tuple, shape of input to the model.
        num_classes: int, number of output classes.
        num_features: int, number of words (embedding input dimension).
        use_pretrained_embedding: bool, true if pre-trained embedding is on.
        is_embedding_trainable: bool, true if embedding layer is trainable.
            Only applied when use_pretrained_embedding is true.
        embedding_matrix: pre-trained embedding coefficients, handed to the
            embedding layer as its weights. Only used when
            use_pretrained_embedding is true.

    # Returns
        A sepCNN model instance.
    """
    def _sep_conv(n_filters):
        # All separable convolutions share every setting except the filter count.
        return SeparableConv1D(filters=n_filters,
                               kernel_size=kernel_size,
                               activation='relu',
                               bias_initializer='random_uniform',
                               depthwise_initializer='random_uniform',
                               padding='same')

    op_units, op_activation = _get_last_layer_units_and_activation(num_classes)
    model = models.Sequential()

    # Embedding layer. With a pre-trained embedding, its weights are supplied
    # and trainability follows the is_embedding_trainable flag.
    embedding_kwargs = dict(input_dim=num_features,
                            output_dim=embedding_dim,
                            input_length=input_shape[0])
    if use_pretrained_embedding:
        embedding_kwargs.update(weights=[embedding_matrix],
                                trainable=is_embedding_trainable)
    model.add(Embedding(**embedding_kwargs))

    # Intermediate blocks: dropout, two separable convolutions, pooling.
    remaining_blocks = blocks - 1
    while remaining_blocks > 0:
        model.add(Dropout(rate=dropout_rate))
        model.add(_sep_conv(filters))
        model.add(_sep_conv(filters))
        model.add(MaxPooling1D(pool_size=pool_size))
        remaining_blocks -= 1

    # Final block: twice the filters, averaged over the sequence dimension.
    wide_filters = filters * 2
    model.add(_sep_conv(wide_filters))
    model.add(_sep_conv(wide_filters))
    model.add(GlobalAveragePooling1D())
    model.add(Dropout(rate=dropout_rate))
    model.add(Dense(op_units, activation=op_activation))
    return model

In [None]:
#Adapted From:
#https://github.com/google/eng-edu/blob/main/ml/guides/text_classification/train_sequence_model.py

# The string below is the module docstring carried over from the original
# script. In this cell it is a bare string expression followed by other
# statements, so it has no effect.
# NOTE(review): the condition quoted in it (>~15K) differs from the
# 1,500-15,000 range discussed in the markdown cell above - confirm which
# one is intended.
"""Module to train sequence model.

Vectorizes training and validation texts into sequences and uses that for
training a sequence model - a sepCNN model. We use sequence model for text
classification when the ratio of number of samples to number of words per
sample for the given dataset is very large (>~15K).
"""

# Not referenced by any other visible cell; presumably a leftover from the
# original script's command-line flag handling - TODO confirm and remove.
FLAGS = None

def train_sequence_model(data,
                         learning_rate=1e-3,
                         epochs=1000,       #we'll set to auto terminate when validation loss stabilizes
                         batch_size=128,
                         blocks=2,
                         filters=64,
                         dropout_rate=0.2,
                         embedding_dim=200,  #can try different parameters here
                         kernel_size=6,      #can try different CNN kernel sizes here
                         pool_size=3):
    """Trains sequence model on the given dataset.

    Vectorizes the texts, builds a sepCNN model, trains it with early stopping
    on the validation loss and saves it to 'effective_arg_sepcnn_model.h5'.

    # Arguments
        data: tuples of training and test texts and labels.
        learning_rate: float, learning rate for training model.
        epochs: int, number of epochs (upper bound; early stopping usually
            ends training sooner).
        batch_size: int, number of samples per batch.
        blocks: int, number of pairs of sepCNN and pooling blocks in the model.
        filters: int, output dimension of sepCNN layers in the model.
        dropout_rate: float: percentage of input to drop at Dropout layers.
        embedding_dim: int, dimension of the embedding vectors.
        kernel_size: int, length of the convolution window.
        pool_size: int, factor by which to downscale input at MaxPooling layer.

    # Returns
        model, tokenizer: the trained model and the tokenizer fitted on the
            training texts (needed to vectorize new texts the same way).

    # Raises
        ValueError: If validation data has label values which were not seen
            in the training data.
    """
    # Get the data.
    (train_texts, train_labels), (val_texts, val_labels) = data

    # Verify that validation labels are in the same range as training labels.
    num_classes = 3 #Hardcoded for this exercise
    unexpected_labels = [v for v in val_labels if v not in range(num_classes)]
    if unexpected_labels:
        raise ValueError('Unexpected label values found in the validation set:'
                         ' {unexpected_labels}. Please make sure that the '
                         'labels in the validation set are in the same range '
                         'as training labels.'.format(
                             unexpected_labels=unexpected_labels))

    # Vectorize texts.
    x_train, x_val, tokenizer = sequence_vectorize(
            train_texts, val_texts)

    # Number of features will be the embedding input dimension. Add 1 for the
    # reserved index 0.
    num_features = min(len(tokenizer.word_index) + 1, TOP_K)

    # Create model instance.
    model = sepcnn_model(blocks=blocks,
                        filters=filters,
                        kernel_size=kernel_size,
                        embedding_dim=embedding_dim,
                        dropout_rate=dropout_rate,
                        pool_size=pool_size,
                        input_shape=x_train.shape[1:],
                        num_classes=num_classes,
                        num_features=num_features)

    # Compile model with learning parameters. The optimizer is requested by
    # name because passing a tf.keras.optimizers.Adam instance raised errors
    # with this model.
    model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
    # Compiling by name builds the optimizer with its default learning rate,
    # so the learning_rate argument would otherwise be ignored. Set it on the
    # optimizer instance that compile() created.
    model.optimizer.learning_rate = learning_rate

    # Create callback for early stopping on validation loss. If the loss does
    # not decrease in two consecutive tries, stop training.
    callbacks = [tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=2)]   #This auto terminates when validation loss stabilizes

    # Train and validate model.
    model.fit(
            x_train,
            train_labels,
            epochs=epochs,
            callbacks=callbacks,
            validation_data=(x_val, val_labels),
            verbose=2,  # Logs once per epoch.
            batch_size=batch_size)

    # Save model.
    model.save('effective_arg_sepcnn_model.h5')
    return model, tokenizer

In [None]:
#Load the data.
#Same call, with the same default seed, as the earlier inspection cell,
#so it reproduces the same train/validation split.
data = load_train_val(train_df)

In [None]:
#Train the model with the default hyper-parameters.
#Keep the fitted tokenizer: the test texts must be vectorized with it.
model, tokenizer = train_sequence_model(data)

In [None]:
#prepare model inputs in same format as training:
#same text normalisation, same fitted tokenizer, same sequence length.
#Iterate over the column directly; test_df.iloc[i] would build a whole row
#Series for every sample.
test_txt=[fix_text(raw_txt) for raw_txt in test_df["discourse_text"]]
test=tokenizer.texts_to_sequences(test_txt)
test=sequence.pad_sequences(test, maxlen=MAX_SEQUENCE_LENGTH)
#make the predictions
pred=model.predict(test)

In [None]:
#inspect the first few normalised test texts (not the whole list, which
#would flood the cell output for a large test set)
test_txt[:5]

In [None]:
#Prepare the data frame for our submission.
#Column k of pred holds the model output for the class whose integer id is k,
#so lbl_dict gives the column of pred that belongs to each rating.
#The frame is built in one step from whole columns: passing a set as
#`columns` leaves the column order undefined (and is rejected by newer
#pandas), and filling cells with sub.Adequate[i]=... is chained assignment.
sub=pd.DataFrame({'discourse_id':test_df['discourse_id'].to_numpy(),
                  'Ineffective':pred[:,lbl_dict['Ineffective']],
                  'Adequate':pred[:,lbl_dict['Adequate']],
                  'Effective':pred[:,lbl_dict['Effective']]},
                 index=test_df.index)

In [None]:
#Write the submission file.
#Select the columns in a fixed order first, then write without the index
#column; the frame is displayed as the cell output.
sub=sub[["discourse_id","Ineffective","Adequate","Effective"]]
sub.to_csv('submission.csv',index=False)
sub


In [None]:
#inspect the raw model outputs for the test set (one row per test sample,
#columns ordered by the class ids in lbl_dict)
pred