In [17]:
#import libraries
import numpy as np
import matplotlib.pyplot as plt
import os
import tensorflow as tf
import random
import pandas as pd
import os
# Show which files are present in the input directory.
print(os.listdir("../input"))

['train.tsv', 'test.tsv', 'sampleSubmission.csv']


In [18]:
# Build the paths of the train / test TSV files inside the input directory.
data_path, test_data_path = (
    os.path.join("../input", file_name) for file_name in ('train.tsv', 'test.tsv')
)
# Both files are tab-separated; read each one into its own DataFrame.
data, test_data = (
    pd.read_csv(tsv_path, sep='\t') for tsv_path in (data_path, test_data_path)
)
# Summary statistics of the numeric columns of the training frame.
data.describe()

Unnamed: 0,PhraseId,SentenceId,Sentiment
count,156060.0,156060.0,156060.0
mean,78030.5,4079.732744,2.063578
std,45050.785842,2502.764394,0.893832
min,1.0,1.0,0.0
25%,39015.75,1861.75,2.0
50%,78030.5,4017.0,2.0
75%,117045.25,6244.0,3.0
max,156060.0,8544.0,4.0


In [19]:
# Preview the first rows of the training frame.
data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [20]:
from sklearn.model_selection import train_test_split

# Raw phrases and their integer sentiment labels from the training frame.
train_texts = data['Phrase']
train_labels = np.array(data['Sentiment'])

# Phrases of the unlabelled test file.
test_texts = test_data['Phrase']

# Hold out 33% of the labelled phrases for validation (fixed seed so the
# split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(
    train_texts, train_labels, test_size=0.33, random_state=42)

# Keep a handle on the integer class ids of the training split before they
# are one-hot encoded below.
y_temp = y_train

# One-hot encode both splits against the SAME class list. Encoding each split
# independently would silently yield matrices of different widths if a class
# happened to be missing from one of the splits.
label_classes = np.unique(train_labels)
y_train = pd.get_dummies(pd.Categorical(y_train, categories=label_classes))
y_test = pd.get_dummies(pd.Categorical(y_test, categories=label_classes))

# NOTE: this rebinds `data` from the training DataFrame to a tuple of
# ((train texts, train one-hot labels), (validation texts, validation one-hot
# labels)). Re-running this cell therefore requires re-running the cell that
# loads `data` first.
data = ((X_train, np.array(y_train)), (X_test, np.array(y_test)))

In [34]:
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# n-gram sizes handed to the tf-idf vectorizer as its `ngram_range`.
NGRAM_RANGE = (1,3)
# Upper bound on the number of features kept by SelectKBest.
TOP_K = 9000
# Handed to the tf-idf vectorizer as its `analyzer`.
TOKEN_MODE = 'word'
# Handed to the tf-idf vectorizer as its `min_df`.
MIN_DOCUMENT_FREQUENCY = 5

In [30]:
def ngram_vectorize(train_texts, train_labels, val_texts, test_texts):
    """Vectorizes texts as tf-idf weighted n-gram vectors.

    The vocabulary and the feature selector are fitted on the training texts
    only and are then applied unchanged to the validation and test texts.

    # Arguments
        train_texts: list, training text strings.
        train_labels: np.ndarray, training labels. Either 1-D class ids or a
            2-D one-hot matrix with one row per training text; a one-hot
            matrix is collapsed to class ids before feature scoring.
        val_texts: list, validation text strings.
        test_texts: list, test text strings.

    # Returns
        x_train, x_val, x_test: vectorized training, validation and test
            texts, restricted to the selected features and cast to float32.
    """
    # Extend scikit-learn's English stop-word set with "'s". The vectorizer
    # expects a list (recent scikit-learn versions reject a frozenset).
    stop_words_lst = sorted(text.ENGLISH_STOP_WORDS.union(["\'s"]))

    # Create keyword arguments to pass to the 'tf-idf' vectorizer.
    kwargs = {
            'ngram_range': NGRAM_RANGE,  # n-gram sizes to extract.
            # tf-idf weights are fractional, so an integer dtype is not valid
            # here; the vectorizer produces float64 values regardless.
            'dtype': 'float64',
            'strip_accents': 'unicode',
            'decode_error': 'replace',
            'analyzer': TOKEN_MODE,  # Split text into word tokens.
            'min_df': MIN_DOCUMENT_FREQUENCY,
            'stop_words': stop_words_lst
    }
    vectorizer = TfidfVectorizer(**kwargs)

    # Learn vocabulary from training texts and vectorize training texts.
    x_train = vectorizer.fit_transform(train_texts)

    # Vectorize validation and test texts with the training vocabulary.
    x_val = vectorizer.transform(val_texts)
    x_test = vectorizer.transform(test_texts)

    # f_classif needs one class id per sample. Use the labels that were
    # passed in (rather than a notebook-level global) and collapse one-hot
    # rows to the index of their active column.
    class_ids = np.asarray(train_labels)
    if class_ids.ndim > 1:
        class_ids = class_ids.argmax(axis=1)

    # Select top 'k' of the vectorized features.
    selector = SelectKBest(f_classif, k=min(TOP_K, x_train.shape[1]))
    selector.fit(x_train, class_ids)

    x_train = selector.transform(x_train).astype('float32')
    x_val = selector.transform(x_val).astype('float32')
    x_test = selector.transform(x_test).astype('float32')
    return x_train, x_val, x_test

In [23]:
from tensorflow.python.keras import models
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.layers import Dropout

In [27]:
def mlp_model(layers, units, dropout_rate, input_shape, num_classes):
    """Creates an instance of a multi-layer perceptron model.

    The stack built is:
        Dropout (on the input)
        -> (layers - 1) x [Dense(units, relu) -> Dropout]
        -> Dense(64, relu) -> Dropout
        -> Dense(num_classes, softmax)

    # Arguments
        layers: int, depth control; `layers - 1` hidden `Dense` layers of
            width `units` are added before the fixed 64-unit layer.
        units: int, output dimension of the repeated hidden layers.
        dropout_rate: float, fraction of inputs dropped at every Dropout
            layer.
        input_shape: tuple, shape of input to the model.
        num_classes: int, number of output classes.

    # Returns
        An (uncompiled) MLP model instance.
    """
    print('input shape : ', input_shape)
    model = models.Sequential()
    model.add(Dropout(dropout_rate, input_shape=input_shape))

    # Repeated hidden blocks of configurable width.
    for _ in range(layers - 1):
        model.add(Dense(units=units, activation='relu'))
        model.add(Dropout(rate=dropout_rate))

    # Fixed-width 64-unit block in front of the classifier layer.
    model.add(Dense(units=64, activation='relu'))
    model.add(Dropout(rate=dropout_rate))

    # Softmax classifier over the output classes.
    model.add(Dense(units=num_classes, activation='softmax', name='d2'))
    return model

In [25]:
def train_ngram_model(X_train, 
                      X_label,
                      X_validate,
                      val_label,
                      learning_rate=1e-3,
                      epochs=50,
                      batch_size=150,
                      layers=3,
                      units=128,
                      dropout_rate=0.2):
    """Trains the n-gram MLP model on already vectorized data.

    # Arguments
        X_train: vectorized training features, one row per sample.
        X_label: one-hot training labels, one row per training sample.
        X_validate: vectorized validation features.
        val_label: one-hot validation labels.
        learning_rate: float, learning rate for training model.
        epochs: int, maximum number of epochs.
        batch_size: int, number of samples per batch.
        layers: int, depth argument forwarded to `mlp_model`.
        units: int, output dimension of Dense layers in the model.
        dropout_rate: float: percentage of input to drop at Dropout layers.

    # Returns
        The trained model, with the weights of the last epoch that ran.
    """
    # The width of the one-hot label matrix is the number of classes; fall
    # back to the previous fixed value of 5 when the labels are not 2-D.
    label_shape = np.shape(X_label)
    num_classes = label_shape[1] if len(label_shape) > 1 else 5

    # Create model instance.
    model = mlp_model(layers=layers,
                      units=units,
                      dropout_rate=dropout_rate,
                      input_shape=X_train.shape[1:],
                      num_classes=num_classes)

    # NOTE(review): newer TensorFlow releases name this argument
    # `learning_rate`; `lr` is kept to match the version this notebook was
    # run with -- confirm before upgrading.
    optimizer = tf.keras.optimizers.Adam(lr=learning_rate)
    model.compile(optimizer=optimizer,
                  loss='categorical_crossentropy',
                  metrics=['acc'])

    # Stop once the validation loss has not improved for 6 epochs in a row.
    callbacks = [tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=6)]

    # Train and validate model.
    history = model.fit(
            X_train,
            X_label,
            epochs=epochs,
            callbacks=callbacks,
            validation_data=(X_validate, val_label),
            verbose=2,  # Logs once per epoch.
            batch_size=batch_size)

    # Print the validation metrics of the final epoch.
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
            acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    return model

In [35]:
# Unpack the (texts, one-hot labels) pairs of the training and validation splits.
train_texts, train_labels = data[0]
val_texts, val_labels = data[1]

# Turn all three text collections into selected tf-idf n-gram features.
x_train, x_val, x_test = ngram_vectorize(
    train_texts, train_labels, val_texts, test_texts)

# Fit the MLP on the training features, validating on the held-out split.
model = train_ngram_model(X_train=x_train,
                          X_label=train_labels,
                          X_validate=x_val,
                          val_label=val_labels)



input shape :  (9000,)
Train on 104560 samples, validate on 51500 samples
Epoch 1/50
 - 7s - loss: 1.1304 - acc: 0.5661 - val_loss: 0.9996 - val_acc: 0.6028
Epoch 2/50
 - 6s - loss: 0.9893 - acc: 0.6208 - val_loss: 0.9797 - val_acc: 0.6127
Epoch 3/50
 - 6s - loss: 0.9623 - acc: 0.6335 - val_loss: 0.9780 - val_acc: 0.6162
Epoch 4/50
 - 6s - loss: 0.9496 - acc: 0.6397 - val_loss: 0.9734 - val_acc: 0.6203
Epoch 5/50
 - 6s - loss: 0.9413 - acc: 0.6431 - val_loss: 0.9707 - val_acc: 0.6221
Epoch 6/50
 - 6s - loss: 0.9317 - acc: 0.6466 - val_loss: 0.9692 - val_acc: 0.6225
Epoch 7/50
 - 6s - loss: 0.9248 - acc: 0.6504 - val_loss: 0.9696 - val_acc: 0.6255
Epoch 8/50
 - 6s - loss: 0.9199 - acc: 0.6517 - val_loss: 0.9725 - val_acc: 0.6257
Epoch 9/50
 - 6s - loss: 0.9165 - acc: 0.6529 - val_loss: 0.9741 - val_acc: 0.6269
Epoch 10/50
 - 6s - loss: 0.9111 - acc: 0.6557 - val_loss: 0.9807 - val_acc: 0.6273
Epoch 11/50
 - 6s - loss: 0.9067 - acc: 0.6567 - val_loss: 0.9762 - val_acc: 0.6272
Epoch 12/50

In [None]:
# Predict per-class scores for the test features and take the
# highest-scoring class of every row.
y_test = model.predict(x_test)
y_class = y_test.argmax(axis=1)

# Pair each test PhraseId with its predicted class and write the result as CSV.
my_submission = pd.DataFrame({
    'PhraseId': test_data['PhraseId'],
    'Sentiment': y_class,
})
my_submission.to_csv('submission.csv', index=False)