In [4]:
# Silence every Python warning for the rest of the session. Note that this is a
# blanket filter: it hides all warning categories, not just a specific one.
import warnings
warnings.filterwarnings('ignore')

In [5]:
import os
import numpy as np
import pandas as pd
from argparse import ArgumentParser
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential # NN Activation
from keras.layers import Embedding # Embedding layer
from keras.layers import Flatten
from keras.layers import Dense # Fully Connected Networks

In [6]:
# Pre-processing: relative paths of the two training text files that the
# following cell reads and parses (one labelled, one without labels).
file_training_label = 'data/training_label.txt'
file_training_unlabel = 'data/training_nolabel.txt'

In [7]:
# Labelled file: every line is split on whitespace into at most three fields.
# Field 0 is the integer sentiment, field 2 is the sentence text, and field 1
# is discarded.
sentiment, text = [], []
with open(file_training_label, 'r', encoding='utf-8') as f:
    for fields in (raw.split(maxsplit=2) for raw in f):
        sentiment.append(int(fields[0]))
        text.append(fields[2].rstrip('\n'))
data_training_label = pd.DataFrame({'sentiment': sentiment, 'text': text})

# Unlabelled file: one sentence per line, with the trailing newline removed.
with open(file_training_unlabel, 'r', encoding='utf-8') as f:
    lines = [raw.rstrip('\n') for raw in f]
data_training_unlabel = pd.DataFrame({'text': lines})

In [8]:
# Word embedding method 1: BOW(bag of words)
# Use Gensim module
import gensim
import pprint
from gensim import corpora
from gensim.utils import simple_preprocess

# Tokenise every labelled sentence, then grow a gensim dictionary while
# converting each token list into (token_id, count) pairs.
doc_tokenized = list(map(simple_preprocess, data_training_label['text'].tolist()))
dictionary = corpora.Dictionary()
BoW_corpus = [dictionary.doc2bow(tokens, allow_update=True) for tokens in doc_tokenized]
# Human-readable view of the corpus: (word, count) instead of (token_id, count).
id_words = [
    [(dictionary[token_id], freq) for token_id, freq in bow]
    for bow in BoW_corpus
]

In [55]:
# integer encode the labelled documents (one list of integer word ids per text)
# NOTE(review): one_hot is hash-based, so with only `vocab_size` = 50 buckets many
# distinct words presumably share an id — confirm this is intended.
vocab_size = 50
encoded_docs_label = [one_hot(d, vocab_size) for d in data_training_label['text']]
# bring every document to a fixed length of max_length = 16 ids, padding at the end
max_length = 16
padded_docs_label = pad_sequences(encoded_docs_label, maxlen=max_length, padding='post')

In [56]:
# Encode and pad the unlabelled documents with the same vocab_size and
# max_length that are used for the labelled documents.
encoded_docs_unlabel = [one_hot(d, vocab_size) for d in data_training_unlabel['text']]
padded_docs_unlabel = pad_sequences(encoded_docs_unlabel, maxlen=max_length, padding='post')

In [57]:
# Train a fresh classifier on the labelled documents only.
# NOTE: new_model() is defined in a cell further down this notebook; that cell
# has to be executed before this one.
model = new_model()
labels = np.array(data_training_label['sentiment'])
model.fit(padded_docs_label, labels, epochs=50, verbose=0)
# Evaluated on the same data that was used for fitting, so the printed value is
# training accuracy, not held-out accuracy.
loss, accuracy = model.evaluate(padded_docs_label, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 16, 8)             400       
_________________________________________________________________
flatten_11 (Flatten)         (None, 128)               0         
_________________________________________________________________
dense_24 (Dense)             (None, 1)                 129       
Total params: 529
Trainable params: 529
Non-trainable params: 0
_________________________________________________________________
None
Accuracy: 61.030501


In [68]:
# Pseudo-label the unlabelled documents: predict with the current model, then
# binarise the outputs in place at 0.5 (>= 0.5 becomes 1, everything below becomes 0).
new_labels = model.predict(padded_docs_unlabel)
new_labels[new_labels>=0.5] = 1
new_labels[new_labels<0.5] = 0

array([[1.],
       [0.],
       [0.],
       ...,
       [1.],
       [1.],
       [1.]], dtype=float32)

In [76]:
# Collapse the predictions to a 1-D vector of length new_labels.shape[0] so they
# can be concatenated with the 1-D `labels` array.
new_labels = new_labels.reshape(new_labels.shape[0],)

In [82]:
# NOTE: `labels_augmented` is assigned in the cell that follows this one; run
# that cell first, otherwise this lookup fails on a fresh kernel.
labels_augmented.shape

(1378614,)

In [80]:
# Build the augmented training set: labelled documents followed by the
# unlabelled ones, with the true labels followed by the pseudo-labels.
padded_docs_augmented = np.concatenate((padded_docs_label, padded_docs_unlabel), axis=0)
labels_augmented = np.concatenate((labels, new_labels))

In [79]:
# Display the augmented document matrix (numpy abbreviates the output).
padded_docs_augmented

array([[ 7, 17, 15, ...,  0,  0,  0],
       [ 7, 22, 21, ...,  0,  0,  0],
       [ 7, 38,  7, ..., 48,  0,  0],
       ...,
       [47,  7, 26, ...,  0,  0,  0],
       [41, 29, 38, ..., 17,  7, 21],
       [43, 20,  7, ...,  0,  0,  0]])

In [83]:
# Retrain from scratch on the augmented set (true labels + pseudo-labels).
model = new_model()
model.fit(padded_docs_augmented, labels_augmented, epochs=50, verbose=0)
# NOTE(review): this evaluates on the training data itself, and most of its
# targets are pseudo-labels produced by the earlier model, so the printed
# accuracy is not comparable to accuracy against true labels.
loss, accuracy = model.evaluate(padded_docs_augmented, labels_augmented, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 16, 8)             400       
_________________________________________________________________
flatten_12 (Flatten)         (None, 128)               0         
_________________________________________________________________
dense_25 (Dense)             (None, 1)                 129       
Total params: 529
Trainable params: 529
Non-trainable params: 0
_________________________________________________________________
None
Accuracy: 92.730522


In [54]:
# define the model
# DNN for classification (1: positive, 0: negative)
def new_model(embedding_dim=8):
    """Build and compile a fresh embedding -> flatten -> sigmoid classifier.

    The embedding layer is sized from the notebook-level globals ``vocab_size``
    and ``max_length``, so both must be defined before this is called. The
    layer summary is printed as a side effect.

    Args:
        embedding_dim: Width of each word embedding vector. Defaults to 8,
            the value that was previously hard-coded.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, binary cross-entropy
        loss, accuracy metric) with newly initialised weights.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
    # Flatten the (max_length, embedding_dim) embeddings into one feature vector.
    model.add(Flatten())
    # Single sigmoid unit producing the score for class 1.
    model.add(Dense(units=1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # summary() prints the table itself and returns None; wrapping it in
    # print() emitted a stray "None" line after the table.
    model.summary()

    return model

In [42]:
# fit the model
# `padded_docs` is only ever assigned from the unlabelled text in this notebook,
# so it cannot be paired with `labels`; train on the padded labelled documents.
model.fit(padded_docs_label, labels, epochs=50, verbose=0)
# evaluate the model (on the data it was fitted on, i.e. training accuracy)
loss, accuracy = model.evaluate(padded_docs_label, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 60.949498


In [45]:
# Encode and pad the unlabelled documents under the generic names
# `encoded_docs` / `padded_docs`.
# NOTE(review): this repeats the computation stored in `encoded_docs_unlabel` /
# `padded_docs_unlabel`; presumably a leftover from an earlier revision.
encoded_docs = [one_hot(d, vocab_size) for d in data_training_unlabel['text']]
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

In [None]:
# Re-read the unlabelled training file: one sentence per line, with the
# trailing newline removed.
with open(file_training_unlabel, 'r', encoding='utf-8') as f:
    lines = [raw.rstrip('\n') for raw in f]
data_training_unlabel = pd.DataFrame({'text': lines})

In [None]:
# Plot learning curve
# NOTE(review): `X`, `Y` and `plt` are not defined or imported anywhere in the
# visible notebook — presumably the padded documents, their labels and
# matplotlib.pyplot; confirm and define them before running this cell.
# Fit the model, holding out 33% of the samples for validation
history = model.fit(X, Y, validation_split=0.33, epochs=150, batch_size=10, verbose=0)
# list all data in history
print(history.history.keys())
# summarize history for accuracy
# (the curve labelled 'test' in the legend is the validation-split accuracy)
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
# summarize history for loss
# (the curve labelled 'test' in the legend is the validation-split loss)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# Saving model: write the current `model` to 'hw4.h5' in the working directory
model.save('hw4.h5')

In [None]:
# Loading model: restore the network previously saved to 'hw4.h5'.
# The file must already exist in the current directory; nothing is downloaded.
# The bare name `keras` is never imported in this notebook, so the loader is
# imported explicitly here instead of being reached through `keras.models`.
from keras.models import load_model
model = load_model('hw4.h5')

In [38]:
# Path of the test-set input file, and the path intended for the prediction output.
file_testing = 'data/testing_data.txt'
file_prediction = 'prediction.csv'

In [45]:
# Read the test file: skip the first (header) line, then keep everything after
# the first comma of each remaining line as the sentence text.
with open(file_testing, 'r', encoding='utf-8') as f:
    lines = f.readlines()
text = [row.split(',', maxsplit=1)[1].rstrip('\n') for row in lines[1:]]
data_testing = pd.DataFrame({'text': text})

In [None]:
# Predict labels for the test sentences with the Keras `model` of this notebook.
# The previous version of this cell used PyTorch objects (`torch`, `model_best`,
# `test_loader`) that are not defined anywhere in this notebook, and took an
# argmax over the class axis, which is always 0 for a single sigmoid output.
# The test text is encoded like the training text (same vocab_size and
# max_length) and the sigmoid output is thresholded at 0.5, as in the
# pseudo-labelling cell.
# NOTE(review): one_hot is hash-based; confirm the word -> id mapping is stable
# if the model is reloaded in a different Python process.
encoded_docs_test = [one_hot(d, vocab_size) for d in data_testing['text']]
padded_docs_test = pad_sequences(encoded_docs_test, maxlen=max_length, padding='post')
test_prob = model.predict(padded_docs_test)
# Flatten the (n, 1) output to a plain list of 0/1 ints, one per test sentence.
prediction = (test_prob >= 0.5).astype(int).ravel().tolist()

In [46]:
# Write prediction results into a csv file: a header row followed by one
# "<row index>,<label>" line per entry of `prediction`.
# Uses the `file_prediction` path declared next to `file_testing` instead of a
# second hard-coded file name, so the output location is set in one place.
with open(file_prediction, 'w') as f:
    f.write('id,label\n')
    f.writelines('{},{}\n'.format(i, y) for i, y in enumerate(prediction))

Unnamed: 0,text
0,"my dog ate our dinner . no , seriously ... he ..."
1,omg last day sooon n of primary noooooo x im g...
2,stupid boys .. they ' re so .. stupid !
3,hi ! do u know if the nurburgring is open for ...
4,"having lunch in the office , and thinking of h..."
5,shopping was fun
6,wondering where all the nice weather has gone .
7,morning ! yeeessssssss new mimi in aug
8,umm ... maybe that ' s how the british spell it ?
9,yes it ' s 3 : 50 am . yes i ' m still awake ....


In [61]:
# Scratch check of np.concatenate on 1-D arrays: the inputs are joined end to end.
a, b, c = (np.array(values) for values in ([1, 2, 3], [11, 22, 33], [44, 55, 66]))
print(np.concatenate([a, b, c], axis=0))

[ 1  2  3 11 22 33 44 55 66]


In [62]:
# Scratch check: shape of the 1-D array `a` created in the previous cell.
a.shape

(3,)