In [0]:
# from google.colab import drive
# drive.mount('/content/gdrive')

In [13]:
import pandas as pd
import numpy
from numpy import array
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from keras.models import Sequential
from keras.preprocessing.text import text_to_word_sequence
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, UpSampling1D
from keras.layers import Activation
from keras import backend as K

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
def load_dataset(url, unique=True):
  df = pd.read_csv(url, sep='\t')
  label = ['negative', 'somewhat negative', 'neutal', 'somewhat positive', 'positive']
  for index, sentiment in enumerate(label):
    df.loc[df['Sentiment'] == index, 'Sentiment_Label'] = sentiment
  if unique:
    df.drop_duplicates(subset='SentenceId', inplace=True)
  x = df['Phrase']
  y = df['Sentiment']
  return x, y, df

def encode_labels(label):
  label = array(label)
  label = label.reshape(len(label), 1)
  encoder = OneHotEncoder(sparse=False, categories='auto')
  encoded = encoder.fit_transform(label)
  return encoded




def doc_stemmer(docs):
    stemmer = PorterStemmer()
    new_docs = list()
    for sentence in docs:
        encoded = text_to_word_sequence(sentence)
        encoded = [stemmer.stem(word) for word in encoded]
        encoded = ' '.join(encoded)
        new_docs.append(encoded)
    return new_docs

In [0]:
stop_words = set(stopwords.words('english'))

BoW_Vectorizer = CountVectorizer(strip_accents='unicode',
                                ngram_range=(1,1),
                                min_df=3,
                                max_df=0.5)

Tfidf_Vectorizer = TfidfVectorizer(strip_accents='unicode',
                                ngram_range=(1,1),
                                min_df=3,
                                max_df=0.5)


In [0]:
def recall_m(y_true, y_pred):
  true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
  possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
  recall = true_positives / (possible_positives + K.epsilon())
  return recall

def precision_m(y_true, y_pred):
  true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
  predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
  precision = true_positives / (predicted_positives + K.epsilon())
  return precision

def f1_m(y_true, y_pred):
  precision = precision_m(y_true, y_pred)
  recall = recall_m(y_true, y_pred)
  f1 = 2 * ((precision * recall) / (precision + recall + K.epsilon()))
  return f1

In [0]:
def baseline_cnn_model(fea_matrix, n_class, mode, compiler):
  model = Sequential()
  model.add(Conv1D(filters=32, kernel_size=3, activation='relu', padding='valid', strides=1, input_shape=(fea_matrix.shape[1], fea_matrix.shape[2])))
  model.add(MaxPooling1D(pool_size=2))
  model.add(Conv1D(filters=32, kernel_size=3, activation='relu'))
  model.add(MaxPooling1D(pool_size=2))
  model.add(Dropout(0.8))
  model.add(Flatten())
  model.add(UpSampling1D(size=2))
  model.add(Dense(250, activation='relu'))
  model.add(Dropout(0.8))
  model.summary()
  model.add(Dense(n_class))
  if n_class == 1 and mode == 'cla':
    model.add(Activation('sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', f1_m, precision_m, recall_m])
  else:
    model.add(Activation('softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy', f1_m, precision_m, recall_m])
  return model

In [18]:
# load dataset and initialize vectorizer
url =  'https://raw.githubusercontent.com/cacoderquan/Sentiment-Analysis-on-the-Rotten-Tomatoes-movie-review-dataset/master/train.tsv'
x, y, df = load_dataset(url)
seed = 4
stemmed_x = array(doc_stemmer(x))
x_train, x_test, y_train, y_test = train_test_split(stemmed_x, y, test_size=0.3, random_state=seed)
vectorizer = BoW_Vectorizer
vectorizer.fit(list(stemmed_x))
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)
print(df.head(10))

(5970,) (5970,) (2559,) (2559,)
     PhraseId  SentenceId  ... Sentiment    Sentiment_Label
0           1           1  ...         1  somewhat negative
63         64           2  ...         4           positive
81         82           3  ...         1  somewhat negative
116       117           4  ...         3  somewhat positive
156       157           5  ...         1  somewhat negative
166       167           6  ...         4           positive
198       199           7  ...         1  somewhat negative
213       214           8  ...         3  somewhat positive
247       248           9  ...         1  somewhat negative
259       260          10  ...         1  somewhat negative

[10 rows x 5 columns]


In [19]:
print(df.head(10))

     PhraseId  SentenceId  ... Sentiment    Sentiment_Label
0           1           1  ...         1  somewhat negative
63         64           2  ...         4           positive
81         82           3  ...         1  somewhat negative
116       117           4  ...         3  somewhat positive
156       157           5  ...         1  somewhat negative
166       167           6  ...         4           positive
198       199           7  ...         1  somewhat negative
213       214           8  ...         3  somewhat positive
247       248           9  ...         1  somewhat negative
259       260          10  ...         1  somewhat negative

[10 rows x 5 columns]


In [20]:
# derive train and test feature vectors
train_vec = vectorizer.transform(x_train)
train_vec = train_vec.toarray()

test_vec = vectorizer.transform(x_test)
test_vec = test_vec.toarray()

# reshape feature vectors
X_train = train_vec.reshape(train_vec.shape[0], train_vec.shape[1], 1)
X_test = test_vec.reshape(test_vec.shape[0], test_vec.shape[1], 1)

# onehot encode labels for multiclass classification
y_train = encode_labels(y_train)
y_test = encode_labels(y_test)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
feature_vec = pd.DataFrame(test_vec, columns=vectorizer.get_feature_names())
feature_vec.head(10)

(5970, 4699, 1) (5970, 5) (2559, 4699, 1) (2559, 5)


Unnamed: 0,10,100,101,11,12,13,15,19,1950,1952,1960,1970,1984,19th,20,2002,20th,21st,22,30,300,3000,40,4ever,50,51,60,65,70,80,84,86,88,90,95,abandon,abil,abl,abli,about,...,wreak,wreck,wrench,wri,write,writer,written,wrong,wrote,wrought,xtc,xxx,ya,yard,yarn,yawn,ye,year,yearn,yellow,yesterday,yet,yiddish,york,yorker,you,young,younger,your,yourself,youth,yu,zeal,zealand,zero,zhang,zinger,zippi,zombi,zone
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [21]:
#training params
n_class = y_train.shape[1]
mode = 'cla'
compiler = 'adam'
batch_size = 100

 
num_epochs = 20

# train the model
model = baseline_cnn_model(X_train, n_class, mode, compiler)
# model.summary()
model.fit(X_train, y_train, batch_size=batch_size, epochs=num_epochs, verbose=1, validation_split=0.1)
loss, acc, f1, precision, recall = model.evaluate(X_test, y_test, verbose=1)






Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 4697, 32)          128       
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 2348, 32)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 2346, 32)          3104      
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 1173, 32)          0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 1173, 32)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 37536)             0         


In [22]:
print(f'Accuracy: {numpy.round(acc*100, 2)}')
print(f'F1 Score: {numpy.round(f1, 3)}')
print(f'Precision: {numpy.round(precision, 3)}')
print(f'Recall: {numpy.round(recall, 3)}')

Accuracy: 40.33
F1 Score: 0.249
Precision: 0.459
Recall: 0.174


In [23]:
loss, acc, f1, precision, recall = model.evaluate(X_test, y_test, verbose=1)



In [0]:
model.save('1113852_1dconv_reg')