# Tensor flow
Following: https://www.tensorflow.org/tutorials/keras/basic_text_classification

In [1]:
from __future__ import absolute_import, division, print_function

import tensorflow as tf
from tensorflow import keras

import numpy as np
import pandas as pd

print(tf.__version__)

1.13.1


In [2]:
train_data = pd.read_csv('reviews_train.tsv', sep='\t', engine='python')
val_data = pd.read_csv('reviews_val.tsv', sep='\t', engine='python')
test_data = pd.read_csv('reviews_test.tsv', sep='\t', engine='python')

In [3]:
token = tf.keras.preprocessing.text.Tokenizer()

token.fit_on_texts(train_data['text'])
Xtrain = token.texts_to_sequences(train_data['text'])
Xval = token.texts_to_sequences(val_data['text'])
Xtest = token.texts_to_sequences(test_data['text'])

# Need to padd out sequences to common length
length = max(map(len, np.concatenate((Xtrain, Xval))))
Xtrain = np.array([xi+[0]*(length-len(xi)) for xi in Xtrain])
Xval = np.array([xi+[0]*(length-len(xi)) for xi in Xval])
Xtest = np.array([xi+[0]*(length-len(xi)) for xi in Xtest])

# convert sentiment (y variable) -1,+1 to 0,1
ytrain = (train_data['sentiment'] + 1)/2
yval = (val_data['sentiment'] + 1)/2
ytest = (test_data['sentiment'] + 1)/2

In [4]:
# input shape is the vocabulary count used for the movie reviews (10,000 words)
vocab_size = 15000

model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, 16))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16, activation=tf.nn.relu))
model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))

model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          240000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 240,289
Trainable params: 240,289
Non-trainable params: 0
_________________________________________________________________


In [5]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc'])

In [6]:
history = model.fit(Xtrain,
                    ytrain,
                    epochs=40,
                    batch_size=512,
                    validation_data=(Xval, yval),
                    verbose=1)

Train on 4000 samples, validate on 500 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [7]:
test_pred = np.array(model.predict(Xtest) > 0.5, dtype=int).squeeze()
print('Test Set Accuracy = %6.3f' % (np.sum(np.equal(test_pred, ytest))/len(ytest)))

Test Set Accuracy =  0.654


# Kaggle
Basic models from the Learn modules

In [9]:
from sklearn.tree import DecisionTreeRegressor
dt_model = DecisionTreeRegressor(random_state=1)
dt_model.fit(Xtrain, ytrain)
dt_pred = dt_model.predict(Xtest)
print('Decisiton Tree Accuracy = %6.3f' % (np.sum(np.equal(dt_pred, ytest))/len(ytest)))

Decisiton Tree Accuracy =  0.576


In [10]:
from sklearn.ensemble import RandomForestRegressor

forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(Xtrain, ytrain)
forest_preds = forest_model.predict(Xtest)
print('Forest Tree Accuracy = %6.3f' % (np.sum(np.equal(forest_preds, ytest))/len(ytest)))



Forest Tree Accuracy =  0.004


In [11]:
from xgboost import XGBRegressor

xgb_model = XGBRegressor(n_estimators=500)
xgb_model.fit(Xtrain, ytrain, early_stopping_rounds=5,
             eval_set=[(Xval, yval)])
xgb_pred = xgb_model.predict(Xtest)
print('Forest Tree Accuracy = %6.3f' % (np.sum(np.equal(xgb_pred, ytest))/len(ytest)))

  if getattr(data, 'base', None) is not None and \


[0]	validation_0-rmse:0.498099
Will train until validation_0-rmse hasn't improved in 5 rounds.
[1]	validation_0-rmse:0.497101
[2]	validation_0-rmse:0.496042
[3]	validation_0-rmse:0.495215
[4]	validation_0-rmse:0.49481
[5]	validation_0-rmse:0.494227
[6]	validation_0-rmse:0.494191
[7]	validation_0-rmse:0.494109
[8]	validation_0-rmse:0.493912
[9]	validation_0-rmse:0.493585
[10]	validation_0-rmse:0.493387
[11]	validation_0-rmse:0.493721
[12]	validation_0-rmse:0.493443
[13]	validation_0-rmse:0.494047
[14]	validation_0-rmse:0.494341
[15]	validation_0-rmse:0.494411
Stopping. Best iteration:
[10]	validation_0-rmse:0.493387

Forest Tree Accuracy =  0.000


# scikit-learn
Following: https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_data['text'])
X_train_counts.shape


(4000, 13522)

In [13]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(4000, 13522)

In [15]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(4000, 13522)

In [16]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, ytrain)

In [17]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [18]:
text_clf.fit(train_data['text'], ytrain)  

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [21]:
mnb_pred = text_clf.predict(test_data['text'])
print('Multinomial Bayes Accuracy = %6.3f' % (np.sum(np.equal(mnb_pred, ytest))/len(ytest)))

Multinomial Bayes Accuracy =  0.790


In [23]:
from sklearn.linear_model import SGDClassifier
svm_model = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None)),
])

svm_model.fit(train_data['text'], ytrain)  
svm_pred = svm_model.predict(test_data['text'])
print('SVM Accuracy = %6.3f' % (np.sum(np.equal(svm_pred, ytest))/len(ytest)))   

SVM Accuracy =  0.786




In [29]:
from sklearn.model_selection import GridSearchCV
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}
gs_clf = GridSearchCV(svm_model, parameters, cv=5, iid=False, n_jobs=-1)
gs_clf = gs_clf.fit(train_data['text'], ytrain)



In [30]:
gs_pred = gs_clf.predict(test_data['text'])
print('GS Accuracy = %6.3f' % (np.sum(np.equal(gs_pred, ytest))/len(ytest)))   

GS Accuracy =  0.786


In [31]:
gs_clf.best_params_

{'clf__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}