### Passage is a little library for text analysis using RNN

You can download from the following Github locataion https://github.com/IndicoDataSolutions/Passage

In [1]:
import numpy as np
import pandas as pd

from passage.models import RNN
from passage.updates import Adadelta
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.preprocessing import Tokenizer

### Load data

In [2]:
df = pd.read_csv('Data/Review_Data.csv')

df['Sentiment'] = df['Sentiment'].replace(['positive', 'negative'], [1, 0])

X = df['Review'].values
y = df['Sentiment'].values

df['Sentiment'].value_counts()

1    1122
0     186
Name: Sentiment, dtype: int64

In [3]:
tokenizer = Tokenizer(min_df=10, max_features=100000)
X = tokenizer.fit_transform(X)

print("Training data tokenized.")
layers = [
    Embedding(size=256, n_features=tokenizer.n_features),
    GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid', init='orthogonal', seq_output=False, p_drop=0.75),
    Dense(size=1, activation='sigmoid', init='orthogonal')
]

model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))
model.fit(X, y, n_epochs=5)

Training data tokenized.
Epoch 0 Seen 1294 samples Avg cost 0.4941 Time elapsed 49 seconds
Epoch 1 Seen 2588 samples Avg cost 0.4322 Time elapsed 99 seconds
Epoch 2 Seen 3882 samples Avg cost 0.4072 Time elapsed 148 seconds
Epoch 3 Seen 5176 samples Avg cost 0.4087 Time elapsed 198 seconds
Epoch 4 Seen 6470 samples Avg cost 0.3916 Time elapsed 247 seconds


[array(0.6927503862794484),
 array(0.6633593584972964),
 array(0.6399034523980139),
 array(0.6162563613468576),
 array(0.6198616191761955),
 array(0.5890216396588616),
 array(0.5417145781177113),
 array(0.5588367544568321),
 array(0.4547270941304046),
 array(0.45360932088691647),
 array(0.4555397983807551),
 array(0.40373163745212526),
 array(0.34829811244592157),
 array(0.21819362517178276),
 array(0.4767267934295591),
 array(0.33737805363917694),
 array(0.353903272935296),
 array(0.5222953350184656),
 array(0.48713113082761333),
 array(0.5083882498828556),
 array(0.43486536496458256),
 array(0.5442879729822081),
 array(0.4261337915195932),
 array(0.5128224382316354),
 array(0.39083141560972273),
 array(0.40334206958023366),
 array(0.3307378827936723),
 array(0.4098851975953147),
 array(0.430134062190428),
 array(0.27972721850577137),
 array(0.4781791226284806),
 array(0.4664643785726648),
 array(0.5045802553170653),
 array(0.3002709192986779),
 array(0.34863665164215224),
 array(0.56

In [8]:
from sklearn import metrics

y_pred = model.predict(X).flatten()

y_pred = y_pred  > 0.5 
y_pred = y_pred.astype(int)

print "Accuracy: ", metrics.accuracy_score(y, y_pred)

0.857798165138
