In [1]:
import sys
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import linear_model
from sklearn.metrics import classification_report, accuracy_score
import numpy as np


In [2]:
# The 'pos' and 'neg' directories should be in the same folder as this code
# (or under `data_dir`). Each file is one document whose label is the name of
# the directory it lives in.
# The data is already split into cross-validation groups by file name:
# files starting with 'cv9' are used as the test set, the rest as training data.
print('data parsing...')
data_dir = ''
classes = ['pos', 'neg']

train_data = []
train_labels = []
test_data = []
test_labels = []

for curr_class in classes:
    dirname = os.path.join(data_dir, curr_class)
    # os.listdir returns names in arbitrary order; sorting makes the order of
    # the documents (and of everything trained on them) the same on every run.
    for fname in sorted(os.listdir(dirname)):
        with open(os.path.join(dirname, fname), 'r') as f:
            content = f.read()
        # the file is already closed here; only bookkeeping remains
        if fname.startswith('cv9'):
            test_data.append(content)
            test_labels.append(curr_class)
        else:
            train_data.append(content)
            train_labels.append(curr_class)
print('data parsed!')


data parsing...
data parsed!


In [3]:
# scikit-learn's TfidfVectorizer turns the raw texts into TF-IDF vectors.
# Unigram vocabulary (ngram_range=(1, 1)):
#   min_df=5   -> drop terms found in fewer than 5 training documents
#   max_df=0.8 -> drop terms found in more than 80% of the training documents
print('vectorisation for unigrams...')
vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    min_df=5,
    max_df=0.8,
    use_idf=True,
    sublinear_tf=True)
# The vocabulary and IDF weights are learned from the training texts only;
# the test texts are projected onto that same vocabulary.
train_vectors = vectorizer.fit_transform(train_data)
test_vectors = vectorizer.transform(test_data)
print('vectorisation for unigrams is done!')

vectorisation for unigrams...
vectorisation for unigrams is done!


In [4]:
# Fit a logistic-regression classifier on the unigram TF-IDF vectors and
# predict the labels of the test documents.
print('training logistic regression classifier for unigrams...')
lr = linear_model.LogisticRegression()
# np.float was only an alias of the builtin float; it was deprecated in
# NumPy 1.20 and removed in 1.24. np.float64 is the same dtype and is
# available in every NumPy version.
lr.fit(train_vectors.astype(np.float64), train_labels)
prediction_lr = lr.predict(test_vectors.astype(np.float64))
print('model is trained!')

training logistic regression classifier for unigrams...
model is trained!


In [5]:
# Per-class precision / recall / F1, then overall accuracy, for the unigram
# logistic-regression predictions.
true_labels = np.asarray(test_labels)
print(classification_report(true_labels, prediction_lr))
print(accuracy_score(true_labels, prediction_lr))

             precision    recall  f1-score   support

        neg       0.89      0.90      0.90       100
        pos       0.90      0.89      0.89       100

avg / total       0.90      0.90      0.89       200

0.895


In [6]:
# Same TF-IDF settings as for the unigrams, but the features are bigrams
# (ngram_range=(2, 2)).
print('vectorisation for bigrams...')
vectorizer2 = TfidfVectorizer(
    ngram_range=(2, 2),
    min_df=5,
    max_df=0.8,
    use_idf=True,
    sublinear_tf=True)
# Learn the bigram vocabulary on the training texts, then reuse it for the test texts.
train_vectors2 = vectorizer2.fit_transform(train_data)
test_vectors2 = vectorizer2.transform(test_data)
print('vectorisation for bigrams is done!')


vectorisation for bigrams...
vectorisation for bigrams is done!


In [7]:
# Fit a second logistic-regression classifier, this time on the bigram TF-IDF
# vectors, and predict the labels of the test documents.
print('training logistic regression classifier for bigrams...')
lr2 = linear_model.LogisticRegression()
# np.float was only an alias of the builtin float; it was deprecated in
# NumPy 1.20 and removed in 1.24. np.float64 is the same dtype and is
# available in every NumPy version.
lr2.fit(train_vectors2.astype(np.float64), train_labels)
prediction_lr2 = lr2.predict(test_vectors2.astype(np.float64))
print('model is trained!')

training logistic regression classifier for bigrams...
model is trained!


In [8]:
# Per-class precision / recall / F1, then overall accuracy, for the bigram
# logistic-regression predictions.
true_labels = np.asarray(test_labels)
print(classification_report(true_labels, prediction_lr2))
print(accuracy_score(true_labels, prediction_lr2))

             precision    recall  f1-score   support

        neg       0.89      0.88      0.88       100
        pos       0.88      0.89      0.89       100

avg / total       0.89      0.89      0.88       200

0.885


In [9]:
from sknn.mlp import Classifier, Layer

In [10]:
# A word2vec model is trained on the training data further below, after the
# original texts have been reformatted into tokenised sentences.
# Logging is configured at INFO level with a "time : level : message" layout
# (presumably so that gensim's progress messages become visible).
import gensim
import logging

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s')


In [11]:
import nltk

In [12]:

# Download NLTK's 'punkt' package; the English sentence tokenizer used for
# the word2vec preprocessing is loaded from it in the next cell.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/a/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
print('building data to feed to w2v...')
# English Punkt sentence splitter from the NLTK 'punkt' data.
tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
# Per-review containers filled by the next cell:
#   tokenizedbysentence -> list of sentence strings for each review
#   tokenized_sents     -> list of token lists (one per sentence) for each review
tokenizedbysentence, tokenized_sents = [], []


building data to feed to w2v...


In [14]:
# Split every training review into sentences and every sentence into word tokens.
# `a` supplies the word tokenizer and is reused later for the test reviews.
a = nltk.tokenize.punkt.PunktLanguageVars()

# Both lists are rebuilt from scratch here instead of being appended to, so
# re-running this cell cannot add a second copy of the corpus (which would
# produce more review vectors than there are training labels).
tokenizedbysentence = [tokenizer.tokenize(review.strip()) for review in train_data]
tokenized_sents = [
    [a.word_tokenize(sentence) for sentence in review_sentences]
    for review_sentences in tokenizedbysentence]


In [15]:
# Drop the per-review nesting: word2vec is fed one flat list of tokenised sentences.
sentences = []
for review_sentences in tokenized_sents:
    sentences.extend(review_sentences)
print('data are prepared to build a model!')


data are prepared to build a model!


In [16]:
print('building a model')
# Train word2vec on the training sentences:
#   size=100     -> every word gets a 100-dimensional vector
#   min_count=10 -> words occurring fewer than 10 times are left out
#   workers=4    -> number of worker threads
model = gensim.models.Word2Vec(
    sentences,
    size=100,
    min_count=10,
    workers=4)
print('model is built!')


building a model
model is built!


In [17]:

print('building train and test data...')
# Represent every training review by the sum of the word2vec vectors of all
# its words (100 dimensions, matching the model built above).
train_vector3 = []
for review in tokenized_sents:
    review_vector = np.zeros(100)
    for sentence_tokens in review:
        for token in sentence_tokens:
            try:
                token_vector = model[token]
            except KeyError:
                # token is not in the word2vec vocabulary: it adds nothing to the sum
                continue
            review_vector = review_vector + token_vector
    train_vector3.append(review_vector)



building train and test data...


In [18]:
# Same preprocessing as for the training reviews, applied to the test reviews:
# sentence splitting, word tokenisation, then one summed word2vec vector per review.
test_tokenizedbysentence = []
test_tokenized_sents = []

for review_text in test_data:
    review_sentences = tokenizer.tokenize(review_text.strip())
    test_tokenizedbysentence.append(review_sentences)
    test_tokenized_sents.append([a.word_tokenize(sentence) for sentence in review_sentences])

test_vector3 = []
for review in test_tokenized_sents:
    review_vector = np.zeros(100)
    for sentence_tokens in review:
        for token in sentence_tokens:
            try:
                token_vector = model[token]
            except KeyError:
                # token is not in the word2vec vocabulary: it adds nothing to the sum
                continue
            review_vector = review_vector + token_vector
    test_vector3.append(review_vector)
print('data are ready!')
print('data are ready!')


data are ready!


In [19]:

print('traning a neural network with unigrams...')
# Multi-layer perceptron: three sigmoid hidden layers followed by a softmax
# output layer. The layer widths, learning rate and number of iterations are
# the values to experiment with.
nn = Classifier(
    learning_rate=0.0001,  # can be tuned
    n_iter=10000,          # increase if it helps
    layers=[
        Layer("Sigmoid", units=1000),               # 1st hidden layer; try more units
        Layer("Sigmoid", units=500, dropout=0.33),  # 2nd hidden layer; try more units
        Layer("Sigmoid", units=250),
        Layer("Softmax"),
    ])



traning a neural network with unigrams...


In [20]:
# Train the network on the summed word2vec review vectors; the call is the
# last expression of the cell, so its return value is displayed.
nn.fit(
    np.asarray(train_vector3),
    np.asarray(train_labels))



  "downsample module has been moved to the theano.tensor.signal.pool module.")


Classifier(batch_size=1, callback=None, debug=False, dropout_rate=None,
      f_stable=0.001,
      hidden0=<sknn.nn.Layer `Sigmoid`: units=1000, name=u'hidden0', frozen=False>,
      hidden1=<sknn.nn.Layer `Sigmoid`: units=500, name=u'hidden1', frozen=False, dropout=0.33>,
      hidden2=<sknn.nn.Layer `Sigmoid`: units=250, name=u'hidden2', frozen=False>,
      layers=[<sknn.nn.Layer `Sigmoid`: units=1000, name=u'hidden0', frozen=False>, <sknn.nn.Layer `Sigmoid`: units=500, name=u'hidden1', frozen=False, dropout=0.33>, <sknn.nn.Layer `Sigmoid`: units=250, name=u'hidden2', frozen=False>, <sknn.nn.Layer `Softmax`: units=2, name=u'output', frozen=False>],
      learning_momentum=0.9, learning_rate=0.0001, learning_rule=u'sgd',
      loss_type=None, n_iter=10000, n_stable=10, normalize=None,
      output=<sknn.nn.Layer `Softmax`: units=2, name=u'output', frozen=False>,
      parameters=None, random_state=None, regularize=None, valid_set=None,

In [21]:
# Predict the labels of the test reviews from their summed word2vec vectors.
test_features3 = np.asarray(test_vector3)
prediction_mlp = nn.predict(test_features3)
print('neural network is trained!')


[(200, 2)]
neural network is trained!


In [22]:
# Per-class precision / recall / F1, then overall accuracy, for the first
# neural network.
true_labels = np.asarray(test_labels)
print(classification_report(true_labels, prediction_mlp))
print(accuracy_score(true_labels, prediction_mlp))

             precision    recall  f1-score   support

        neg       0.66      0.59      0.62       100
        pos       0.63      0.69      0.66       100

avg / total       0.64      0.64      0.64       200

0.64


In [23]:
print('processing data for bigrams...')
# Split every feature name of the bigram vectoriser on whitespace, giving the
# list of its tokens ([first_word, second_word]).
words = [feature_name.split() for feature_name in vectorizer2.get_feature_names()]


processing data for bigrams...


In [24]:
def _vector_or_zeros(token):
    """Return the word2vec vector of `token`, or 100 zeros if the model does not know it."""
    try:
        return model[token]
    except KeyError:
        return np.zeros(100)


# One 200-dimensional vector per bigram feature: the vector of the first word
# followed by the vector of the second word, with zeros standing in for any
# word that is missing from the word2vec model.
bigram_vectors = [
    np.concatenate((_vector_or_zeros(pair[0]), _vector_or_zeros(pair[1])), axis=0)
    for pair in words]



In [25]:
# Turn every training review into one 200-dimensional vector: the sum of the
# vectors of all bigram features, each weighted by the review's TF-IDF value
# for that bigram.
# That weighted sum is the matrix product
#   (reviews x features) . (features x 200),
# so it is computed as a single sparse-dense product instead of densifying the
# TF-IDF matrix and looping over every (review, feature) pair in Python.
# The values agree with the explicit loop up to floating-point rounding.
# reshape(-1, 200) keeps the shape valid even if there are no bigram features.
bigram_matrix = np.asarray(bigram_vectors, dtype=np.float64).reshape(-1, 200)
# list(...) keeps train_vector4 a list with one array per review.
train_vector4 = list(np.asarray(train_vectors2.dot(bigram_matrix)))

print('data are proccesed!')



data are proccesed!


In [26]:

print('traning a neural network with bigrams...')
# Same architecture as the first network: three sigmoid hidden layers followed
# by a softmax output layer.
# (An earlier, much smaller configuration was a single Sigmoid layer with
# units=10, learning_rate=0.001 and n_iter=25.)
nn1 = Classifier(
    learning_rate=0.0001,  # can be tuned
    n_iter=10000,          # increase if it helps
    layers=[
        Layer("Sigmoid", units=1000),               # 1st hidden layer; try more units
        Layer("Sigmoid", units=500, dropout=0.33),  # 2nd hidden layer; try more units
        Layer("Sigmoid", units=250),
        Layer("Softmax"),
    ])


traning a neural network with bigrams...


In [27]:
# Train the second network on the TF-IDF-weighted bigram vectors; the call is
# the last expression of the cell, so its return value is displayed.
nn1.fit(
    np.asarray(train_vector4),
    np.asarray(train_labels))



Classifier(batch_size=1, callback=None, debug=False, dropout_rate=None,
      f_stable=0.001,
      hidden0=<sknn.nn.Layer `Sigmoid`: units=1000, name=u'hidden0', frozen=False>,
      hidden1=<sknn.nn.Layer `Sigmoid`: units=500, name=u'hidden1', frozen=False, dropout=0.33>,
      hidden2=<sknn.nn.Layer `Sigmoid`: units=250, name=u'hidden2', frozen=False>,
      layers=[<sknn.nn.Layer `Sigmoid`: units=1000, name=u'hidden0', frozen=False>, <sknn.nn.Layer `Sigmoid`: units=500, name=u'hidden1', frozen=False, dropout=0.33>, <sknn.nn.Layer `Sigmoid`: units=250, name=u'hidden2', frozen=False>, <sknn.nn.Layer `Softmax`: units=2, name=u'output', frozen=False>],
      learning_momentum=0.9, learning_rate=0.0001, learning_rule=u'sgd',
      loss_type=None, n_iter=10000, n_stable=10, normalize=None,
      output=<sknn.nn.Layer `Softmax`: units=2, name=u'output', frozen=False>,
      parameters=None, random_state=None, regularize=None, valid_set=None,

In [28]:
# Turn every test review into one 200-dimensional vector: the sum of the
# vectors of all bigram features, each weighted by the review's TF-IDF value
# for that bigram.
# That weighted sum is the matrix product
#   (reviews x features) . (features x 200),
# so it is computed as a single sparse-dense product instead of densifying the
# TF-IDF matrix and looping over every (review, feature) pair in Python.
# The values agree with the explicit loop up to floating-point rounding.
# reshape(-1, 200) keeps the shape valid even if there are no bigram features.
bigram_matrix = np.asarray(bigram_vectors, dtype=np.float64).reshape(-1, 200)
# list(...) keeps test_vector4 a list with one array per review.
test_vector4 = list(np.asarray(test_vectors2.dot(bigram_matrix)))



In [29]:
# Predict the labels of the test reviews from their weighted bigram vectors.
test_features4 = np.asarray(test_vector4)
prediction_mlp2 = nn1.predict(test_features4)
print('network is built!')


[(200, 2)]
network is built!


In [30]:
# Per-class precision / recall / F1, then overall accuracy, for the second
# (bigram) neural network.
true_labels = np.asarray(test_labels)
print(classification_report(true_labels, prediction_mlp2))
print(accuracy_score(true_labels, prediction_mlp2))

             precision    recall  f1-score   support

        neg       0.64      0.49      0.56       100
        pos       0.59      0.73      0.65       100

avg / total       0.62      0.61      0.60       200

0.61


In [31]:
from sknn.mlp import Classifier, Convolution, Layer

In [32]:
print('building cnn for bigrams...')
# Small convolutional classifier: one rectifier convolution layer with 10
# channels and a 1x10 kernel, followed by a softmax output layer.
cnn = Classifier(
    learning_rate=0.02,
    n_iter=5,
    layers=[
        Convolution("Rectifier", channels=10, kernel_shape=(1, 10)),
        Layer("Softmax"),
    ])


building cnn for bigrams...


In [33]:
# Sanity check: (number of training reviews, vector length) of the bigram features.
np.shape(train_vector4)

(1800, 200)

In [34]:
# Each 200-dimensional review vector is laid out as a 20x10 grid for the
# convolution layer. The number of reviews is inferred with -1 instead of
# being hard-coded to 1800, so the cell keeps working if the size of the
# training set changes.
cnn.fit(np.reshape(np.asarray(train_vector4), (-1, 20, 10)), np.asarray(train_labels))


  border_mode=border_mode)


Classifier(batch_size=1, callback=None, debug=False, dropout_rate=None,
      f_stable=0.001,
      hidden0=<sknn.nn.Convolution `Rectifier`: channels=10, scale_factor=(1, 1), name=u'hidden0', frozen=False, kernel_shape=(1, 10), kernel_stride=(1, 1), pool_shape=(1, 1), border_mode=u'valid'>,
      layers=[<sknn.nn.Convolution `Rectifier`: channels=10, scale_factor=(1, 1), name=u'hidden0', frozen=False, kernel_shape=(1, 10), kernel_stride=(1, 1), pool_shape=(1, 1), border_mode=u'valid'>, <sknn.nn.Layer `Softmax`: units=2, name=u'output', frozen=False>],
      learning_momentum=0.9, learning_rate=0.02, learning_rule=u'sgd',
      loss_type=None, n_iter=5, n_stable=10, normalize=None,
      output=<sknn.nn.Layer `Softmax`: units=2, name=u'output', frozen=False>,
      parameters=None, random_state=None, regularize=None, valid_set=None,

In [35]:
# The test vectors get the same 20x10 layout as the training vectors. The
# number of reviews is inferred with -1 instead of being hard-coded to 200,
# so the cell keeps working if the size of the test set changes.
prediction_cnn = cnn.predict(np.reshape(np.asarray(test_vector4), (-1, 20, 10)))
print('cnn is built!')

[(200, 2)]
cnn is built!


In [36]:
# Per-class precision / recall / F1, then overall accuracy, for the
# convolutional network.
true_labels = np.asarray(test_labels)
print(classification_report(true_labels, prediction_cnn))
print(accuracy_score(true_labels, prediction_cnn))

             precision    recall  f1-score   support

        neg       0.64      0.16      0.26       100
        pos       0.52      0.91      0.66       100

avg / total       0.58      0.54      0.46       200

0.535
