In [1]:
import pandas as pd
import numpy as np
import os
import pickle
from operator import itemgetter

In [2]:
# List the entry names (not the file contents) of each training directory.
# NOTE: both names are rebound further down by the read_files(...) calls,
# which replace these name lists with the actual review texts.
train_pos = os.listdir('./train/pos')
train_neg = os.listdir('./train/neg')

In [23]:
def read_files(address):
    """Read every file found directly inside ``address``.

    Input:  address -> path of a directory
    Return: list of strings, one per file (decoded as UTF-8), ordered by
            file name so the result does not depend on the arbitrary order
            in which os.listdir reports the directory entries
    """
    contents = []
    for file_name in sorted(os.listdir(address)):
        # The context manager closes each handle as soon as it has been read
        # instead of leaving it open until garbage collection.
        with open(os.path.join(address, file_name), 'r', encoding='utf-8') as handle:
            contents.append(handle.read())
    return contents

def shuffle_arr(arr, seed=314):
    """Shuffle ``arr`` in place with a reproducible permutation.

    Input:  arr  -> mutable sequence to reorder (modified in place)
            seed -> value used to reseed NumPy's global random generator
                    right before shuffling
    Return: None

    Because the global generator is reseeded on every call, two sequences of
    the same length shuffled with the same seed are reordered identically.
    """
    np.random.seed(seed)      # side effect: resets the global NumPy RNG state
    np.random.shuffle(arr)    # in-place reordering

def clean_and_split(text):
    """Normalise a string and break it into whitespace-separated tokens.

    Input:  text -> string
    Return: list of strings; the text is stripped of surrounding whitespace,
            lower-cased and split on runs of whitespace (punctuation and
            markup stay attached to the tokens)
    """
    normalised = text.strip().lower()
    tokens = normalised.split()
    return tokens

def vocabulary_builder(collection, size=2000):
    """Count the tokens of a text collection and keep the most frequent ones.

    Input:  collection -> iterable of strings
            size       -> maximum number of words kept in the vocabulary
    Return: tuple (vocab, id2word, word2id) where
            vocab   -> list of (word, count) pairs, most frequent first
            id2word -> list of the kept words in the same order
            word2id -> dict mapping each kept word to its index in id2word
    """
    counts = {}
    for sentence in collection:
        for word in clean_and_split(sentence):
            # dict.get replaces the former bare ``except:``, which would have
            # hidden any unrelated error raised while counting.
            counts[word] = counts.get(word, 0) + 1
    # sorted() is stable, so words with equal counts keep first-seen order.
    most_common = sorted(counts.items(), key=itemgetter(1), reverse=True)[:size]
    id2word = [word for word, _ in most_common]
    word2id = {word: index for index, word in enumerate(id2word)}
    return most_common, id2word, word2id

def make_set(pos, neg, vocab, shuffle=True):
    """Build a labelled bag-of-words data set from positive and negative texts.

    Input:  pos     -> list of strings labelled 1
            neg     -> list of strings labelled 0
            vocab   -> list of words defining the bag-of-words columns
            shuffle -> when True, samples and labels are reordered together
    Return: tuple (X, y) where X is the array produced by to_bow and y is the
            list of labels aligned with the rows of X
    """
    samples = [clean_and_split(text) for text in pos + neg]
    labels = [1] * len(pos) + [0] * len(neg)
    if shuffle:
        # Shuffle a single list of positions and reorder both sequences with
        # it, so the sample/label pairing cannot drift apart (shuffling the
        # two lists separately only stays aligned if both calls happen to
        # draw the same permutation).
        order = list(range(len(samples)))
        shuffle_arr(order)
        samples = [samples[i] for i in order]
        labels = [labels[i] for i in order]
    return to_bow(samples, vocab), labels

def to_bow(X, vocab):
    '''
    Input:  X -> list of samples, each sample being a list of tokens
            vocab -> list of words
    Return: Boolean array with one row per sample and one column per word of
            vocab, True where the word appears in the sample
    '''
    rows = []
    for sample in X:
        # Set membership is O(1); testing against the token list itself would
        # rescan the whole sample for every vocabulary word.
        present = set(sample)
        rows.append([word in present for word in vocab])
    return np.array(rows)

In [6]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), evaluated without overflow.

    Input:  x -> scalar or array of real numbers
    Return: NumPy scalar or array of the same shape with values in [0, 1]
    """
    x = np.asarray(x, dtype=float)
    # exp() is only ever applied to non-positive values, so it cannot overflow
    # (the direct formula overflows for large negative x).
    decay = np.exp(-np.abs(x))
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # [()] turns a 0-d result back into a NumPy scalar; arrays are unaffected.
    return result[()]

def prob(x, w):
    """Apply the sigmoid to the linear score np.dot(x, w).

    Input:  x -> sample vector or matrix of samples
            w -> weight vector
    Return: sigmoid of the dot product of x and w
    """
    scores = np.dot(x, w)
    return sigmoid(scores)

def loss(x, y, w):
    """Mean binary cross-entropy of the predictions prob(x, w) against y.

    Input:  x -> matrix of samples, one row per sample
            y -> 0/1 labels, one per row of x (list or array)
            w -> weight vector
    Return: -(1/m) * sum(y*log(p) + (1-y)*log(1-p)) with p = prob(x, w)
    """
    # Plain Python lists do not support ``1 - y``; convert once up front.
    y = np.asarray(y)
    m = x.shape[0]  # number of samples (rows of x)
    # Evaluate the model once and keep the probabilities strictly inside
    # (0, 1) so that log() never receives 0.
    eps = np.finfo(float).eps
    p = np.clip(prob(x, w), eps, 1 - eps)
    return -(1 / m) * np.sum(y * np.log(p) + (1 - y) * np.log(1 - p))

def gradient(x, y, w):
    """Compute (1/m) * x.T . (prob(x, w) - y), m being the number of rows of x.

    Input:  x -> matrix of samples, one row per sample
            y -> labels, one per row of x
            w -> weight vector
    Return: array with one entry per column of x
    """
    n_samples = x.shape[0]
    errors = prob(x, w) - y
    return (1 / n_samples) * np.dot(x.T, errors)
    
def fit(x, y, w=None, alpha=0.1, loss_f=loss, num_epochs=300, batch_size=20):
    """Train a weight vector with mini-batch gradient descent.

    Input:  x          -> 2-D array of samples, one row per sample
            y          -> labels, one per row of x (list or array)
            w          -> optional initial weights; when omitted or empty they
                          are drawn uniformly from [-0.5, 0.50001)
            alpha      -> learning rate
            loss_f     -> accepted for backward compatibility; the update rule
                          below does not use it
            num_epochs -> number of full passes over the data
            batch_size -> number of samples per update (must be >= 1)
    Return: the trained weight vector (a new array; a provided w is left
            unmodified)
    Raises: ValueError if batch_size is smaller than 1
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    x = np.asarray(x)
    y = np.asarray(y)  # labels may arrive as a plain Python list
    m = x.shape[0]
    if w is None or len(w) == 0:
        w = np.random.uniform(-0.5, 0.50001, size=x.shape[1])
    else:
        # Copy so the in-place updates below never touch the caller's array.
        w = np.array(w, dtype=float)
    for _ in range(num_epochs):
        # Step through the rows batch_size at a time; the last batch may be
        # shorter when batch_size does not divide m.
        for start in range(0, m, batch_size):
            end = start + batch_size
            b_x = x[start:end]
            b_y = y[start:end]
            # NOTE(review): the residual uses the raw linear score (the same
            # quantity predict() returns), not prob(); confirm whether a
            # sigmoid-based update via gradient() was intended.
            residuals = np.dot(b_x, w) - b_y
            w -= (alpha / m) * np.dot(b_x.T, residuals)
    return w

def predict(x, w):
    """Return the raw linear score of x under the weights w.

    Input:  x -> sample vector or matrix of samples
            w -> weight vector
    Return: np.dot(x, w) (no sigmoid or threshold is applied)
    """
    score = np.dot(x, w)
    return score

In [9]:
# Read the review texts of every split/label directory into lists of strings
train_pos, train_neg = read_files('./train/pos'), read_files('./train/neg')
test_pos, test_neg = read_files('./test/pos'), read_files('./test/neg')

# Shuffle both training lists in place to avoid "Sorted Bias", i.e. any
# dependence on the order in which the files were read
shuffle_arr(train_pos)
shuffle_arr(train_neg)

In [10]:
# Hold out the first 2500 reviews of each class for validation; the remaining
# reviews stay in the training set.
# NOTE: train_pos/train_neg are rebound to their own tails, so re-running this
# cell without reloading the data removes another 2500 reviews per class.
val_pos = train_pos[:2500]
val_neg = train_neg[:2500]
train_pos = train_pos[2500:]
train_neg = train_neg[2500:]

In [22]:
# Sanity check: number of words kept in the vocabulary.
# NOTE(review): id2word is only assigned by the vocabulary_builder(...) call in
# the cell below, so on a fresh kernel this cell must run after that one.
len(id2word)

2000

In [24]:
# Build the vocabulary from the training reviews only
vocab, id2word, word2id = vocabulary_builder(train_pos + train_neg)

# Transform each split into a binary bag-of-words matrix plus its labels;
# only the training split is shuffled
X_train, y_train = make_set(train_pos, train_neg, id2word)
X_val, y_val = make_set(val_pos, val_neg, id2word, shuffle=False)
X_test, y_test = make_set(test_pos, test_neg, id2word, shuffle=False)

In [30]:
# Draw one random weight per vocabulary word. The size follows id2word (the
# word list the bag-of-words columns were built from) instead of a hard-coded
# 2000, so it stays valid if the vocabulary size changes.
w = np.random.uniform(-0.5, 0.50001, size=len(id2word))
# Raw linear score of one validation sample under the untrained weights
predict(X_val[2], w)

-1.2167589398130239

In [61]:
# Persist the training matrix; the context manager flushes and closes the
# file even if pickling fails part-way.
with open('X_train', 'wb') as handle:
    pickle.dump(X_train, handle)

['the',
 'line',
 'is',
 'funnier',
 'in',
 'england,',
 'where,',
 'away',
 'from',
 "vixen!'s",
 'native',
 'america,',
 'the',
 'word',
 '"fanny"',
 'has',
 'a',
 'whole',
 'new',
 'meaning.',
 'sadly,',
 "it's",
 'the',
 'only',
 'laugh',
 "you'll",
 'get',
 'in',
 'this',
 'terrible',
 'sex',
 'comedy',
 'that',
 'is',
 'neither',
 'sexy',
 'nor',
 'funny.<br',
 '/><br',
 '/>oddly',
 'unalluring',
 'with',
 'painted-on',
 'eyebrows,',
 'erica',
 'gavin',
 '(acting',
 'ability:',
 'zero)',
 'is',
 'a',
 'nymphomaniac',
 'who',
 'lusts',
 'after',
 'her',
 'own',
 'brother,',
 'but',
 'rejects',
 'his',
 'black',
 'friend',
 'while',
 'making',
 'derogatory',
 'remarks',
 'about',
 'watermelons.',
 'as',
 'if',
 'in',
 'revenge,',
 'he',
 'asks',
 'her',
 'if',
 'she',
 'would',
 'go',
 'with',
 'a',
 'shetland',
 'pony.',
 'reference',
 'is',
 'also',
 'made',
 'to',
 '"making',
 'it',
 'with',
 'monkeys".',
 "gavin's",
 'ability',
 'to',
 'shake',
 'and',
 'tremble',
 'with',
 'or