# Logistic Regression from scratch with SGD + Hashing trick

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
from tqdm.notebook import tqdm
import math
import sklearn
tqdm.pandas()

In [None]:
from nltk.stem import WordNetLemmatizer

# Module-level WordNet lemmatizer shared by lemmatize_sentence() below.
# NOTE(review): presumably needs the NLTK "wordnet" corpus to be installed in
# the runtime environment — confirm before running outside the original setup.
lemmatizer = WordNetLemmatizer()

def lemmatize_sentence(sentence):
    """Lemmatize every whitespace-separated word and rejoin with single spaces."""
    words = sentence.split()
    return " ".join(lemmatizer.lemmatize(word) for word in words)

def simple_clean(text):
    """Lowercase *text*, replace each run of non a-z/space characters with a space, then lemmatize."""
    letters_only = re.sub(r"[^a-z ]+", " ", text.lower())
    return lemmatize_sentence(letters_only)
def encode_sentiment(text):
    """Map a sentiment label to its binary class id.

    Args:
        text: Sentiment label, expected to be "positive" or "negative".

    Returns:
        1 for "positive", 0 for "negative".

    Raises:
        ValueError: If *text* is any other value.
    """
    if text == "positive":
        return 1
    if text == "negative":
        return 0
    # Fail loudly instead of printing and implicitly returning None: a None
    # label would otherwise end up in the target column unnoticed.
    raise ValueError(f"unknown sentiment label: {text!r}")
def decode_sentiment(number):
    """Translate a class id back to its label: 1 -> "positive", 0 -> "negative", anything else -> None."""
    for class_id, label in ((1, "positive"), (0, "negative")):
        if number == class_id:
            return label
    return None

In [None]:
# Load the review dataset; later cells read its "review" and "sentiment" columns.
df = pd.read_csv("../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

In [None]:
# Numeric target column derived from the text label via encode_sentiment
# ("positive" -> 1, "negative" -> 0).
df["sentiment_value"] = df["sentiment"].apply(encode_sentiment)
# Cleaned review text produced by simple_clean; progress_apply is the
# tqdm-enabled variant of apply registered by tqdm.pandas().
df["review_clean"] = df["review"].progress_apply(simple_clean)

In [None]:
def tokenize(text):
    """Split *text* into bigram and unigram tokens.

    Args:
        text: Input string whose words are separated by whitespace.

    Returns:
        A list containing every adjacent-word bigram ("w1 w2") in document
        order, followed by every unigram in document order. Empty or
        whitespace-only text yields an empty list.
    """
    # split() with no argument collapses repeated, leading and trailing
    # whitespace, so no empty-string unigrams (or bigrams such as "a ") are
    # emitted; for single-space-separated text the result is unchanged.
    one_gram = text.split()
    bigrams = [f"{left} {right}" for left, right in zip(one_gram, one_gram[1:])]
    return bigrams + one_gram

def hash_trick(tokens,dim=512):
    """Build a signed hashed bag-of-tokens vector of length *dim*.

    Each token is hashed with murmurhash3_32; the bucket index is
    abs(hash) % dim, and the bucket is incremented for a non-negative hash
    and decremented for a negative one.
    """
    features = np.zeros(dim)
    for token in tokens:
        hashed = sklearn.utils.murmurhash3_32(token)
        sign = 1 if hashed >= 0 else -1
        features[abs(hashed) % dim] += sign
    return features

def preprocess(batch_text,dim=512):
    """Turn a batch of texts into a hashed feature matrix.

    Args:
        batch_text: Iterable of strings (one document per element).
        dim: Number of hash buckets, i.e. the width of the output matrix.

    Returns:
        A 2-D array with one row per document and *dim* columns. An empty
        batch yields an array of shape (0, dim) instead of raising.
    """
    rows = [hash_trick(tokenize(text), dim) for text in batch_text]
    if not rows:
        # np.stack / np.concatenate reject an empty list of arrays.
        return np.zeros((0, dim))
    return np.stack(rows, axis=0)

In [None]:
# Hold out 20% of the cleaned reviews (and their labels) as the evaluation
# split; random_state=42 pins the shuffle so the split is repeatable.
X_train,X_test,y_train,y_test = train_test_split(df["review_clean"].values,df["sentiment_value"].values, test_size=0.2, random_state=42)

In [None]:
def sigmoid(x):
    """Numerically stable logistic function 1 / (1 + exp(-x)).

    Args:
        x: Scalar or NumPy array of real values.

    Returns:
        The element-wise sigmoid of *x*, with values in [0, 1].
    """
    # np.where evaluates both of its value arguments on every element, so
    # exponentiating +x and -x directly overflows for large |x| even though
    # the overflowing branch is discarded. Exponentiating -|x| keeps the
    # argument non-positive, so exp() can never overflow.
    z = np.exp(-np.abs(x))
    out = np.where(x >= 0,
                   1 / (1 + z),   # x >= 0: z == exp(-x)
                   z / (1 + z))   # x <  0: z == exp(x)
    # Mathematically redundant; kept as a cheap guard on the output range.
    return np.clip(out,0,1)

def loss(y_prob,y_true):
    """Mean binary cross-entropy between predicted probabilities and 0/1 targets.

    Probabilities are clipped away from exactly 0 and 1 so the logarithms
    stay finite.
    """
    clipped = np.clip(y_prob,0.0000001,0.9999999)
    positive_term = y_true*(np.log(clipped))
    negative_term = (1-y_true)*np.log(1-clipped)
    return -np.mean(positive_term + negative_term)

class LogRegModel:
    """Binary logistic regression trained with hand-derived gradients and momentum."""

    def __init__(self,size_w):
        """Draw *size_w* weights from a standard normal; bias and momentum buffers start at zero."""
        self.w = np.random.randn(size_w)
        self.b = 0
        # Running (exponentially averaged) gradients for the weights and the bias.
        self.mw = np.zeros_like(self.w)
        self.mb = 0

    def forward(self,x):
        """Return sigmoid(x . w + b) for the feature rows in *x*."""
        logits = x.dot(self.w) + self.b
        return sigmoid(logits)

    def update(self,x,y_true,lr,reg=0.1):
        """Take one momentum gradient step on a batch and return its loss.

        There is no autograd here: predictions are computed internally via
        forward() and the gradients are written out by hand.

        Args:
            x: Batch feature matrix, one row per sample.
            y_true: 0/1 targets for the batch.
            lr: Step size applied to the momentum buffers.
            reg: Coefficient of the weight-decay term added to the weight gradient.

        Returns:
            The cross-entropy of the batch, evaluated with the parameters as
            they were before this step.
        """
        n_samples = y_true.shape[0]
        y_prob = self.forward(x)
        residual = y_prob-y_true

        grad_w = 1/n_samples * (x.T.dot(residual) + reg*self.w)
        grad_b = 1/n_samples * np.sum(residual)

        # momentum: blend 10% of the fresh gradient into the running average
        self.mw = 0.9*self.mw + 0.1*grad_w
        self.mb = 0.9*self.mb + 0.1*grad_b

        self.w -= lr*self.mw
        self.b -= lr*self.mb
        return loss(y_prob,y_true)

In [None]:
class AverageMeter:
    """Keep the most recent value together with a count-weighted running mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set the last value, mean, total and count back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record *val* as having been observed *n* times and refresh the mean."""
        self.count += n
        self.sum += val * n
        self.val = val
        self.avg = self.sum / self.count

In [None]:
# Training configuration.
epochs = 20
batch_size = 100
dim = 16384          # number of hash buckets == number of model weights
model = LogRegModel(dim)
lr = 1
avg_meter = AverageMeter()

num_batches = math.ceil(X_train.shape[0]/batch_size)

# The evaluation split never changes, so hash it once up front instead of
# re-hashing every review at the end of each epoch.
# NOTE: this keeps a (len(X_test), dim) feature matrix alive for the whole run;
# previously the same matrix was rebuilt (and discarded) every epoch.
X_val_features = preprocess(X_test,dim)

for epoch in range(epochs):
    tk = tqdm(range(num_batches))
    total_loss = 0
    # Fresh random ordering of the training set for this epoch.
    perm = np.arange(X_train.shape[0])
    np.random.shuffle(perm)
    shuffled_x = X_train[perm]
    shuffled_y = y_train[perm]
    avg_meter.reset()
    # Training
    for i in tk:
        # Training batches are hashed on the fly from the raw text.
        x_batch = shuffled_x[i*batch_size:(i+1)*batch_size]
        x_batch = preprocess(x_batch,dim)
        y_batch = shuffled_y[i*batch_size:(i+1)*batch_size]

        batch_loss = model.update(x_batch,y_batch,lr)
        # avg_meter weights each batch by its size (shown in the progress bar);
        # total_loss is the plain per-batch sum used in the epoch summary.
        avg_meter.update(batch_loss,x_batch.shape[0])
        total_loss += batch_loss
        tk.set_postfix({'loss':avg_meter.avg})
    # validation
    y_prob = model.forward(X_val_features)
    val_loss = loss(y_prob,y_test)
    val_acc = ((y_prob>0.5) == y_test).mean().item()
    print(f"Epoch: {epoch} Training Loss:{total_loss/num_batches} validation_loss:{val_loss} validation accuracy:{val_acc}")

    # Halve the learning rate after epochs 0, 5, 10 and 15.
    # NOTE(review): epoch%5==0 is already true for the very first epoch, so lr
    # drops to 0.5 after one pass — confirm this is intended rather than
    # (epoch+1)%5==0.
    if epoch%5==0:
        lr *= 0.5

In [None]:
# Boolean predictions for the held-out split: True where the predicted
# probability exceeds 0.5.
y_pred = model.forward(preprocess(X_test,dim)) > 0.5

In [None]:
from sklearn.metrics import classification_report
# Print scikit-learn's text summary of classification metrics for the
# held-out labels versus the thresholded predictions.
print(classification_report(y_test,y_pred))