# Logistic Regression

## Read the data 

In [None]:
import pandas as pd

# Load the training split (a JSON array of records) into a DataFrame.
df_train = pd.read_json("../data/aclIMDB_train.json", orient="records")
# Quick sanity checks: schema / null counts, histograms of numeric columns, first rows.
df_train.info()
df_train.hist()
df_train.head()

In [None]:
# Load the validation split (a JSON array of records) into a DataFrame.
df_val = pd.read_json("../data/aclIMDB_val.json", orient="records")
# Quick sanity checks: schema / null counts, histograms of numeric columns, first rows.
df_val.info()
df_val.hist()
df_val.head()

In [None]:
# Load the test split (a JSON array of records) into a DataFrame.
df_test = pd.read_json("../data/aclIMDB_test.json", orient="records")
# Quick sanity checks: schema / null counts, histograms of numeric columns, first rows.
df_test.info()
df_test.hist()
df_test.head()

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# English stop words, stored as a set so the per-token membership tests in this
# notebook are O(1) instead of a linear scan over a list.
# NOTE: requires the NLTK "stopwords" corpus to be available locally.
STOPWORDS = set(stopwords.words("english"))

def is_html_tag(word):
    """Return True when a whitespace-delimited token looks like an HTML tag fragment.

    A token is treated as a fragment when it starts with "<", ">" or "/", or
    when it is "br" followed by nothing or by a non-letter (e.g. "br", "br/>").
    Ordinary words that merely begin with "br" ("brilliant", "broken") are
    real content and are NOT reported as tags.

    Args:
        word: A single token (string) taken from a review.

    Returns:
        bool: True if the token should be discarded as markup.
    """
    w = word.replace("\n", "")
    if w.startswith(("<", ">", "/")):
        return True

    stripped = w.strip()
    # Require end-of-token or a non-letter right after "br" so that real words
    # starting with "br" are not mistaken for a line-break tag.
    return stripped[:2] == "br" and not stripped[2:3].isalpha()

def remove_html_tags(sentence):
    """Drop tokens that look like HTML tag fragments and normalise whitespace.

    The sentence is split on any run of whitespace; every token rejected by
    ``is_html_tag`` is discarded and the survivors are re-joined with single
    spaces.
    """
    kept = []
    for piece in sentence.split():
        if is_html_tag(piece):
            continue
        kept.append(piece)

    return " ".join(kept)

def tokenize_sentence(sentence):
    """Lower-case the sentence and split it into tokens with NLTK's ``word_tokenize``."""
    lowered = sentence.lower()
    tokens = word_tokenize(lowered)
    return tokens

def remove_stopwords(sentence, stop_words=None):
    """Remove stop words from a space-separated sentence.

    The comparison is case-insensitive: this function runs before the text is
    lower-cased, and the stop-word list is lower-case, so a capitalised word
    such as "The" would otherwise slip through. Kept words retain their
    original casing.

    Args:
        sentence: Text whose words are separated by single spaces.
        stop_words: Optional collection of lower-case words to drop.
            Defaults to the module-level ``STOPWORDS``.

    Returns:
        str: The sentence without stop words, joined by single spaces.
    """
    if stop_words is None:
        stop_words = STOPWORDS

    return " ".join(
        word for word in sentence.split(" ") if word.lower() not in stop_words
    )

def remove_punctuation(tokenized_sentence):
    """Keep only the purely alphabetic tokens.

    Punctuation marks, numbers and mixed tokens such as "it's" are all
    filtered out because ``str.isalpha`` is False for them.
    """
    return list(filter(str.isalpha, tokenized_sentence))

def preprocess(sentence):
    """Turn a raw review into a list of clean tokens.

    Steps, in order: coerce to ``str``, strip HTML tag fragments, remove stop
    words, lower-case and tokenize, then keep alphabetic tokens only.
    """
    text = str(sentence)
    without_tags = remove_html_tags(text)
    without_stopwords = remove_stopwords(without_tags)
    tokens = tokenize_sentence(without_stopwords)
    return remove_punctuation(tokens)

# Smoke-test the preprocessing pipeline on the second training review.
print(preprocess(df_train["text"].values[1]))


In [None]:
def sentiment(value):
    """Map a label to its sentiment name: 1 is "positive", anything else is "negative"."""
    return "positive" if value == 1 else "negative"

# Derive a human-readable "sentiment" column from the numeric label of every split.
# Each frame is sized by its own rows, so splits of different lengths cannot be
# mixed up; no placeholder column is needed because the assignment creates it.
for frame in (df_train, df_val, df_test):
    frame["sentiment"] = frame["label"].apply(sentiment)

In [None]:
# Replace each raw training review with its list of preprocessed tokens.
df_train["text"] = df_train["text"].apply(preprocess)
df_train.head()

In [None]:
def build_vocab(tokenized_input, vocab_size, stop_words=None):
    """Build the vocabulary as the set of the most frequent tokens.

    Args:
        tokenized_input: Iterable of token lists (one list per document).
        vocab_size: Maximum number of tokens to keep.
        stop_words: Optional collection of words to ignore. Defaults to the
            module-level ``STOPWORDS``.

    Returns:
        set: Up to ``vocab_size`` tokens with the highest counts. Ties keep
        first-seen order before the cut-off is applied.
    """
    from collections import Counter

    if stop_words is None:
        stop_words = STOPWORDS

    # Tokens are re-filtered here as a safety net, even if already preprocessed.
    counts = Counter(
        token
        for tokens in tokenized_input
        for token in tokens
        if token not in stop_words and token.isalpha()
    )

    # "br" is presumably residue of "<br />" markup that survives tokenization.
    # pop() with a default avoids a KeyError when the corpus has no such token.
    counts.pop("br", None)

    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return {token for token, _ in ranked[:vocab_size]}

In [None]:
# Upper bound on the number of distinct tokens kept in the vocabulary.
VOCAB_SIZE = 10000
VOCAB = build_vocab(df_train["text"].values, VOCAB_SIZE) # (!) Always build the vocabulary on the TRAIN dataset

In [None]:
len(VOCAB)

In [None]:
def get_frequencies_for_sentiment(df, vocab=None):
    """Count, per sentiment, how often each vocabulary token occurs in ``df``.

    Args:
        df: DataFrame with a "text" column holding token lists and a
            "sentiment" column holding "positive" or "negative".
        vocab: Optional collection of tokens to count. Defaults to the
            module-level ``VOCAB``.

    Returns:
        dict: ``{"positive": {token: count}, "negative": {token: count}}``.
    """
    if vocab is None:
        vocab = VOCAB

    dict_freqs = {"positive": {}, "negative": {}}

    # Iterate the rows of the frame that was passed in (not a global frame);
    # walking the two columns in lockstep avoids a slow per-row iloc lookup.
    for tokens, label in zip(df["text"], df["sentiment"]):
        for token in tokens:
            if token in vocab:
                dict_freqs[label][token] = dict_freqs[label].get(token, 0) + 1

    return dict_freqs

In [None]:
# Per-sentiment token counts computed from the training split.
frequency_table = get_frequencies_for_sentiment(df_train)

In [None]:
frequency_table["positive"]["happy"]

In [None]:
frequency_table["negative"]["happy"]

In [None]:
frequency_table["positive"]["frustrating"]

In [None]:
frequency_table["negative"]["frustrating"]

In [None]:
frequency_table["positive"]["boring"]

In [None]:
frequency_table["negative"]["boring"]

In [None]:
import numpy as np

def extract_features(frequency_table, tweet_tokens):
    """Build the two-value feature vector for one tokenized document.

    Every distinct token contributes its "positive" count and its "negative"
    count from ``frequency_table``; unknown tokens contribute 0. Repeated
    tokens are counted once because the tokens are de-duplicated first.

    Returns:
        pd.Series: with entries "positives" and "negatives".
    """
    unique_tokens = set(tweet_tokens)

    positive_total = sum(
        frequency_table["positive"].get(tok, 0) for tok in unique_tokens
    )
    negative_total = sum(
        frequency_table["negative"].get(tok, 0) for tok in unique_tokens
    )

    return pd.Series({"positives": positive_total, "negatives": negative_total})


In [None]:
# Turn every tokenized training review into a (positives, negatives) feature row.
X_train_logistic = df_train["text"].apply(lambda tokens: extract_features(frequency_table, tokens))

X_train_logistic

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fit the scaler on the training features only; the fitted scaler is then
# reused (transform only) for the validation and test features.
X_train_logistic = scaler.fit_transform(X_train_logistic)

In [None]:
# Ground-truth labels of the training split as a NumPy array.
y_train_logistic = df_train["label"].values

In [None]:
from sklearn.linear_model import LogisticRegression

# Train the classifier on the scaled training features (fixed random_state for reproducibility).
clf = LogisticRegression(random_state=0).fit(X_train_logistic, y_train_logistic)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy on the same data the model was fitted on (an optimistic estimate).
preds_train = clf.predict(X_train_logistic)

print("Train accuracy:", accuracy_score(y_train_logistic, preds_train))

## Validation

In [None]:
# Apply the same preprocessing and feature extraction used for training to the validation split.
df_val["text"] = df_val["text"].apply(preprocess)
X_val_logistic = df_val["text"].apply(lambda tokens: extract_features(frequency_table, tokens))
X_val_logistic = scaler.transform(X_val_logistic) # (!) You should only TRANSFORM the validation data (never re-fit the scaler)
y_val_logistic = df_val["label"].values

In [None]:
# Evaluate the trained classifier on the validation split.
preds_val = clf.predict(X_val_logistic)

print("Validation accuracy:", accuracy_score(y_val_logistic, preds_val))

## Test

In [None]:
# Apply the same preprocessing and feature extraction used for training to the test split.
df_test["text"] = df_test["text"].apply(preprocess)
X_test_logistic = df_test["text"].apply(lambda tokens: extract_features(frequency_table, tokens))
X_test_logistic = scaler.transform(X_test_logistic) # (!) You should only TRANSFORM the test data
y_test_logistic = df_test["label"].values

In [None]:
# Evaluate the trained classifier on the held-out test split.
preds_test = clf.predict(X_test_logistic)

print("Test accuracy:", accuracy_score(y_test_logistic, preds_test))