# Logistic Regression - TF-IDF

## Read the data

In [None]:
import pandas as pd

# Load the training split (JSON records), then print a column summary and
# draw histograms of its numeric columns.
df_train = pd.read_json(
    "../data/aclIMDB_train.json",
    orient="records",
)
df_train.info()
df_train.hist()

In [None]:
# Load the validation split (JSON records), then print a column summary and
# draw histograms of its numeric columns.
df_val = pd.read_json(
    "../data/aclIMDB_val.json",
    orient="records",
)
df_val.info()
df_val.hist()

In [None]:
# Load the test split (JSON records), then print a column summary and
# draw histograms of its numeric columns.
df_test = pd.read_json(
    "../data/aclIMDB_test.json",
    orient="records",
)
df_test.info()
df_test.hist()

In [None]:
# Coerce every test review to a string and lower-case it, then preview the
# first five rows.
df_test["text"] = df_test["text"].map(lambda text: str(text).lower())
df_test.head(5)

In [None]:
# Coerce every validation review to a string and lower-case it, then preview
# the first five rows.
df_val["text"] = df_val["text"].map(lambda text: str(text).lower())
df_val.head(5)

In [None]:
# Coerce every training review to a string and lower-case it, then preview
# the first five rows.
df_train["text"] = df_train["text"].map(lambda text: str(text).lower())
df_train.head(5)

In [None]:
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# Sweep TfidfVectorizer's `min_df` and record, for each value, the resulting
# vocabulary size and the train/validation accuracy of an L2-regularised
# logistic regression (C=0.1).

# The label arrays do not depend on `min_df`, so build them once up front
# instead of on every iteration.
y_train = np.array(df_train["label"].values)
y_val = np.array(df_val["label"].values)

results = {"min_df": [], "vec_size": [], "Train accuracy": [], "Validation accuracy": []}
for min_df in tqdm([10, 25, 50, 100, 250, 500]):
    # The vectorizer is fitted on the training texts only; validation texts
    # are transformed with that fitted vectorizer.
    vectorizer = TfidfVectorizer(min_df=min_df)
    X_train = vectorizer.fit_transform(df_train["text"].values)
    x_val = vectorizer.transform(df_val["text"].values)

    clf = LogisticRegression(penalty="l2", C=0.1)
    clf.fit(X_train, y_train)

    preds_train = clf.predict(X_train)
    preds_val = clf.predict(x_val)

    results["min_df"].append(min_df)
    # Number of columns of the TF-IDF matrix, i.e. the feature count.
    results["vec_size"].append(X_train.shape[1])
    results["Train accuracy"].append(accuracy_score(y_train, preds_train))
    results["Validation accuracy"].append(accuracy_score(y_val, preds_val))

# One row per `min_df` value; displayed as the cell output.
results = pd.DataFrame(results)
results


In [None]:
# Select the `min_df` of the row with the highest validation accuracy
# (argmax returns the first such row when several tie).
best_min_df = results["min_df"].iloc[
    results["Validation accuracy"].to_numpy().argmax()
]
best_min_df

In [None]:
# Refit on train + validation with the selected `min_df`, then evaluate on the
# test split.
vectorizer = TfidfVectorizer(min_df=best_min_df)
# Shuffle the combined rows with a fixed seed so the row order (and therefore
# the whole run) is reproducible from one execution to the next.
df_train_val = pd.concat([df_train, df_val]).sample(frac=1, random_state=42)
X_train_val = vectorizer.fit_transform(df_train_val["text"].values)
y_train_val = np.array(df_train_val["label"].values)
# Test texts are only transformed; the vectorizer is fitted on train + validation.
X_test = vectorizer.transform(df_test["text"].values)
y_test = np.array(df_test["label"].values)

clf = LogisticRegression(penalty="l2", C=0.1)
clf.fit(X_train_val, y_train_val)

# Sanity check: first ten test labels next to the model's predictions for them.
print(y_test[:10])
print(clf.predict(X_test[:10]))
# Score on the full test split; displayed as the cell output.
clf.score(X_test, y_test)