## Download data

In [None]:
# Fetch the two input files by ID with gdown and store them under ../data/:
# the IMDB reviews CSV and the stop-word list used by the vectorizer config.
# NOTE(review): assumes ../data/ already exists — confirm before running.
!gdown 1Qv-wJdXkOAwDwRf3lFOiL9Hjm7sO-c74 -O ../data/imdb_part_1.csv
!gdown 1nyIDGh7bpzYq0rWgOh_4pYVq1zoGcrlL -O ../data/stopwords.txt

## Config

In [None]:
# Experiment settings. The "vectorizer" entry is unpacked as keyword
# arguments into TfidfVectorizer further down, so its keys must be valid
# TfidfVectorizer parameters.
config = {
    "vectorizer": dict(
        max_features=100,  # upper bound on the vocabulary size
        min_df=0.05,  # float value: minimum share of documents containing a term
    ),
}

In [None]:
# Load the stop-word list (one word per line) and register it in the
# vectorizer settings.
# The encoding is pinned so the file is decoded the same way on every
# platform instead of depending on the locale default.
with open("../data/stopwords.txt", "r", encoding="utf-8") as stopwords_file:
    # Iterate the file object directly (no intermediate readlines() list) and
    # drop blank lines so that no empty string ends up among the stop words.
    stripped_lines = (line.strip() for line in stopwords_file)
    stop_words = [word for word in stripped_lines if word]
config["vectorizer"]["stop_words"] = stop_words

## Read Data

In [None]:
import pandas as pd

In [None]:
# Load the downloaded IMDB CSV into a DataFrame; the `review` and `sentiment`
# columns are read from it below.
df = pd.read_csv("../data/imdb_part_1.csv")

In [None]:
# Bare expression: the notebook renders the DataFrame as this cell's output.
df

In [None]:
# Raw review strings, in row order.
texts = df["review"].tolist()
# Binary targets: 1 where the sentiment is exactly 'positive', 0 for anything
# else. The comparison is vectorised and converted back to plain Python ints.
is_positive = df["sentiment"] == 'positive'
labels = is_positive.astype(int).tolist()

## Train/Test split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# reproducible between runs.
texts_train, texts_test, labels_train, labels_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

## Vectorizer

In [None]:
# Build the TF-IDF vectorizer from the config: every key under
# config["vectorizer"] (max_features, min_df, stop_words) is passed as a
# keyword argument.
vectorizer = TfidfVectorizer(**config["vectorizer"])

In [None]:
# Fit the vectorizer on the training texts only, then reuse the fitted
# vectorizer to transform the test texts, so the test set does not influence
# the learned vocabulary or weights.
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)

In [None]:
# Inspect the fitted vocabulary (term -> feature column index) as cell output.
vectorizer.vocabulary_

## Train model

In [None]:
import xgboost as xgb

In [None]:
# Gradient-boosted tree classifier with library-default hyperparameters;
# n_jobs=-1 requests all available CPU threads for training.
model = xgb.XGBClassifier(n_jobs=-1)

In [None]:
# Train the classifier on the TF-IDF features of the training split and the
# matching 0/1 sentiment labels.
model.fit(X_train, labels_train)

## Predict on test

In [None]:
# Predict class labels for the held-out test features.
predictions = model.predict(X_test)

In [None]:
# Share of test samples whose predicted label matches the true label.
accuracy = accuracy_score(labels_test, predictions)

In [None]:
# Bare expression: shows the test accuracy as this cell's output.
accuracy