In [1]:
import os
import re
import nltk
import time
import dotenv
import mlflow
import string
import dagshub
import logging
import warnings
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from IPython.display import clear_output
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

logging.getLogger().setLevel(logging.INFO)
warnings.simplefilter("ignore", UserWarning)
warnings.filterwarnings("ignore")


In [3]:
nltk.download("punkt", quiet=True)
nltk.download("stopwords", quiet=True)
nltk.download("wordnet", quiet=True)

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

In [4]:
imdb_df = pd.read_csv("imdb.csv")
imdb_df.shape

(50000, 2)

In [5]:
imdb_df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [6]:
df = imdb_df.sample(n=12000, random_state=42)
df.drop_duplicates(inplace=True)
df.to_csv(path_or_buf="sample.csv", index=False)
df.shape

(11973, 2)

In [7]:
df

Unnamed: 0,review,sentiment
33553,I really liked this Summerslam due to the look...,positive
9427,Not many television shows appeal to quite as m...,positive
199,The film quickly gets to a major chase scene w...,negative
12447,Jane Austen would definitely approve of this o...,positive
39489,Expectations were somewhat high for me when I ...,negative
...,...,...
23335,"I have seen tons of trash, in every language, ...",negative
49537,The core issues at play (God & Satan / Good & ...,negative
37824,There is no such a thing as perfect murder.Lie...,positive
6275,"The Cheesiest movie I've ever seen, Not scary,...",negative


In [8]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [9]:
def remove_html(text):
    return re.sub(r"<.*?>", " ", text)


def remove_urls(text):
    return re.sub(r"http\S+|www\S+", " ", text)


def remove_punctuations(text):
    return text.translate(str.maketrans("", "", string.punctuation))


def tokenize(text):
    return word_tokenize(text=text)


def remove_stopwords(tokens):
    return [w for w in tokens if w not in stop_words]


def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(w) for w in tokens]


In [10]:
def preprocess_text(text):
    text = text.lower()
    text = remove_html(text=text)
    text = remove_urls(text=text)
    text = remove_punctuations(text=text)
    tokens = tokenize(text=text)
    tokens = remove_stopwords(tokens=tokens)
    tokens = lemmatize_tokens(tokens=tokens)
    return " ".join(tokens)


In [11]:
df["sentiment"].value_counts()

sentiment
positive    6075
negative    5898
Name: count, dtype: int64

In [12]:
df["review"] = df["review"].astype(str).apply(preprocess_text)
df["sentiment"] = df["sentiment"].map({"negative": 0, "positive": 1})
df

Unnamed: 0,review,sentiment
33553,really liked summerslam due look arena curtain...,1
9427,many television show appeal quite many differe...,1
199,film quickly get major chase scene ever increa...,0
12447,jane austen would definitely approve one gwyne...,1
39489,expectation somewhat high went see movie thoug...,0
...,...,...
23335,seen ton trash every language every topic ever...,0
49537,core issue play god satan good evil tremendous...,0
37824,thing perfect murderlieutenant columbo know th...,1
6275,cheesiest movie ive ever seen scary bad 1st mo...,0


In [13]:
test_size = 0.2
max_features = 100
max_iter = 1000

vectorizer = CountVectorizer(max_features=max_features)
X = vectorizer.fit_transform(df["review"])
y = df["sentiment"]

In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=42
)

In [15]:
dotenv.load_dotenv()

dagshub_uri = os.getenv("DAGSHUB_URI")
dagshub_repo = os.getenv("DAGSHUB_REPO")
dagshub_username = os.getenv("DAGSHUB_USERNAME")

mlflow.set_tracking_uri(dagshub_uri)
dagshub.init(repo_owner=dagshub_username, repo_name=dagshub_repo, mlflow=True)

mlflow.set_experiment("Baseline Experiment")
clear_output()

In [16]:
logging.info("Starting MLFlow run...")
with mlflow.start_run():
    t0 = time.time()

    try:
        mlflow.log_params({"vectorizer": "BoW", "test_size": 0.2, "max_features": 100})
        model = LogisticRegression(max_iter=max_iter)
        mlflow.log_param("model", "LogisticRegression")

        model.fit(X_train, y_train)
        y_hat = model.predict(X_test)

        mlflow.log_metrics(
            {
                "accuracy": accuracy_score(y_test, y_hat),
                "precision": precision_score(y_test, y_hat),
                "recall": recall_score(y_test, y_hat),
                "f1_score": f1_score(y_test, y_hat),
            }
        )

        mlflow.sklearn.log_model(model, "model", input_example=X_train[:5])

    except Exception as e:
        logging.error(f"Unexpected error!: {e}", exc_info=True)
        mlflow.log_param("error", str(e))

    t1 = time.time()
    logging.info(f"Execution time : {t1 - t0:.2f} sec")
logging.info("MLFlow run execution complete\n")
clear_output()