In [None]:
import mlflow
import pandas as pd
import mlflow.sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import re
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import numpy as np

#nltk.download('wordnet')

In [None]:
# Load the raw dataset from 'data.csv' (path is relative to the working directory).
df = pd.read_csv('data.csv')
# Preview the first rows as the cell output.
df.head()

## Data Preprocessing
1. Lemmatize
2. Stop words removal
3. Removing numbers
4. Converting to lowercase
5. Removing punctuation
6. Removing URLs

In [None]:
# 1. Lemmatization function
def lemmatize_text(text):
    """Lemmatize every whitespace-separated token of ``text``.

    Each token is passed to ``WordNetLemmatizer.lemmatize`` with its default
    arguments, and the results are joined back together with single spaces.

    Args:
        text: The string to lemmatize.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    wnl = WordNetLemmatizer()
    lemmas = [wnl.lemmatize(token) for token in text.split()]
    return ' '.join(lemmas)

In [None]:
# 2. Stop words removal function
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, building it only on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def remove_stopwords(text):
    """Drop English stop words from ``text``.

    The text is split on whitespace; a token is discarded when its lowercase
    form is in NLTK's English stop-word list. Surviving tokens keep their
    original casing and are joined with single spaces.

    The stop-word set is cached after the first call because this function is
    applied once per row, and rebuilding the set for every row is
    loop-invariant work.

    Args:
        text: The string to filter.

    Returns:
        The remaining tokens joined by single spaces.
    """
    stop_words = _english_stop_words()
    return ' '.join(word for word in text.split() if word.lower() not in stop_words)

In [None]:
# 3. Remove numbers
def remove_numbers(text):
    """Return ``text`` with every digit character removed.

    A character is dropped when ``str.isdigit()`` is true for it; all other
    characters, including whitespace, are kept in their original order.
    """
    kept = []
    for ch in text:
        if ch.isdigit():
            continue
        kept.append(ch)
    return ''.join(kept)

In [None]:
# 4. Convert to lowercase
def to_lowercase(text):
    """Lowercase ``text`` token by token.

    The text is split on whitespace, each token is lowercased, and the tokens
    are joined with single spaces. As a consequence, runs of whitespace
    collapse to one space and leading/trailing whitespace is dropped.
    """
    tokens = text.split()
    return " ".join(map(str.lower, tokens))

In [None]:
# 5. Remove punctuations
def remove_punctuation(text):
    """Replace ASCII punctuation in ``text`` with spaces and collapse whitespace.

    Every character in ``string.punctuation`` (which already includes ';')
    becomes a space, then each run of whitespace is squeezed to a single space.
    A leading or trailing space left behind by the substitution is not
    stripped.

    Args:
        text: The string to clean.

    Returns:
        The text without punctuation, with whitespace runs collapsed.
    """
    # Punctuation becomes a space rather than nothing so that "a,b" stays two tokens.
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    text = re.sub(r'\s+', ' ', text)
    return text

In [None]:
# 6. Remove URLs
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

In [None]:
def normalize_text(df):
    """Clean the ``review`` column of ``df`` in place and return ``df``.

    The cleaning steps run in this order:

    1. ``to_lowercase``       - so the lowercase-only URL pattern also catches
       links written in upper case.
    2. ``remove_urls``        - must run before digits and punctuation are
       stripped; once ``://`` and ``.`` are gone the URL pattern can no longer
       match.
    3. ``remove_numbers``
    4. ``remove_punctuation``
    5. ``remove_stopwords``   - runs after punctuation removal so that tokens
       such as ``"the,"`` are recognised as stop words.
    6. ``lemmatize_text``

    Args:
        df: DataFrame with a string ``review`` column. The column is
            overwritten on the passed-in frame.

    Returns:
        The same DataFrame object, with ``review`` normalized.

    Raises:
        Exception: Any error raised by a cleaning step is printed and then
            re-raised unchanged.
    """
    try:
        steps = (
            to_lowercase,
            remove_urls,
            remove_numbers,
            remove_punctuation,
            remove_stopwords,
            lemmatize_text,
        )
        for step in steps:
            df['review'] = df['review'].apply(step)
        return df
    except Exception as e:
        print(f'Error during text normalization: {e}')
        raise

In [None]:
# Run the cleaning pipeline over the 'review' column and rebind df to the result.
df = normalize_text(df)
# Preview the cleaned reviews.
df.head()

In [None]:
# Show how many rows carry each sentiment label.
df['sentiment'].value_counts()

In [None]:
# Keep only the rows whose label is 'positive' or 'negative'.
x = df['sentiment'].isin(['positive', 'negative'])
# Take an explicit copy: the next cell assigns to df['sentiment'], and assigning
# to a column of a boolean-mask slice triggers pandas' SettingWithCopyWarning.
df = df[x].copy()

In [None]:
# Encode the sentiment labels as binary targets: positive -> 1, negative -> 0.
# The preceding cell keeps only 'positive' and 'negative' rows, so every
# remaining label has an entry in this mapping.
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative':0})
# Preview the encoded frame.
df.head()

In [None]:
# Count missing values per column before building features.
df.isnull().sum()

In [None]:
# Build count-based (bag-of-words) features from the cleaned reviews.
# max_features=100 is also logged to MLflow as 'num_features' in the experiment
# cell; keep the two values in sync when changing it.
vectorizer = CountVectorizer(max_features=100)
# NOTE(review): the vectorizer is fit on the full dataset before the train/test
# split, so the test reviews contribute to the vocabulary - confirm this is intended.
X = vectorizer.fit_transform(df['review'])
# Target vector: the sentiment column.
y = df['sentiment']

In [None]:
# Hold out 25% of the samples for evaluation; random_state=42 fixes the split.
# The same test_size value is logged to MLflow as 'test_size' in the experiment cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

## Add MLflow tracking via DagsHub

In [None]:
import dagshub

# Point MLflow at the DagsHub project. The angle-bracket values are placeholders
# and must be replaced with the real tracking URL, repo owner and repo name.
mlflow.set_tracking_uri('<link to your dagshub project/repo>')
dagshub.init(repo_owner = '<dagshub repo owner>', repo_name = '<dagshub repo name>', mlflow = True)

# Select the experiment under which the baseline run below is recorded.
mlflow.set_experiment('Logistic Regression Experiment (Baseline)')

In [None]:
import logging
import os
import time

In [None]:
# Baseline experiment: train a logistic regression on the bag-of-words features,
# evaluate it on the held-out split, and record parameters and metrics in MLflow.

# Configure logging
logging.basicConfig(level = logging.INFO, format= "%(asctime)s - %(levelname)s - %(message)s")
logging.info('Starting MLFlow experiment...')
with mlflow.start_run():
    # Wall-clock start, used for the duration message near the end of the run.
    start_time = time.time()

    try:
        # These values mirror the vectorizer and split cells; update them
        # together with CountVectorizer(max_features=...) and test_size.
        logging.info('Logging preprocessing parameters...')
        mlflow.log_param('vectorizer', 'Bag of words')
        mlflow.log_param('num_features', 100)
        mlflow.log_param('test_size', 0.25)

        logging.info('Initializing logistic regression model...')
        model = LogisticRegression(max_iter=1000) # Increased max_iter to prevent non-convergence

        logging.info('Training the model...')
        model.fit(X_train, y_train)
        logging.info('Model training completed.')

        logging.info('Logging model parameters...')
        mlflow.log_param('model', 'LogisticRegression')

        logging.info('Making predictions on the test set...')
        y_pred = model.predict(X_test)

        logging.info('Calculating evaluation metrics...')
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        logging.info('Logging evaluation metrics...')
        mlflow.log_metric('accuracy', accuracy)
        mlflow.log_metric('precision', precision)
        mlflow.log_metric('recall', recall)
        mlflow.log_metric('f1_score', f1)

        logging.info('Saving and logging the model...')
        # Model artifact logging is currently disabled; only params and metrics are recorded.
        #mlflow.sklearn.log_model(model, 'Logistic_Regression_Model')

        # Log the duration of the run
        logging.info(f'Model training, logging and evaluation completed in {time.time() - start_time:.2f} seconds')

        # Print out the metrics
        logging.info(f'Accuracy: {accuracy:.4f}')
        logging.info(f'Precision: {precision:.4f}')
        logging.info(f'Recall: {recall:.4f}')
        logging.info(f'F1 Score: {f1:.4f}')
    except Exception as e:
        logging.error(f'An error occured: {e}', exc_info=True)
        # Re-raise so the exception reaches the `with mlflow.start_run()` block:
        # swallowing it here would let the run be closed as if it had succeeded,
        # and the notebook would carry on silently after a failed experiment.
        raise


