In [1]:
import string
from datetime import timedelta

import matplotlib.pyplot as plt
import mlflow
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from prefect import task, flow, Flow
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from wordcloud import WordCloud, STOPWORDS


In [36]:
def load_data(file_path):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_path
        Path or file-like object holding the CSV data; it is forwarded
        unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV.
    """
    frame = pd.read_csv(file_path)
    return frame

In [37]:
def clean_data(df):
    """Return ``df`` without the rows that contain a missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified; ``dropna`` produces a new frame.

    Returns
    -------
    pandas.DataFrame
        Only the rows of ``df`` in which every column is populated.
    """
    # A row is discarded as soon as any one of its columns is missing.
    return df.dropna()

In [38]:
def split_data(X, y, test_size=0.25, random_state=0):
    """Partition features and labels into train and test subsets.

    Parameters
    ----------
    X
        Feature collection to split.
    y
        Labels aligned with ``X``.
    test_size : float, default 0.25
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int, default 0
        Passed to ``train_test_split`` as ``random_state`` so the shuffle
        is repeatable.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    pieces = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
    )
    # Unpack explicitly so the result is always a 4-tuple in a fixed order.
    features_train, features_test, labels_train, labels_test = pieces
    return features_train, features_test, labels_train, labels_test

In [39]:
def vectorize_text(X_train, X_test):
    """Encode raw text as token-count matrices.

    The vocabulary is learned from ``X_train`` only; ``X_test`` is then
    encoded with that same fitted vectorizer.

    Parameters
    ----------
    X_train
        Training documents used to fit the ``CountVectorizer``.
    X_test
        Held-out documents, transformed but never used for fitting.

    Returns
    -------
    tuple
        ``(X_train_vectorized, X_test_vectorized)``.

    Notes
    -----
    The fitted vectorizer is local to this function and is not returned.
    """
    bag_of_words = CountVectorizer()
    train_counts = bag_of_words.fit_transform(X_train)
    test_counts = bag_of_words.transform(X_test)
    return train_counts, test_counts

In [40]:
def train_pipeline(X_train, y_train, alphas=(1, 10), scoring='f1', cv=5):
    """Grid-search a Multinomial Naive Bayes pipeline and return the best fit.

    Parameters
    ----------
    X_train
        Training feature matrix handed to ``GridSearchCV.fit``.
    y_train
        Training labels handed to ``GridSearchCV.fit``.
    alphas : sequence of numbers, default (1, 10)
        Candidate values for the ``alpha`` parameter of ``MultinomialNB``.
    scoring : str, default 'f1'
        Single scikit-learn scorer name used to rank the candidates.
        NOTE(review): 'f1' is the binary-average F1 scorer. If ``y_train``
        holds more than two classes, pass e.g. 'f1_macro' or 'f1_weighted'
        instead -- confirm the label cardinality with the caller.
    cv : int, default 5
        Passed to ``GridSearchCV`` as ``cv``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        ``best_estimator_`` of the finished grid search.
    """
    nb_pipeline = Pipeline([
        ('nb', MultinomialNB())
    ])

    # The 'nb__' prefix routes the values to the pipeline step named 'nb'.
    parameter_grid = [{'nb__alpha': list(alphas)}]

    search = GridSearchCV(
        estimator=nb_pipeline,
        param_grid=parameter_grid,
        scoring=scoring,
        cv=cv,
        return_train_score=True,
        verbose=1
    )
    search.fit(X_train, y_train)

    return search.best_estimator_

In [35]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from prefect import task, Flow
import warnings

# Ignore all warnings: with no category given, this filter applies to every
# warning class and stays active for the rest of the session.
# NOTE(review): this may also hide warnings scikit-learn emits during the grid
# search (e.g. about failed fits or undefined metrics) -- consider narrowing
# the filter to specific categories.
warnings.filterwarnings("ignore")

@task(name="load_data_task")
def load_data(file_path):
    """Prefect task: read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_path
        Path or file-like object holding the CSV data; it is forwarded
        unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV.
    """
    frame = pd.read_csv(file_path)
    return frame

@task(name="clean_data_task")
def clean_data(df):
    """Prefect task: return ``df`` without rows that contain a missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified; ``dropna`` produces a new frame.

    Returns
    -------
    pandas.DataFrame
        Only the rows of ``df`` in which every column is populated.
    """
    # A row is discarded as soon as any one of its columns is missing.
    return df.dropna()

@task(name="split_data_task")
def split_data(X, y, test_size=0.25, random_state=0):
    """Prefect task: partition features and labels into train/test subsets.

    Parameters
    ----------
    X
        Feature collection to split.
    y
        Labels aligned with ``X``.
    test_size : float, default 0.25
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int, default 0
        Passed to ``train_test_split`` as ``random_state`` so the shuffle
        is repeatable.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    pieces = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
    )
    # Unpack explicitly so the result is always a 4-tuple in a fixed order.
    features_train, features_test, labels_train, labels_test = pieces
    return features_train, features_test, labels_train, labels_test

@task(name="vectorize_text_task")
def vectorize_text(X_train, X_test):
    """Prefect task: encode raw text as token-count matrices.

    The vocabulary is learned from ``X_train`` only; ``X_test`` is then
    encoded with that same fitted vectorizer.

    Parameters
    ----------
    X_train
        Training documents used to fit the ``CountVectorizer``.
    X_test
        Held-out documents, transformed but never used for fitting.

    Returns
    -------
    tuple
        ``(X_train_vectorized, X_test_vectorized)``.

    Notes
    -----
    The fitted vectorizer is local to this function and is not returned.
    """
    bag_of_words = CountVectorizer()
    train_counts = bag_of_words.fit_transform(X_train)
    test_counts = bag_of_words.transform(X_test)
    return train_counts, test_counts

@task(name="train_pipeline_task")
def train_pipeline(X_train, y_train, alphas=(1, 10), scoring='f1', cv=5):
    """Prefect task: grid-search a Multinomial Naive Bayes pipeline.

    Parameters
    ----------
    X_train
        Training feature matrix handed to ``GridSearchCV.fit``.
    y_train
        Training labels handed to ``GridSearchCV.fit``.
    alphas : sequence of numbers, default (1, 10)
        Candidate values for the ``alpha`` parameter of ``MultinomialNB``.
    scoring : str, default 'f1'
        Single scikit-learn scorer name used to rank the candidates.
        NOTE(review): 'f1' is the binary-average F1 scorer. If ``y_train``
        holds more than two classes, pass e.g. 'f1_macro' or 'f1_weighted'
        instead -- confirm the label cardinality with the caller.
    cv : int, default 5
        Passed to ``GridSearchCV`` as ``cv``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        ``best_estimator_`` of the finished grid search.
    """
    nb_pipeline = Pipeline([
        ('nb', MultinomialNB())
    ])

    # The 'nb__' prefix routes the values to the pipeline step named 'nb'.
    parameter_grid = [{'nb__alpha': list(alphas)}]

    search = GridSearchCV(
        estimator=nb_pipeline,
        param_grid=parameter_grid,
        scoring=scoring,
        cv=cv,
        return_train_score=True,
        verbose=1
    )
    search.fit(X_train, y_train)

    return search.best_estimator_

@flow(name="NB traning flow")
def sentiment_analysis_workflow(data_path, text_column='Review text', label_column='Ratings'):
    """Prefect flow: load a CSV, clean it, and train a Naive Bayes classifier.

    Steps, in order: ``load_data`` -> ``clean_data`` -> ``split_data`` ->
    ``vectorize_text`` -> ``train_pipeline``.

    Parameters
    ----------
    data_path
        Location of the CSV file, forwarded to ``load_data``.
    text_column : str, default 'Review text'
        Name of the column holding the documents to vectorize.
    label_column : str, default 'Ratings'
        Name of the column holding the target labels.

    Returns
    -------
    The best estimator returned by ``train_pipeline``.

    Raises
    ------
    KeyError
        If ``text_column`` or ``label_column`` is absent from the loaded data.

    Notes
    -----
    The held-out split is produced but not evaluated here, and the fitted
    vectorizer is not part of the returned estimator.
    """
    # Load data
    df = load_data(data_path)

    # Clean the data
    df_cleaned = clean_data(df)

    # Fail early, naming the offending column(s), instead of a bare KeyError
    # from the column lookup further down.
    missing = [col for col in (text_column, label_column) if col not in df_cleaned.columns]
    if missing:
        raise KeyError(f"Column(s) not found in {data_path!r}: {missing}")

    # Split the data into train and test sets (test labels are unused here)
    X_train, X_test, y_train, _ = split_data(df_cleaned[text_column], df_cleaned[label_column])

    # Vectorize the text data; only the training matrix is needed for fitting
    X_train_vectorized, _ = vectorize_text(X_train, X_test)

    # Training the pipeline
    best_estimator = train_pipeline(X_train_vectorized, y_train)

    return best_estimator

# Entry point: run the training flow on the CSV named "data.csv" (a relative path).
if __name__ == "__main__":
    sentiment_analysis_workflow("data.csv")


Fitting 5 folds for each of 2 candidates, totalling 10 fits
