In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
import joblib
import re
import string
import mlflow

print("Libraries imported.")

Libraries imported.


In [2]:
# Simulate loading data
data = {
    'text': [
        "This product is amazing! Highly recommend.",
        "Very disappointed with the quality.",
        "Works okay, but not great.",
        "Excellent customer service, resolved my issue quickly.",
        "The app is buggy and crashes frequently.",
        "I love the new features!",
        "It's terrible, don't buy it.",
        "Average experience, nothing special.",
        "Best purchase I've made this year!",
        "Waste of money and time."
    ],
    'sentiment': ['positive', 'negative', 'neutral', 'positive', 'negative', 'positive', 'negative', 'neutral', 'positive', 'negative']
}
df = pd.DataFrame(data)

# Save the raw data (simulate data source versioning with DVC later)
# Create a 'data' directory if it doesn't exist
import os
if not os.path.exists('data'):
    os.makedirs('data')
df.to_csv('data/raw_feedback.csv', index=False)

print("Data loaded and saved to data/raw_feedback.csv")
print(df.head())

Data loaded and saved to data/raw_feedback.csv
                                                text sentiment
0         This product is amazing! Highly recommend.  positive
1                Very disappointed with the quality.  negative
2                         Works okay, but not great.   neutral
3  Excellent customer service, resolved my issue ...  positive
4           The app is buggy and crashes frequently.  negative


In [3]:
def preprocess_text(text):
    """Basic text cleaning: lowercase, remove punctuation, remove extra whitespace."""
    if not isinstance(text, str):
        return "" # Handle potential non-string data gracefully
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply preprocessing to the dataframe (for EDA/training prep)
df['cleaned_text'] = df['text'].apply(preprocess_text)
print("\nPreprocessed Data Sample:")
print(df[['text', 'cleaned_text']].head())


Preprocessed Data Sample:
                                                text  \
0         This product is amazing! Highly recommend.   
1                Very disappointed with the quality.   
2                         Works okay, but not great.   
3  Excellent customer service, resolved my issue ...   
4           The app is buggy and crashes frequently.   

                                        cleaned_text  
0           this product is amazing highly recommend  
1                 very disappointed with the quality  
2                           works okay but not great  
3  excellent customer service resolved my issue q...  
4            the app is buggy and crashes frequently  


In [4]:
X = df['cleaned_text']
y = df['sentiment']

# Split data for training and evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

print(f"\nTraining set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")


Training set size: 7
Test set size: 3


In [5]:
# MLflow Tracking Setup
mlflow.set_experiment("Sentiment Analysis Dev")

with mlflow.start_run(run_name="LogisticRegression_TFIDF") as run:
    # Define model pipeline: TF-IDF Vectorizer + Logistic Regression
    model_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('clf', LogisticRegression(solver='liblinear', random_state=42)) # liblinear is good for small datasets
    ])

    # Train the model
    model_pipeline.fit(X_train, y_train)
    print("\nModel training complete.")

    # Evaluate the model
    y_pred = model_pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\nTest Set Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Log parameters, metrics, and the model with MLflow
    mlflow.log_param("model_type", "LogisticRegression")
    mlflow.log_param("vectorizer", "TfidfVectorizer")
    mlflow.log_param("random_state", 42)
    mlflow.log_metric("accuracy", accuracy)
    # Log the classification report as a text file artifact
    report = classification_report(y_test, y_pred)
    with open("classification_report.txt", "w") as f:
        f.write(report)
    mlflow.log_artifact("classification_report.txt")
    os.remove("classification_report.txt") # Clean up the local file

    # Log the scikit-learn pipeline model
    mlflow.sklearn.log_model(model_pipeline, "sentiment-model")

    print(f"\nMLflow Run ID: {run.info.run_id}")
    print("Model, parameters, and metrics logged to MLflow.")

# Keep track of the best run ID (in this simple case, it's the only run)
best_run_id = run.info.run_id

2025/04/13 17:04:56 INFO mlflow.tracking.fluent: Experiment with name 'Sentiment Analysis Dev' does not exist. Creating a new experiment.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Model training complete.

Test Set Accuracy: 0.3333
Classification Report:
              precision    recall  f1-score   support

    negative       0.33      1.00      0.50         1
     neutral       0.00      0.00      0.00         1
    positive       0.00      0.00      0.00         1

    accuracy                           0.33         3
   macro avg       0.11      0.33      0.17         3
weighted avg       0.11      0.33      0.17         3






MLflow Run ID: acfc159c19df49ea95b18e1b49af50d0
Model, parameters, and metrics logged to MLflow.


In [6]:
# Create a 'models' directory if it doesn't exist
if not os.path.exists('models'):
    os.makedirs('models')

# Save the trained pipeline locally using joblib
model_filename = 'models/sentiment_pipeline.joblib'
joblib.dump(model_pipeline, model_filename)

print(f"\nModel pipeline saved locally to {model_filename}")

# (Optional but good practice) Load the model back to verify
loaded_pipeline = joblib.load(model_filename)
print("Model loaded successfully for verification.")
# Test with a sample prediction
sample_text = "This is a fantastic service!"
cleaned_sample = preprocess_text(sample_text)
prediction = loaded_pipeline.predict([cleaned_sample])
print(f"Prediction for '{sample_text}': {prediction[0]}")


Model pipeline saved locally to models/sentiment_pipeline.joblib
Model loaded successfully for verification.
Prediction for 'This is a fantastic service!': positive
