In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Load a labeled dataset
def load_data():
    # Load dataset from Excel file
    data = pd.read_excel('data.xlsx')  # Changed from read_csv to read_excel

    # Inspect the first few rows and columns to ensure it's loaded correctly
    print("Dataset Columns:", data.columns)
    print("Dataset Preview:\n", data.head())

    # Check if required columns exist
    if 'review' not in data.columns or 'sentiment' not in data.columns:
        raise KeyError("The dataset must contain 'text' and 'sentiment' columns.")

    return data

# Step 2: Preprocess the text data and split into train/test sets
def preprocess_and_split(data):
    data=data.dropna(subset=['review','sentiment'])
    X = data['review']  # Features (text data)
    y = data['sentiment']  # Labels (sentiment: positive/negative/neutral)

    # Convert text into a matrix of token counts (Bag of Words approach)
    vectorizer = CountVectorizer()
    X_vectorized = vectorizer.fit_transform(X)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test, vectorizer

# Step 3: Train a sentiment analysis model
def train_model(X_train, y_train):
    model = MultinomialNB()  # Naive Bayes classifier
    model.fit(X_train, y_train)
    return model

# Step 4: Evaluate the model
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Step 5: Predict sentiment on new text
def predict_sentiment(model, vectorizer, new_text):
    new_text_vectorized = vectorizer.transform([new_text])  # Transform new text using the same vectorizer
    prediction = model.predict(new_text_vectorized)
    print(f"\nSentiment Prediction for '{new_text}': {prediction[0]}")

# Main function to execute the workflow
if __name__ == "__main__":
    try:
        # Load and preprocess data
        data = load_data()
        X_train, X_test, y_train, y_test, vectorizer = preprocess_and_split(data)

        # Train the model
        model = train_model(X_train, y_train)

        # Evaluate the model
        evaluate_model(model, X_test, y_test)

        # Predict sentiment for new input
        new_sample_text = "The product was delivered on time and works perfectly."
        predict_sentiment(model, vectorizer, new_sample_text)

    except Exception as e:
        print(f"An error occurred: {e}")

Dataset Columns: Index(['review', 'sentiment'], dtype='object')
Dataset Preview:
                                               review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
Accuracy: 0.8499

Classification Report:
               precision    recall  f1-score   support

    negative       0.83      0.88      0.85      4948
    positive       0.88      0.82      0.85      5052

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000


Sentiment Prediction for 'The product was delivered on time and works perfectly.': positive
