In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the CSV file
data = pd.read_csv(r"C:\Users\Sanjay\OneDrive\Documents\eng_dataset2.csv")

# Display the first few rows of the data
print(data.head())

# Check for missing data and drop rows where 'content' or 'sentiment' is empty
data = data.dropna(subset=['content', 'sentiment'])

# Preprocess the text data
X = data['content']  # The text data (sentences)
y = data['sentiment']  # The sentiment labels

# Convert the text data into a Bag of Words model using CountVectorizer
vectorizer = CountVectorizer(stop_words='english')
X_vectorized = vectorizer.fit_transform(X)

# Split the dataset into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

# Initialize the Logistic Regression model
logistic_model = LogisticRegression(max_iter=1000)

# Train the model
logistic_model.fit(X_train, y_train)

# Predict sentiments on the test set
y_pred = logistic_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Display classification report (precision, recall, F1-score)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Optional: Display the first 5 sentences, actual vs predicted sentiments
for i in range(10):
    print(f"Sentence: {data['content'].iloc[i]}")
    print(f"Actual Sentiment: {data['sentiment'].iloc[i]}, Predicted Sentiment: {y_pred[i]}\n")


      ID sentiment                                            content
0  10941     anger  At the point today where if someone says somet...
1  10942     anger  @CorningFootball  IT'S GAME DAY!!!!      T MIN...
2  10943     anger  This game has pissed me off more than any othe...
3  10944     anger  @spamvicious I've just found out it's Candice ...
4  10945     anger  @moocowward @mrsajhargreaves @Melly77 @GaryBar...
Accuracy: 0.88
Classification Report:
               precision    recall  f1-score   support

       anger       0.90      0.87      0.88       375
        fear       0.82      0.92      0.87       434
         joy       0.96      0.92      0.94       319
     sadness       0.85      0.77      0.81       293

    accuracy                           0.88      1421
   macro avg       0.88      0.87      0.88      1421
weighted avg       0.88      0.88      0.88      1421

Sentence: At the point today where if someone says something remotely kind to me, a waterfall will burst o