In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import joblib
import tkinter as tk

In [2]:
# Step 1: Data Collection
df = pd.read_csv('/content/train.csv', encoding='ISO-8859-1')

In [3]:
# Step 2: Text Preprocessing
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    if isinstance(text, str):  # Check if text is a string
        # Convert text to lowercase
        text = text.lower()
        # Remove special characters
        text = re.sub(r'\W', ' ', text)
        # Tokenization
        tokens = word_tokenize(text)
        # Remove stopwords
        tokens = [word for word in tokens if word not in stop_words]
        return ' '.join(tokens)
    else:
        return ''  # Return empty string if text is not a string

df['clean_text'] = df['text'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
# Step 3: Feature Extraction
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(df['clean_text'])

In [5]:
# Step 4: Model Selection
X_train, X_test, y_train, y_test = train_test_split(X, df['sentiment'], test_size=0.2, random_state=42)
model = MultinomialNB()
model.fit(X_train, y_train)

In [6]:
# Step 5: Model Evaluation
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

    negative       0.80      0.40      0.53      1562
     neutral       0.54      0.82      0.65      2230
    positive       0.74      0.57      0.65      1705

    accuracy                           0.62      5497
   macro avg       0.69      0.60      0.61      5497
weighted avg       0.68      0.62      0.61      5497



In [9]:
# Step 6: Take user input and predict sentiment
def predict_sentiment(input_text):
    clean_input = preprocess_text(input_text)
    X_input = tfidf_vectorizer.transform([clean_input])
    predicted_sentiment = model.predict(X_input)[0]
    return predicted_sentiment

user_input = input("Enter a sentence: ")
predicted_sentiment = predict_sentiment(user_input)
print("Predicted sentiment:", predicted_sentiment)

Enter a sentence: you are the best
Predicted sentiment: positive
