In [1]:
pip install pandas scikit-learn nltk matplotlib seaborn



In [5]:
# ======================================
# SENTIMENT ANALYSIS – HINDI NLP (FIXED)
# ======================================

import pandas as pd
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

nltk.download('stopwords')

# -------------------------------
# 1. DATASET (INCREASED)
# -------------------------------
data = {
    "text": [
        "यह फिल्म बहुत अच्छी है",
        "यह मूवी शानदार है",
        "मुझे यह बहुत पसंद है",
        "यह एक बेहतरीन अनुभव था",
        "यह उत्पाद अच्छा है",

        "यह फिल्म बेकार है",
        "सेवा बहुत खराब थी",
        "मुझे यह बिल्कुल पसंद नहीं आया",
        "समय की बर्बादी है",
        "यह मोबाइल फोन खराब है",

        "यह ठीक है",
        "औसत अनुभव था",
        "कुछ खास नहीं लगा"
    ],
    "sentiment": [
        "positive","positive","positive","positive","positive",
        "negative","negative","negative","negative","negative",
        "neutral","neutral","neutral"
    ]
}

df = pd.DataFrame(data)

# -------------------------------
# 2. TEXT PREPROCESSING
# -------------------------------
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\u0900-\u097F\s]', '', text)
    return text.strip()

df["clean_text"] = df["text"].apply(clean_text)

# -------------------------------
# 3. TF-IDF FEATURES
# -------------------------------
vectorizer = TfidfVectorizer(
    ngram_range=(1,2),
    max_features=500
)

X = vectorizer.fit_transform(df["clean_text"])
y = df["sentiment"]

# -------------------------------
# 4. STRATIFIED SPLIT (IMPORTANT)
# -------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y
)

# -------------------------------
# 5. MODEL TRAINING
# -------------------------------
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# -------------------------------
# 6. EVALUATION (NO WARNINGS)
# -------------------------------
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

print("\nClassification Report:\n")
print(classification_report(
    y_test,
    y_pred,
    zero_division=0
))

# -------------------------------
# 7. CONFUSION MATRIX
# -------------------------------
print("Confusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))

# -------------------------------
# 8. SINGLE SENTENCE TEST
# -------------------------------
test_sentence = "यह फोन बहुत अच्छा है"
test_vector = vectorizer.transform([clean_text(test_sentence)])
print("\nTest Sentence:", test_sentence)
print("Predicted Sentiment:", model.predict(test_vector)[0])

Accuracy: 0.25

Classification Report:

              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         2
     neutral       0.00      0.00      0.00         1
    positive       0.25      1.00      0.40         1

    accuracy                           0.25         4
   macro avg       0.08      0.33      0.13         4
weighted avg       0.06      0.25      0.10         4

Confusion Matrix:

[[0 0 2]
 [0 0 1]
 [0 0 1]]

Test Sentence: यह फोन बहुत अच्छा है
Predicted Sentiment: positive


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
