# Sentiment Analysis -NLP Project 1

## 1. Data loading

In [19]:
from datasets import load_dataset
import re
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load data
dataset = load_dataset("imdb")

# Load spaCy
nlp = spacy.load("en_core_web_sm")

# Clean text function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    doc = nlp(text)
    tokens = [token.text for token in doc if not token.is_stop]
    return " ".join(tokens)

# Get balanced training and test data
train_data = dataset["train"].shuffle(seed=42).select(range(1000))  # 1000 mixed
train_texts = train_data["text"]
train_labels = train_data["label"]
test_texts = dataset["test"]["text"][:1000]
test_labels = dataset["test"]["label"][:1000]

# Process text
train_texts_processed = [preprocess_text(text) for text in train_texts]
test_texts_processed = [preprocess_text(text) for text in test_texts]

# Turn into numbers
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_texts_processed)
X_test = vectorizer.transform(test_texts_processed)

# Check labels
print("Unique labels:", set(train_labels))

# Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, train_labels)

# Test model
y_pred = model.predict(X_test)
accuracy = accuracy_score(test_labels, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(test_labels, y_pred, target_names=["Negative", "Positive"]))

# Sample test
sample_text = dataset["test"]["text"][0]
sample_processed = preprocess_text(sample_text)
sample_vector = vectorizer.transform([sample_processed])
sample_pred = model.predict(sample_vector)[0]
print(f"Sample: {sample_text[:100]}...")
print(f"Processed: {sample_processed[:100]}...")
print(f"Prediction: {'Positive' if sample_pred == 1 else 'Negative'}")

Unique labels: {0, 1}
Accuracy: 0.7700
              precision    recall  f1-score   support

    Negative       1.00      0.77      0.87      1000
    Positive       0.00      0.00      0.00         0

    accuracy                           0.77      1000
   macro avg       0.50      0.39      0.44      1000
weighted avg       1.00      0.77      0.87      1000

Sample: I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-a...
Processed: love scifi willing lot scifi moviestv usually underfunded underappreciated misunderstood tried like ...
Prediction: Negative


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
