# Movies Sentiment Analysis - NLP Project 1

## 1. Data loading

In [None]:
from datasets import load_dataset
import re
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load data
dataset = load_dataset("imdb")

# Load spaCy
nlp = spacy.load("en_core_web_sm")

# Clean text function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    doc = nlp(text)
    tokens = [token.text for token in doc if not token.is_stop]
    return " ".join(tokens)

# Get balanced training and test data
train_data = dataset["train"].shuffle(seed=42).select(range(1000))  # 1000 mixed
train_texts = train_data["text"]
train_labels = train_data["label"]
test_texts = dataset["test"]["text"][:1000]
test_labels = dataset["test"]["label"][:1000]

# Process text
train_texts_processed = [preprocess_text(text) for text in train_texts]
test_texts_processed = [preprocess_text(text) for text in test_texts]

# Turn into numbers
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_texts_processed)
X_test = vectorizer.transform(test_texts_processed)

# Check labels
print("Unique labels:", set(train_labels))

# Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, train_labels)

# Test model
y_pred = model.predict(X_test)
accuracy = accuracy_score(test_labels, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(test_labels, y_pred, target_names=["Negative", "Positive"]))

# Sample test
sample_text = dataset["test"]["text"][0]
sample_processed = preprocess_text(sample_text)
sample_vector = vectorizer.transform([sample_processed])
sample_pred = model.predict(sample_vector)[0]
print(f"Sample: {sample_text[:100]}...")
print(f"Processed: {sample_processed[:100]}...")
print(f"Prediction: {'Positive' if sample_pred == 1 else 'Negative'}")

# 2. Twitter US Airline Sentiment Analysis - NLP 2

In [None]:
import pandas as pd
df = pd.read_csv('Tweets.csv')
display(df)
df.shape

In [None]:
df = df[['text', 'airline_sentiment']]
display(df)

In [None]:
def preprocess_text(text):
    text = text.lower() # lower the text
    text = re.sub(r"http\S+|www\S+|https\S+", "", text) # removes URLS
    text = re.sub(r"@\w+", "", text) # Remove mentions
    text = re.sub(r"#(\w+)", r"\1", text) # Remove # and keeps the word (\w+)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    doc = nlp(text)
    tokens = [token.text for token in doc if not token.is_stop]
    return " ".join(tokens)

df['text_clean'] = df['text'].apply(preprocess_text)

In [None]:
df['text_clean'].head()

In [None]:
df['sentiment_num'] = df['airline_sentiment'].replace({'negative': 0, 'neutral': 1, 'positive':2})

df[['text_clean', 'sentiment_num']].head()

In [None]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['text_clean'])
y = df['sentiment_num']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)
# Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Test model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred, target_names=["Negative", "Neutral", "Positive"]))

In [None]:
from catboost import CatBoostClassifier 
from sklearn.metrics import accuracy_score, classification_report
#Train model
model = CatBoostClassifier(iterations = 1000, verbose = 100)
model.fit(X_train, y_train)
#Test Model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred, target_names=["Negative", "Neutral", "Positive"]))

In [22]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Define the Model
model = XGBClassifier(objective='multi:softprob', eval_metric='mlogloss')

#Set up parameter grid to test
param_grid = {
    'n_estimators': [500],  #Test 500 or 1000 trees
    'learning_rate': [0.01, 0.1], #slower or moderate learning
    'max_depth': [3,6],  #shallow or deeper trees
    'min_child_weight': [1,3], #light or havier branches
    'subsample': [0.8, 1.0], #use 80% or all data
    'colsample_bytree': [0.8,1.0] # use 80% or all features
}

#Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3, # 3-fold cross validation
    scoring='accuracy', #optimize for accuracy
    verbose=1, # show progress
    n_jobs=-1 # use all CPU cores
)

# Fit it
grid_search.fit(X_train,y_train)
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Test it
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred, target_names=["Negative", "Neutral", "Positive"]))

Fitting 3 folds for each of 32 candidates, totalling 96 fits
Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 6, 'min_child_weight': 1, 'n_estimators': 500, 'subsample': 0.8}
Accuracy: 0.7807
              precision    recall  f1-score   support

    Negative       0.83      0.91      0.86      1889
     Neutral       0.62      0.50      0.55       580
    Positive       0.74      0.62      0.67       459

    accuracy                           0.78      2928
   macro avg       0.73      0.67      0.70      2928
weighted avg       0.77      0.78      0.77      2928

