# Movies Sentiment Analysis - NLP Project 1

## 1. Data loading

In [6]:
from datasets import load_dataset
import re
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load data
dataset = load_dataset("imdb")

# Load spaCy
nlp = spacy.load("en_core_web_sm")

# Clean text function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    doc = nlp(text)
    tokens = [token.text for token in doc if not token.is_stop]
    return " ".join(tokens)

# Get balanced training and test data
train_data = dataset["train"].shuffle(seed=42).select(range(1000))  # 1000 mixed
train_texts = train_data["text"]
train_labels = train_data["label"]
test_texts = dataset["test"]["text"][:1000]
test_labels = dataset["test"]["label"][:1000]

# Process text
train_texts_processed = [preprocess_text(text) for text in train_texts]
test_texts_processed = [preprocess_text(text) for text in test_texts]

# Turn into numbers
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_texts_processed)
X_test = vectorizer.transform(test_texts_processed)

# Check labels
print("Unique labels:", set(train_labels))

# Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, train_labels)

# Test model
y_pred = model.predict(X_test)
accuracy = accuracy_score(test_labels, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(test_labels, y_pred, target_names=["Negative", "Positive"]))

# Sample test
sample_text = dataset["test"]["text"][0]
sample_processed = preprocess_text(sample_text)
sample_vector = vectorizer.transform([sample_processed])
sample_pred = model.predict(sample_vector)[0]
print(f"Sample: {sample_text[:100]}...")
print(f"Processed: {sample_processed[:100]}...")
print(f"Prediction: {'Positive' if sample_pred == 1 else 'Negative'}")

  from .autonotebook import tqdm as notebook_tqdm


Unique labels: {0, 1}
Accuracy: 0.7700
              precision    recall  f1-score   support

    Negative       1.00      0.77      0.87      1000
    Positive       0.00      0.00      0.00         0

    accuracy                           0.77      1000
   macro avg       0.50      0.39      0.44      1000
weighted avg       1.00      0.77      0.87      1000

Sample: I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-a...
Processed: love scifi willing lot scifi moviestv usually underfunded underappreciated misunderstood tried like ...
Prediction: Negative


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# 2. Twitter US Airline Sentiment Analysis - NLP 2

In [7]:
import pandas as pd
df = pd.read_csv('Tweets.csv')
display(df)
df.shape

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0000,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0000,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0000,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0000,Can't Tell,1.0000,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14635,569587686496825344,positive,0.3487,,0.0000,American,,KristenReenders,,0,@AmericanAir thank you we got on a different f...,,2015-02-22 12:01:01 -0800,,
14636,569587371693355008,negative,1.0000,Customer Service Issue,1.0000,American,,itsropes,,0,@AmericanAir leaving over 20 minutes Late Flig...,,2015-02-22 11:59:46 -0800,Texas,
14637,569587242672398336,neutral,1.0000,,,American,,sanyabun,,0,@AmericanAir Please bring American Airlines to...,,2015-02-22 11:59:15 -0800,"Nigeria,lagos",
14638,569587188687634433,negative,1.0000,Customer Service Issue,0.6659,American,,SraJackson,,0,"@AmericanAir you have my money, you change my ...",,2015-02-22 11:59:02 -0800,New Jersey,Eastern Time (US & Canada)


(14640, 15)

In [8]:
df = df[['text', 'airline_sentiment']]
display(df)

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative
...,...,...
14635,@AmericanAir thank you we got on a different f...,positive
14636,@AmericanAir leaving over 20 minutes Late Flig...,negative
14637,@AmericanAir Please bring American Airlines to...,neutral
14638,"@AmericanAir you have my money, you change my ...",negative


In [9]:
def preprocess_text(text):
    text = text.lower() # lower the text
    text = re.sub(r"http\S+|www\S+|https\S+", "", text) # removes URLS
    text = re.sub(r"@\w+", "", text) # Remove mentions
    text = re.sub(r"#(\w+)", r"\1", text) # Remove # and keeps the word (\w+)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    doc = nlp(text)
    tokens = [token.text for token in doc if not token.is_stop]
    return " ".join(tokens)

df['text_clean'] = df['text'].apply(preprocess_text)

In [10]:
df['text_clean'].head()

0                                                 said
1           plus ve added commercials experience tacky
2                              nt today mean need trip
3      aggressive blast obnoxious entertainment gue...
4                                        big bad thing
Name: text_clean, dtype: object

In [11]:
df['sentiment_num'] = df['airline_sentiment'].replace({'negative': 0, 'neutral': 1, 'positive':2})

df[['text_clean', 'sentiment_num']].head()

Unnamed: 0,text_clean,sentiment_num
0,said,1
1,plus ve added commercials experience tacky,2
2,nt today mean need trip,1
3,aggressive blast obnoxious entertainment gue...,0
4,big bad thing,0


In [12]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['text_clean'])
y = df['sentiment_num']

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)
# Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Test model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred, target_names=["Negative", "Neutral", "Positive"]))

Accuracy: 0.7930
              precision    recall  f1-score   support

    Negative       0.81      0.95      0.87      1889
     Neutral       0.68      0.45      0.54       580
    Positive       0.82      0.58      0.68       459

    accuracy                           0.79      2928
   macro avg       0.77      0.66      0.70      2928
weighted avg       0.79      0.79      0.78      2928



In [14]:
from catboost import CatBoostClassifier 
from sklearn.metrics import accuracy_score, classification_report
#Train model
model = CatBoostClassifier(iterations = 1000, verbose = 100)
model.fit(X_train, y_train)
#Test Model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred, target_names=["Negative", "Neutral", "Positive"]))

Learning rate set to 0.089706
0:	learn: 1.0547835	total: 76.2ms	remaining: 1m 16s
100:	learn: 0.6930128	total: 1.44s	remaining: 12.8s
200:	learn: 0.6456437	total: 2.85s	remaining: 11.3s
300:	learn: 0.6246667	total: 4.2s	remaining: 9.74s
400:	learn: 0.6083950	total: 5.51s	remaining: 8.24s
500:	learn: 0.5950034	total: 6.84s	remaining: 6.82s
600:	learn: 0.5850121	total: 8.16s	remaining: 5.42s
700:	learn: 0.5751977	total: 9.47s	remaining: 4.04s
800:	learn: 0.5670341	total: 10.8s	remaining: 2.68s
900:	learn: 0.5601770	total: 12.1s	remaining: 1.33s
999:	learn: 0.5532858	total: 13.4s	remaining: 0us
Accuracy: 0.7531
              precision    recall  f1-score   support

    Negative       0.76      0.96      0.85      1889
     Neutral       0.70      0.23      0.35       580
    Positive       0.75      0.56      0.64       459

    accuracy                           0.75      2928
   macro avg       0.73      0.59      0.61      2928
weighted avg       0.74      0.75      0.72      2928



In [15]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

model = XGBClassifier(n_estimators=1000, verbosity=1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred, target_names=["Negative", "Neutral", "Positive"]))

Accuracy: 0.7842
              precision    recall  f1-score   support

    Negative       0.84      0.90      0.87      1889
     Neutral       0.61      0.51      0.56       580
    Positive       0.72      0.64      0.68       459

    accuracy                           0.78      2928
   macro avg       0.72      0.68      0.70      2928
weighted avg       0.77      0.78      0.78      2928

