# Twitter Sentiment Analysis using NLP
### Thesis Project — Sentiment140 Dataset

This notebook performs sentiment analysis on Twitter data using the Sentiment140 dataset.
It includes data preprocessing, TF-IDF feature extraction, Logistic Regression model training, evaluation, and visualization.

**Author:** Your Name  
**Date:** 2025-11-08

In [ ]:
!pip install --quiet pandas numpy scikit-learn nltk matplotlib seaborn wordcloud joblib kaggle

In [ ]:
import pandas as pd
import numpy as np
import re, string, os
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
import joblib

import nltk
# Fetch the NLTK resources this notebook relies on: the stop-word list,
# the WordNet data (plus 'omw-1.4') for the lemmatizer, and the Punkt
# tokenizer data needed by nltk.word_tokenize.
# Newer NLTK releases look up 'punkt_tab' for word_tokenize, while older
# releases look up 'punkt', so both are requested. quiet=True suppresses
# the download progress output.
for resource in ('stopwords', 'wordnet', 'omw-1.4', 'punkt', 'punkt_tab'):
    nltk.download(resource, quiet=True)

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Shared lemmatizer instance reused for every token.
lemmatizer = WordNetLemmatizer()

# English stop words held in a set so membership checks are O(1).
stop_words = set(stopwords.words('english'))

# Apply seaborn's 'whitegrid' style to the plots drawn later on.
sns.set(style='whitegrid')

In [ ]:
# Load the Sentiment140 CSV. Column names are supplied explicitly through
# `names`, and the file is decoded as latin-1.
colnames = ['target', 'id', 'date', 'query', 'user', 'text']
csv_path = 'data/sentiment140.csv'
df = pd.read_csv(csv_path, encoding='latin-1', names=colnames)

# Binary label: 1 where target == 4, 0 for every other target value.
# A single vectorized comparison replaces the per-row lambda; the values
# and the int64 dtype are the same.
df['label'] = (df['target'] == 4).astype('int64')
df.head()

In [ ]:
# Patterns compiled once at module level and reused for every tweet.
url_pattern = re.compile(r'https?://\S+|www\.\S+')
mention_pattern = re.compile(r'@\w+')
rt_pattern = re.compile(r'\brt\b')
hashtag_pattern = re.compile(r'#')
digit_pattern = re.compile(r'\d+')

# Translation table built once instead of on every call. Apostrophes are
# deleted so contractions stay a single token ("don't" -> "dont"); every
# other ASCII punctuation character becomes a space so that words joined
# only by punctuation ("great...not") are not fused into one token.
_PUNCT_TABLE = str.maketrans(
    {ch: (None if ch == "'" else ' ') for ch in string.punctuation}
)


def clean_tweet(text):
    """Normalize a raw tweet into a space-separated string of lemmas.

    Steps, in order: lowercase; remove URLs, @mentions and the standalone
    word "rt"; remove '#' characters (the tag word itself is kept);
    replace punctuation (apostrophes are deleted, all other ASCII
    punctuation becomes a space); delete digit runs; tokenize with
    nltk.word_tokenize; drop stop words and one-character tokens;
    lemmatize the remaining tokens.

    Args:
        text: The raw tweet. Any non-string value yields ''.

    Returns:
        The cleaned tokens joined by single spaces; may be ''.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = url_pattern.sub('', text)
    text = mention_pattern.sub('', text)
    text = rt_pattern.sub('', text)
    text = hashtag_pattern.sub('', text)
    text = text.translate(_PUNCT_TABLE)
    text = digit_pattern.sub('', text)
    tokens = nltk.word_tokenize(text)
    tokens = [
        lemmatizer.lemmatize(t)
        for t in tokens
        if t not in stop_words and len(t) > 1
    ]
    return ' '.join(tokens)

# Apply clean_tweet to the raw column directly. Casting with astype(str)
# first would turn missing values into the literal text 'nan', which
# would slip past clean_tweet's non-string guard and survive as a token;
# without the cast, missing values become ''.
df['clean_text'] = df['text'].apply(clean_tweet)
df[['text', 'clean_text']].head()

In [ ]:
# Features are the cleaned tweets; targets are the binary labels.
X, y = df['clean_text'], df['label']

# Hold out 20% of the rows for testing, stratified on the label, with a
# fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# TF-IDF over unigrams and bigrams. The vocabulary is fitted on the
# training split only and then reused to transform the test split.
vectorizer = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.9,
    sublinear_tf=True,
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# L2-regularized logistic regression trained with the SAGA solver.
clf = LogisticRegression(
    solver='saga',
    penalty='l2',
    C=1.0,
    max_iter=200,
    n_jobs=-1,
    class_weight='balanced',
    random_state=42,
)
clf.fit(X_train_tfidf, y_train)

In [ ]:
# Predict labels for the held-out split.
y_pred = clf.predict(X_test_tfidf)

# Print each scalar metric as "<Name>: <value>", in a fixed order.
for metric_name, metric_fn in (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1:', f1_score),
):
    print(metric_name, metric_fn(y_test, y_pred))

# Per-class precision/recall/F1 breakdown.
report = classification_report(y_test, y_pred)
print('\nClassification Report:\n', report)

In [ ]:
# Confusion matrix of the test-set predictions, drawn as an annotated
# heatmap with integer cell counts.
cm = confusion_matrix(y_test, y_pred)
class_names = ['neg', 'pos']

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

In [ ]:
# Persist the fitted classifier and the fitted TF-IDF vectorizer to the
# working directory; both are needed to score new text later.
joblib.dump(value=clf, filename='sentiment_model.joblib')
joblib.dump(value=vectorizer, filename='tfidf_vectorizer.joblib')

In [ ]:
def predict_sentiment(text):
    """Classify one piece of text with the fitted vectorizer and model.

    The text is cleaned with clean_tweet, vectorized with the module-level
    `vectorizer`, and scored with the module-level `clf`.

    Args:
        text: The raw text to classify.

    Returns:
        A ``(label, probability)`` tuple: label is 'Positive' when the
        predicted class is 1 and 'Negative' otherwise; probability is the
        largest class probability, rounded to 3 decimals.
    """
    features = vectorizer.transform([clean_tweet(text)])
    predicted_class = clf.predict(features)[0]
    confidence = clf.predict_proba(features).max()
    if predicted_class == 1:
        sentiment = 'Positive'
    else:
        sentiment = 'Negative'
    return sentiment, round(confidence, 3)

# A few hand-written examples for a quick check of the trained model.
samples = [
    'I love this new phone!',
    'Worst service ever!',
    'It is okay, not great.',
]

# Classify each example in order and print "<text> → <label> (<prob>)".
predictions = map(predict_sentiment, samples)
for tweet, (sentiment, confidence) in zip(samples, predictions):
    print(f'{tweet} → {sentiment} ({confidence})')