In [None]:
import math
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Toy corpus: three short sentences about celestial objects
corpus = [
    'the sun is a star',
    'the moon is a satellite',
    'the sun and moon are celestial bodies'
]

# Lowercase every document, then split it on whitespace into tokens
tokenized_corpus = list(map(str.split, map(str.lower, corpus)))
# Vocabulary: the distinct tokens of all documents, in alphabetical order
vocab = sorted(set().union(*tokenized_corpus))

# Compute Document Frequency (DF)
def compute_df(tokenized_corpus, vocab):
    """Count, for every vocabulary word, how many documents contain it.

    Args:
        tokenized_corpus: iterable of documents, each a list of string tokens.
        vocab: iterable of words to report on; its order fixes the key order
            of the result.

    Returns:
        dict mapping each word in ``vocab`` to the number of documents in
        which it appears at least once (0 if it appears in none).
    """
    # One pass over the corpus: each document contributes at most 1 per
    # distinct token. This avoids re-scanning every token list once per
    # vocabulary word.
    doc_counts = Counter()
    for doc in tokenized_corpus:
        doc_counts.update(set(doc))
    # Counter returns 0 for unseen keys, so absent vocabulary words map to 0.
    return {word: doc_counts[word] for word in vocab}

# Compute Inverse Document Frequency (IDF)
def compute_idf(df, N):
    """Compute the unsmoothed inverse document frequency ``ln(N / df)``.

    Args:
        df: dict mapping each word to its document frequency.
        N: total number of documents in the corpus.

    Returns:
        dict mapping each word to ``math.log(N / df[word])``. A word whose
        document frequency is 0 gets an IDF of 0.0.
    """
    idf = {}
    for word, freq in df.items():
        # N / 0 is undefined; a term that occurs in no document is given a
        # weight of 0.0 instead of letting ZeroDivisionError abort the run.
        idf[word] = math.log(N / freq) if freq > 0 else 0.0
    return idf

# Compute Term Frequency (TF) for a document
def compute_tf(doc, vocab):
    """Compute length-normalised term frequencies for one document.

    Args:
        doc: list of string tokens making up the document.
        vocab: iterable of words to report on; its order fixes the key order
            of the result.

    Returns:
        dict mapping each word in ``vocab`` to ``count(word) / len(doc)``.
        For an empty document every frequency is 0.0.
    """
    counts = Counter(doc)
    doc_len = len(doc)
    # An empty document would divide by zero below; it contains no terms, so
    # every frequency is simply 0.
    if doc_len == 0:
        return {word: 0.0 for word in vocab}
    return {word: counts[word] / doc_len for word in vocab}

# Compute TF-IDF for all documents
def compute_tfidf(tokenized_corpus, vocab):
    """Build the TF-IDF matrix of a tokenized corpus.

    Args:
        tokenized_corpus: list of documents, each a list of tokens.
        vocab: ordered list of words; defines the column order.

    Returns:
        Tuple ``(tfidf_matrix, vocab, idf)`` where ``tfidf_matrix`` is a list
        with one row per document (columns in ``vocab`` order), ``vocab`` is
        the list that was passed in, and ``idf`` is the word -> IDF dict.
    """
    n_docs = len(tokenized_corpus)
    idf = compute_idf(compute_df(tokenized_corpus, vocab), n_docs)

    def weigh(tokens):
        # One matrix row: TF of this document scaled by the corpus-wide IDF.
        tf = compute_tf(tokens, vocab)
        return [tf[term] * idf[term] for term in vocab]

    tfidf_matrix = [weigh(tokens) for tokens in tokenized_corpus]
    return tfidf_matrix, vocab, idf

# Run the from-scratch implementation. compute_tfidf returns the vocab it was
# given, so `vocab` is rebound to the same list.
manual_tfidf_matrix, vocab, manual_idf = compute_tfidf(tokenized_corpus, vocab)

# Rows follow corpus order, columns follow `vocab` order; values are rounded
# to 4 decimals for display only.
print("Manual TF-IDF Matrix:")
for row in manual_tfidf_matrix:
    print([round(val, 4) for val in row])
print("Vocabulary:", vocab)
print("Manual IDF:", {w: round(v, 4) for w, v in manual_idf.items()})

# CountVectorizer and TfidfVectorizer Comparison
# NOTE: scikit-learn's default tokenizer keeps only tokens of two or more
# characters, so 'a' is missing from its vocabulary and its columns do not
# line up one-to-one with the manual matrix above.
count_vec = CountVectorizer()
count_matrix = count_vec.fit_transform(corpus)
print("\nCountVectorizer Matrix:")
print(count_matrix.toarray())
print("CountVectorizer Vocabulary:", count_vec.get_feature_names_out())

# norm=None turns off row normalisation. The numbers are still not expected to
# equal the manual ones: scikit-learn uses raw term counts as TF and, with
# smooth_idf=False, idf = ln(N / df) + 1, whereas the manual version uses
# count / document length and ln(N / df).
tfidf_vec = TfidfVectorizer(norm=None, use_idf=True, smooth_idf=False)
tfidf_matrix = tfidf_vec.fit_transform(corpus)
print("\nTfidfVectorizer Matrix (no normalization):")
print(tfidf_matrix.toarray())
print("TfidfVectorizer Vocabulary:", tfidf_vec.get_feature_names_out())


Manual TF-IDF Matrix:
[0.0811, 0.0, 0.0, 0.0, 0.0, 0.0811, 0.0, 0.0, 0.2197, 0.0811, 0.0]
[0.0811, 0.0, 0.0, 0.0, 0.0, 0.0811, 0.0811, 0.2197, 0.0, 0.0, 0.0]
[0.0, 0.1569, 0.1569, 0.1569, 0.1569, 0.0, 0.0579, 0.0, 0.0, 0.0579, 0.0]
Vocabulary: ['a', 'and', 'are', 'bodies', 'celestial', 'is', 'moon', 'satellite', 'star', 'sun', 'the']
Manual IDF: {'a': 0.4055, 'and': 1.0986, 'are': 1.0986, 'bodies': 1.0986, 'celestial': 1.0986, 'is': 0.4055, 'moon': 0.4055, 'satellite': 1.0986, 'star': 1.0986, 'sun': 0.4055, 'the': 0.0}

CountVectorizer Matrix:
[[0 0 0 0 1 0 0 1 1 1]
 [0 0 0 0 1 1 1 0 0 1]
 [1 1 1 1 0 1 0 0 1 1]]
CountVectorizer Vocabulary: ['and' 'are' 'bodies' 'celestial' 'is' 'moon' 'satellite' 'star' 'sun'
 'the']

TfidfVectorizer Matrix (no normalization):
[[0.         0.         0.         0.         1.40546511 0.
  0.         2.09861229 1.40546511 1.        ]
 [0.         0.         0.         0.         1.40546511 1.40546511
  2.09861229 0.         0.         1.        ]
 [2.098

In [1]:
!pip install gensim



In [2]:
import pandas as pd
import numpy as np
import gensim.downloader as api
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fetch the NLTK resources used below: tokenizer models and the stopword list.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab') # tokenizer resource that word_tokenize reported as missing without this download

# Read the spam dataset as latin-1 and keep only columns v1 and v2, which are
# renamed to Label and Message.
df = pd.read_csv("spam.csv", encoding='latin-1')[['v1', 'v2']]
df.columns = ['Label', 'Message']
# Encode the target numerically: spam -> 1, ham -> 0.
df['Label'] = df['Label'].map({'spam': 1, 'ham': 0})

# A set gives constant-time membership tests when filtering tokens.
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Tokenize a message and keep only its informative words.

    The text is lowercased and tokenized with NLTK's ``word_tokenize``;
    tokens that are not purely alphabetic, or that appear in the module-level
    ``stop_words`` set, are dropped.

    Args:
        text: raw message string.

    Returns:
        list of the remaining lowercase tokens, in their original order.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if not token.isalpha() or token in stop_words:
            continue
        kept.append(token)
    return kept

# Load the pretrained "word2vec-google-news-300" vectors through gensim's
# downloader API (imported above as `api`).
print("Loading Word2Vec model (this may take time)...")
w2v_model = api.load("word2vec-google-news-300")

def vectorize_message(message, model):
    """Embed a message as the mean of its known word vectors.

    Args:
        message: raw message string; it is cleaned with ``preprocess``.
        model: word-vector lookup supporting ``word in model``,
            ``model[word]`` and a ``vector_size`` attribute.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model``, or a zero vector of length ``model.vector_size`` when no
        token is found.
    """
    tokens = preprocess(message)
    known = [model[tok] for tok in tokens if tok in model]
    if not known:
        # Nothing to average: fall back to an all-zero embedding.
        return np.zeros(model.vector_size)
    return np.mean(known, axis=0)

# Embed every message; the result is a 2-D array with one row per message.
X_vectors = np.array([vectorize_message(msg, w2v_model) for msg in df['Message']])
y = df['Label'].values

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): this split is not stratified, unlike the airline-sentiment cell
# below; if the spam/ham labels are imbalanced, consider stratify=y — confirm.
X_train, X_test, y_train, y_test = train_test_split(X_vectors, y, test_size=0.2, random_state=42)

# max_iter is raised to 1000 iterations for the solver.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Report plain accuracy on the held-out 20%.
y_pred = clf.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))

def predict_message_class(model, w2v_model, message):
    """Classify a single raw message as ``"spam"`` or ``"ham"``.

    Args:
        model: fitted classifier exposing ``predict``.
        w2v_model: word-vector model handed to ``vectorize_message``.
        message: raw message string.

    Returns:
        ``"spam"`` when the classifier predicts label 1, otherwise ``"ham"``.
    """
    # predict() expects a 2-D input, so the single embedding becomes one row.
    features = vectorize_message(message, w2v_model).reshape(1, -1)
    label = model.predict(features)[0]
    if label == 1:
        return "spam"
    return "ham"

# Example: classify one hand-written message with the fitted classifier.
sample = "You won a free ticket! Reply now!"
print("Predicted Class:", predict_message_class(clf, w2v_model, sample))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Loading Word2Vec model (this may take time)...
Test Accuracy: 0.9417040358744395
Predicted Class: spam


In [3]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import gensim.downloader
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK data (Colab may already have it)
nltk.download('punkt')
# This cell calls nltk.word_tokenize, which needed 'punkt_tab' in the spam
# cell above; fetch it here too so this cell does not rely on that cell
# having run first.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# --- 1. Load the Twitter US Airline Sentiment dataset ---
# NOTE: this rebinds `df`, which held the spam dataset in the previous cell.
df = pd.read_csv('Tweets.csv')
# Show how many tweets fall into each sentiment class.
print(df['airline_sentiment'].value_counts())

# --- 2. Preprocess each tweet ---
# Module-level helpers read by preprocess_tweet below.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_tweet(text):
    """Clean a raw tweet and return its lemmatized content tokens.

    Steps, in order: lowercase; strip URLs, @mentions and #hashtags; strip
    non-ASCII characters; strip ASCII punctuation; collapse whitespace;
    tokenize with NLTK; keep alphabetic, non-stopword tokens longer than one
    character and lemmatize them with the module-level ``lemmatizer``.

    Args:
        text: raw tweet. Non-string values (e.g. ``None`` or the float NaN
            pandas uses for empty cells) are treated as an empty tweet.

    Returns:
        list of lemmatized tokens; empty when nothing survives the cleaning.
    """
    # Missing values have no .lower(); treat them as tweets with no tokens.
    if not isinstance(text, str):
        return []
    text = text.lower()
    # `http\S+` already matches every https URL, so a separate https
    # alternative could never be selected; no anchors are used, so no
    # MULTILINE flag is needed either.
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)
    # Removes every non-ASCII character, which includes emoji.
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = nltk.word_tokenize(text)
    # Stopwords are filtered on the surface form, before lemmatization.
    return [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word.isalpha() and word not in stop_words and len(word) > 1
    ]

# Tokenize every tweet, then drop rows whose token list came back empty and
# renumber the index so it is contiguous again.
df['processed_tokens'] = df['text'].apply(preprocess_tweet)
df = df[df['processed_tokens'].map(len) > 0].reset_index(drop=True)

# --- 3. Download and load the pre-trained Google News Word2Vec model ---
# NOTE: this is the same model name the spam cell above loads; it is loaded
# again here and `w2v_model` is rebound.
print("Downloading Google News Word2Vec model... This may take a few minutes.")
w2v_model = gensim.downloader.load('word2vec-google-news-300')

# --- 4. Convert each tweet to a fixed-length vector (average of word vectors) ---
def tweet_to_vector(tokens, w2v_model, vector_size=None):
    """Embed a token list as the mean of its known word vectors.

    Args:
        tokens: list of string tokens.
        w2v_model: word-vector lookup supporting ``word in w2v_model`` and
            ``w2v_model[word]``.
        vector_size: length of the zero vector returned when no token is in
            the model. When omitted it is taken from
            ``w2v_model.vector_size``, falling back to 300 if the model has
            no such attribute.

    Returns:
        The element-wise mean of the vectors of all tokens found in the
        model, or ``np.zeros(vector_size)`` when none is found.
    """
    vectors = [w2v_model[word] for word in tokens if word in w2v_model]
    if vectors:
        return np.mean(vectors, axis=0)
    if vector_size is None:
        # Follow the model's dimensionality so the fallback has the same
        # length as real embeddings even for models that are not 300-d.
        vector_size = getattr(w2v_model, 'vector_size', 300)
    return np.zeros(vector_size)

# Embed every tweet; each cell of the new column holds one vector.
df['vector'] = df['processed_tokens'].apply(lambda tokens: tweet_to_vector(tokens, w2v_model))

# --- 5. Prepare data for machine learning ---
# Stack the per-row vectors into a single 2-D feature matrix.
X = np.vstack(df['vector'].values)
# Integer-encode the target; this 0/1/2 order is reused by the report and the
# prediction helper further down.
y = df['airline_sentiment'].map({'negative': 0, 'neutral': 1, 'positive': 2}).values

# --- 6. Split dataset into training (80%) and testing (20%) sets ---
# stratify=y keeps the class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- 7. Train Multiclass Logistic Regression classifier ---
print("Training model...")
# The `multi_class` argument is deprecated in recent scikit-learn releases.
# With the lbfgs solver a target with three classes is already fitted as a
# multinomial model, so dropping it keeps the same model without the warning.
clf = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    random_state=42
)
clf.fit(X_train, y_train)

# --- 8. Make predictions and evaluate ---
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nTest Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
# target_names must be listed in the same order as the 0/1/2 label encoding
# (negative, neutral, positive).
print(classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive']))

# --- 9. Prediction function ---
def predict_tweet_sentiment(model, w2v_model, tweet):
    """Predict the sentiment label of one raw tweet.

    Args:
        model: fitted classifier exposing ``predict`` and returning the
            integer codes 0, 1 or 2.
        w2v_model: word-vector model handed to ``tweet_to_vector``.
        tweet: raw tweet text.

    Returns:
        ``'negative'``, ``'neutral'`` or ``'positive'``.
    """
    labels = {0: 'negative', 1: 'neutral', 2: 'positive'}
    tokens = preprocess_tweet(tweet)
    # predict() expects a 2-D input, so the single embedding becomes one row.
    features = tweet_to_vector(tokens, w2v_model).reshape(1, -1)
    return labels[model.predict(features)[0]]

# --- 10. Example predictions ---
# Hand-written tweets covering positive, negative and neutral wording. The
# @mentions and the emoji are removed by preprocess_tweet before embedding.
test_tweets = [
    "@airline Your service was amazing! Best flight ever! 😊",
    "@airline Delayed 3 hours with no explanation. Terrible experience.",
    "@airline Flight was okay, nothing special to report.",
    "Thank you @airline for the upgrade! Great crew and smooth flight!",
    "@airline Why is customer service so bad? Frustrated passenger here.",
    "Just boarded @airline flight. Let's see how this goes."
]

# Print each tweet next to its predicted label, separated by rule lines.
print("\n" + "="*60)
print("TESTING PREDICTION FUNCTION")
print("="*60)
for tweet in test_tweets:
    print(f"Tweet: {tweet}")
    print(f"Predicted Sentiment: {predict_tweet_sentiment(clf, w2v_model, tweet)}")
    print("-" * 60)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


airline_sentiment
negative    9178
neutral     3099
positive    2363
Name: count, dtype: int64
Downloading Google News Word2Vec model... This may take a few minutes.
Training model...





Test Accuracy: 0.7560

Classification Report:
              precision    recall  f1-score   support

    negative       0.80      0.91      0.85      1835
     neutral       0.57      0.40      0.47       615
    positive       0.72      0.60      0.66       472

    accuracy                           0.76      2922
   macro avg       0.70      0.64      0.66      2922
weighted avg       0.74      0.76      0.74      2922


TESTING PREDICTION FUNCTION
Tweet: @airline Your service was amazing! Best flight ever! 😊
Predicted Sentiment: positive
------------------------------------------------------------
Tweet: @airline Delayed 3 hours with no explanation. Terrible experience.
Predicted Sentiment: negative
------------------------------------------------------------
Tweet: @airline Flight was okay, nothing special to report.
Predicted Sentiment: negative
------------------------------------------------------------
Tweet: Thank you @airline for the upgrade! Great crew and smooth flight!
P