In [None]:

import re
import string
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from gensim.models import KeyedVectors
import nltk
from nltk.stem import WordNetLemmatizer

# Download NLTK resources
# 'wordnet' is the corpus data for the WordNetLemmatizer imported above.
# NOTE(review): 'omw-1.4' (Open Multilingual WordNet) is presumably needed by
# the installed NLTK version's WordNet reader -- confirm it is still required.
nltk.download('wordnet')
nltk.download('omw-1.4')


In [None]:

# Contraction map
# Maps a lower-case contraction (ASCII apostrophe) to its expanded form.
# Keys must stay lower-case: expand_contractions lower-cases each match before
# looking it up here.
contraction_map = {
    # negations
    "don't": "do not", "doesn't": "does not", "can't": "cannot",
    "isn't": "is not", "aren't": "are not", "wasn't": "was not", "weren't": "were not",
    "won't": "will not", "wouldn't": "would not", "couldn't": "could not", "shouldn't": "should not",
    "didn't": "did not", "hasn't": "has not", "haven't": "have not",
    "hadn't": "had not", "mustn't": "must not",
    # "to be"
    "i'm": "i am", "you're": "you are", "we're": "we are", "they're": "they are",
    # 's is expanded as "is" throughout (it may also mean "has"; kept consistent
    # with the existing "it's" entry)
    "it's": "it is", "he's": "he is", "she's": "she is",
    "that's": "that is", "there's": "there is", "here's": "here is",
    "who's": "who is", "what's": "what is", "where's": "where is", "how's": "how is",
    "let's": "let us",
    # "have"
    "i've": "i have", "you've": "you have", "we've": "we have", "they've": "they have",
    "could've": "could have", "would've": "would have", "should've": "should have",
    "might've": "might have", "must've": "must have",
    # "will"
    "i'll": "i will", "you'll": "you will", "we'll": "we will", "they'll": "they will",
    "he'll": "he will", "she'll": "she will", "it'll": "it will",
    # "would"
    "i'd": "i would", "you'd": "you would", "he'd": "he would", "she'd": "she would",
    "they'd": "they would", "we'd": "we would",
}

# Basic stopwords
# Tokens in this set are dropped by preprocess_tweet before lemmatisation.
# NOTE(review): the set contains the negations "no", "nor" and "not", so the
# "not" produced by expanding e.g. "don't" is discarded as well -- confirm
# that is intended for a sentiment task.
stop_words = set(
    """
    i me my myself we our ours ourselves you
    your yours yourself yourselves he him his
    himself she her hers herself it its itself
    they them their theirs themselves what which
    who whom this that these those am is are
    was were be been being have has had having
    do does did doing a an the and but if
    or because as until while of at by for
    with about against between into through during
    before after above below to from up down
    in out on off over under again further
    then once here there when where why how
    all any both each few more most other
    some such no nor not only own same so
    than too very can will just don should now
    """.split()
)


In [None]:

# Single shared lemmatizer instance; preprocess_tweet calls
# lemmatizer.lemmatize() on every token it keeps.
lemmatizer = WordNetLemmatizer()

def expand_contractions(text):
    """Replace every contraction listed in ``contraction_map`` with its expansion.

    Matching is case-insensitive and only whole tokens are replaced, so a key
    is never expanded inside a longer word (e.g. "scan't" is left alone).
    Both the ASCII apostrophe (') and the typographic one (U+2019) are
    accepted in the input.  The replacement is the map's value verbatim, so
    the casing of the matched contraction is not preserved.

    Args:
        text: The string to process.

    Returns:
        ``text`` with all known contractions expanded; unchanged if
        ``contraction_map`` is empty.
    """
    # An empty alternation would match the empty string everywhere and the
    # lookup below would fail with KeyError('').
    if not contraction_map:
        return text

    # Either apostrophe form may appear where a key has "'".  Each key is split
    # on the apostrophe and the pieces escaped separately, so the result does
    # not depend on whether re.escape() escapes "'" in this Python version.
    apostrophe = "['\u2019]"
    alternatives = "|".join(
        apostrophe.join(re.escape(part) for part in key.split("'"))
        # Longest keys first so a longer contraction wins over a shorter prefix.
        for key in sorted(contraction_map, key=len, reverse=True)
    )
    # Lookarounds rather than \b: they demand "no word character next to the
    # match" whatever the first/last character of a key is.
    # re caches compiled patterns, so rebuilding per call stays cheap and
    # always reflects the current contents of contraction_map.
    pattern = re.compile(
        r"(?<!\w)(?:{})(?!\w)".format(alternatives), flags=re.IGNORECASE
    )

    def _expand(match):
        # Normalise the matched text to the form used by the map's keys.
        key = match.group().lower().replace("\u2019", "'")
        return contraction_map[key]

    return pattern.sub(_expand, text)

def preprocess_tweet(text):
    """Turn a raw tweet into a list of cleaned, lemmatised tokens.

    Steps, in order: lower-case; normalise the typographic apostrophe to the
    ASCII one; expand contractions; strip URLs; strip @mentions and #hashtags
    (the whole tag, not just the symbol); remove punctuation; replace
    non-ASCII runs with a space; split on whitespace; drop stop words;
    lemmatise what is left.

    Args:
        text: The tweet text.  Anything that is not a ``str`` (e.g. the NaN
            pandas uses for a missing cell) is treated as an empty tweet.

    Returns:
        A list of token strings; empty when nothing survives the cleaning.
    """
    # Missing values arrive from pandas as float NaN, which has no .lower().
    if not isinstance(text, str):
        return []

    # Curly apostrophes are non-ASCII and would otherwise be turned into a
    # space below, splitting "don\u2019t" into "don" + "t" before it can be
    # expanded.
    text = text.lower().replace("\u2019", "'")
    text = expand_contractions(text)
    # \b keeps the URL patterns from firing inside ordinary words
    # (e.g. the "www" in "awwww").
    text = re.sub(r"\bhttp\S+|\bwww\S+", "", text)
    text = re.sub(r"@\w+|#\w+", "", text)
    # Apostrophes are deleted so possessives stay one token ("airline's" ->
    # "airlines"); every other punctuation mark becomes a space so words it
    # separated are not glued together ("delayed...again").
    text = text.replace("'", "")
    text = text.translate(
        str.maketrans(string.punctuation, " " * len(string.punctuation))
    )
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    words = text.split()
    return [lemmatizer.lemmatize(w) for w in words if w not in stop_words]


In [None]:

# Load Dataset
df = pd.read_csv("Airline-Sentiment-2-w-AA.csv")

# Tokenise every tweet; each cell of 'cleaned' holds the list returned by
# preprocess_tweet for that row's 'text'.
df['cleaned'] = df['text'].map(preprocess_tweet)

# Preview the raw text next to its cleaned tokens.
df.loc[:, ['text', 'cleaned']].head()


In [None]:

# Load Word2Vec model
# Reads pretrained vectors from a gzip-compressed file in the binary word2vec
# format (binary=True); the result is used as the embedding lookup by
# tweet_to_vec.
# NOTE(review): no `limit=` is passed, so every vector in the file is loaded --
# consider capping it if memory or load time is a concern.
w2v = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)


In [None]:

# Convert tweets to vectors
def tweet_to_vec(words, model, vector_size=300):
    """Represent a tokenised tweet as the mean of its tokens' embeddings.

    Args:
        words: Iterable of token strings.
        model: Embedding lookup supporting ``word in model`` and
            ``model[word]`` (e.g. gensim ``KeyedVectors``).
        vector_size: Length of the zero vector returned when no token is in
            the vocabulary.  Only used if ``model`` does not expose its own
            ``vector_size`` attribute.

    Returns:
        A list with one number per embedding dimension: the element-wise mean
        of the vectors of all in-vocabulary tokens, or all zeros when there
        are none.
    """
    vectors = [model[word] for word in words if word in model]
    if not vectors:
        # Prefer the model's real dimensionality so the fallback always has
        # the same length as the averaged vectors, whatever embedding is
        # loaded; a mismatch would produce ragged feature rows.
        dim = getattr(model, "vector_size", vector_size)
        return [0.0] * dim
    # sum() starts from 0 and adds the arrays element-wise.
    return list(sum(vectors) / len(vectors))

# Feature rows: one averaged embedding (a plain list) per cleaned tweet.
X = [tweet_to_vec(tokens, w2v) for tokens in df['cleaned']]
# Targets: the sentiment label column, aligned row-for-row with X.
y = df['airline_sentiment']


In [None]:

# Train/test split and train classifier
# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training chain.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
clf


In [None]:

# Evaluate model
# Score the held-out 20% by the fraction of exactly matching labels.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Test Accuracy:", accuracy)


In [None]:

# Prediction function
def predict_tweet_sentiment(model, w2v_model, tweet):
    """Predict the sentiment label of a single raw tweet.

    The tweet goes through the same pipeline as the training data:
    ``preprocess_tweet`` for tokens, then ``tweet_to_vec`` for the feature
    vector.

    Args:
        model: Fitted classifier exposing ``predict``.
        w2v_model: Embedding lookup handed to ``tweet_to_vec``.
        tweet: Raw tweet text.

    Returns:
        The first (and only) element of ``model.predict`` for this tweet.
    """
    features = tweet_to_vec(preprocess_tweet(tweet), w2v_model)
    # predict() expects a batch, so wrap the single row in a list.
    predictions = model.predict([features])
    return predictions[0]

# Example
# Classify one hand-written tweet; the predicted label is the cell's output.
predict_tweet_sentiment(
    model=clf,
    w2v_model=w2v,
    tweet="I love this airline, great service!",
)
