In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import TextBlob
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [10]:
# Read the tweet corpus from CSV and preview the first ten raw messages.
data = pd.read_csv("Tweets.csv")
print(data['text'].head(10))

# Normalise every tweet: tokenize, lower-case, drop NLTK English stop words,
# then re-join the surviving tokens with single spaces.
stop_words = set(stopwords.words('english'))


def _strip_stop_words(raw_text):
    """Return ``raw_text`` lower-cased with English stop words removed."""
    lowered_tokens = (token.lower() for token in word_tokenize(raw_text))
    return ' '.join(token for token in lowered_tokens if token not in stop_words)


# The cleaned text replaces the original 'text' column.
data['text'] = data['text'].apply(_strip_stop_words)


0                  @VirginAmerica What @dhepburn said.
1    @VirginAmerica plus you've added commercials t...
2    @VirginAmerica I didn't today... Must mean I n...
3    @VirginAmerica it's really aggressive to blast...
4    @VirginAmerica and it's a really big bad thing...
5    @VirginAmerica seriously would pay $30 a fligh...
6    @VirginAmerica yes, nearly every time I fly VX...
7    @VirginAmerica Really missed a prime opportuni...
8      @virginamerica Well, I didn't…but NOW I DO! :-D
9    @VirginAmerica it was amazing, and arrived an ...
Name: text, dtype: object


In [11]:
# Score each tweet with TextBlob's sentiment polarity and store the result in
# a new 'sentiment' column, then preview the first ten scores.
def _polarity(tweet_text):
    """Return the TextBlob sentiment polarity of ``tweet_text``."""
    return TextBlob(tweet_text).sentiment.polarity


data['sentiment'] = data['text'].apply(_polarity)
print(data['sentiment'].head(10))

0    0.000000
1    0.000000
2   -0.390625
3    0.006250
4   -0.350000
5   -0.516667
6    0.466667
7    0.200000
8    0.000000
9    0.466667
Name: sentiment, dtype: float64


In [19]:
# Turn the tweets into a TF-IDF (Term Frequency-Inverse Document Frequency)
# feature matrix with scikit-learn's TfidfVectorizer.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',  # scikit-learn's built-in English stop-word list
    min_df=2,              # skip terms that occur in fewer than 2 tweets
    max_df=0.95,           # skip terms that occur in more than 95% of tweets
)
# Learn the vocabulary and produce the document-term matrix in one pass.
tfidf = tfidf_vectorizer.fit_transform(data['text'])


In [21]:
# Topic modelling with Non-negative Matrix Factorization (NMF) from
# scikit-learn: build a 10-component model with a fixed random_state and fit
# it to the TF-IDF matrix.
num_topics = 10
nmf_model = NMF(n_components=num_topics, random_state=42).fit(tfidf)

# Keep the fitted component matrix; the next cell reads one row of per-term
# weights from it for each topic.
nmf_topic_keywords = nmf_model.components_

In [22]:
# Build and print a comma-separated keyword summary for every NMF topic.
top_n = 10  # number of highest-weighted terms reported per topic

# Vocabulary terms, indexed by TF-IDF feature column.
keywords = tfidf_vectorizer.get_feature_names_out()

topic_keywords = []
for topic_weights in nmf_topic_keywords:
    # argsort() is ascending, so reverse it and keep the first top_n indices
    # to get the terms with the largest weights, biggest first.
    top_indices = topic_weights.argsort()[::-1][:top_n]
    topic_keywords.append(', '.join(keywords[i] for i in top_indices))

# Use a distinct loop variable so the `topic_keywords` list is not rebound to
# a single string; the list stays usable after this cell runs.
for topic_index, topic_summary in enumerate(topic_keywords):
    print(f"Topic {topic_index}: {topic_summary}")

Topic 0: flight, late, delayed, time, virginamerica, problems, booking, plane, hour, delay
Topic 1: jetblue, http, fleek, fleet, rt, love, jfk, guys, stop, good
Topic 2: united, dm, bag, http, plane, time, gate, airline, like, yes
Topic 3: thank, great, jetblue, ok, response, virginamerica, appreciate, okay, awesome, appreciated
Topic 4: thanks, great, jetblue, virginamerica, got, awesome, reply, response, ok, good
Topic 5: southwestair, dm, sent, http, follow, hold, help, destinationdragons, love, southwest
Topic 6: usairways, hold, hours, help, hour, phone, ve, minutes, plane, time
Topic 7: americanair, help, aa, ca, dm, need, phone, hours, guys, dfw
Topic 8: service, customer, worst, great, terrible, experience, poor, airline, today, care
Topic 9: cancelled, flightled, flights, flighted, tomorrow, hold, help, flight, rebook, today
