# Twitter Sentiment Analysis and Topic modeling

In [1]:
import pandas as pd

# Load dataset
df = pd.read_csv('Tweets.csv')
print(df.columns)
print(df[['text', 'airline_sentiment']].head())


Index(['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence',
       'negativereason', 'negativereason_confidence', 'airline',
       'airline_sentiment_gold', 'name', 'negativereason_gold',
       'retweet_count', 'text', 'tweet_coord', 'tweet_created',
       'tweet_location', 'user_timezone'],
      dtype='object')
                                                text airline_sentiment
0                @VirginAmerica What @dhepburn said.           neutral
1  @VirginAmerica plus you've added commercials t...          positive
2  @VirginAmerica I didn't today... Must mean I n...           neutral
3  @VirginAmerica it's really aggressive to blast...          negative
4  @VirginAmerica and it's a really big bad thing...          negative


In [2]:
# removing neutral tweets
df = df[df['airline_sentiment']!='neutral']
print(df.shape)

# positive negative codification
df['airline_sentiment'] = df['airline_sentiment'].apply(lambda x: 1 if x =='positive' else 0)

(6860, 15)


In [3]:
print(df[['text', 'airline_sentiment']].head())

                                                text  airline_sentiment
1  @VirginAmerica plus you've added commercials t...                  1
3  @VirginAmerica it's really aggressive to blast...                  0
4  @VirginAmerica and it's a really big bad thing...                  0
5  @VirginAmerica seriously would pay $30 a fligh...                  0
6  @VirginAmerica yes, nearly every time I fly VX...                  1


In [4]:
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

import re

stop_words = stopwords.words('english')
stop_words.extend(['get','hour','http','co'])

def clean_text(text):
    text = text.lower()
    text = re.sub("&[a-z]*;", " ",text)                                           # html tags such as &gt;
    text = re.sub("@[a-z]*", " ",text)
    text = re.sub("[^a-zA-Z]", " ",text)                                          # keep only letters
                                                                                   # lower case
      # Remove stop_word
    words = word_tokenize(text)
    new_text = ""
    for i,w in enumerate(words):
        #if i == 0:                                                                  # first token is always airline name
        #    continue
        if w == 'flightled':
            w = 'flight'
        if w not in stop_words and len(w) > 1:
            new_text = new_text + " " + w
    return new_text
  #return ' '.join([word for word in text.split() if word.lower() not in stopwords.words('english')])

from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

from nltk import word_tokenize, pos_tag

nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

def lemmatize_tag(text):
    lemma=[]
    for i,j in pos_tag(word_tokenize(text)) :
        p=j[0].lower()
        lm = ''
        if p in ['j','n','v']:
            if p == 'j':
                p = 'a'
            lm = wnl.lemmatize(i,p)
        else :
            lm = wnl.lemmatize(i)
        if len(lm) > 1:
            lemma.append(lm)
    return ' '.join(lemma)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [5]:
df['cleaned_tweet'] = df['text'].apply(clean_text)
df['cleaned_tweet'] = df['cleaned_tweet'].apply(lemmatize_tag)

# remove rows with empty clean tweets
df = df[df['cleaned_tweet'].apply(len)>0]

print(df.shape)
df.head()

(6856, 16)


Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone,cleaned_tweet
1,570301130888122368,1,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada),plus added commercial experience tacky
3,570301031407624196,0,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada),really aggressive blast obnoxious entertainmen...
4,570300817074462722,0,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada),really big bad thing
5,570300767074181121,0,1.0,Can't Tell,0.6842,Virgin America,,jnardino,,0,@VirginAmerica seriously would pay $30 a fligh...,,2015-02-24 11:14:33 -0800,,Pacific Time (US & Canada),seriously would pay flight seat play really ba...
6,570300616901320704,1,0.6745,,0.0,Virgin America,,cjmcginnis,,0,"@VirginAmerica yes, nearly every time I fly VX...",,2015-02-24 11:13:57 -0800,San Francisco CA,Pacific Time (US & Canada),yes nearly every time fly vx ear worm go away


In [6]:
df_negative = df[df['airline_sentiment'] == 0]

In [9]:
!pip install bertopic

from bertopic import BERTopic

Collecting bertopic
  Downloading bertopic-0.17.0-py3-none-any.whl.metadata (23 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloa

In [11]:
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(df_negative['cleaned_tweet'])

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

<!-- from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer


# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words="english")

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
representation_model = KeyBERTInspired()

# All steps together
topic_model = BERTopic(
  embedding_model=embedding_model,          # Step 1 - Extract embeddings
  umap_model=umap_model,                    # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,              # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,        # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,                # Step 5 - Extract topic words
  representation_model=representation_model # Step 6 - (Optional) Fine-tune topic representations
) -->

In [12]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1732,-1_flight_cancel_plane_time,"[flight, cancel, plane, time, board, delay, ho...","[ua cancelled flight wait customer service, tr..."
1,0,611,0_bag_luggage_baggage_lose,"[bag, luggage, baggage, lose, check, claim, cl...","[luggage, agent force check carry bag receive ..."
2,1,403,1_customer_hold_service_call,"[customer, hold, service, call, phone, minute,...","[customer service please call, customer servic..."
3,2,225,2_delay_late_time_flight,"[delay, late, time, flight, delayed, departure...",[never flight time delay earlier another delay...
4,3,167,3_gate_tarmac_agent_wait,"[gate, tarmac, agent, wait, sit, plane, min, c...","[crew gate, gate ready gate agent go two, good..."
5,4,125,4_email_complaint_response_case,"[email, complaint, response, case, address, su...","[personal email contact complaint know, fyi in..."
6,5,112,5_book_problem_website_try,"[book, problem, website, try, site, trip, flig...",[problem flight book problem web site keep giv...
7,6,109,6_cancel_flight_cancelled_flighted,"[cancel, flight, cancelled, flighted, home, fl...",[option cancel flight want change flight want ...
8,7,102,7_attendant_rude_passenger_staff,"[attendant, rude, passenger, staff, agent, jok...",[correct attendant error give attitude make ru...
9,8,98,8_suck_guy_tell_nope,"[suck, guy, tell, nope, character, nothing, kn...","[suck, suck, guy suck]"
