<a href="https://colab.research.google.com/github/prathameshks/College-Code-sem7/blob/main/ML-Honors%2520AIML/Practical%25204/twitter_sentement.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import re
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [3]:
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [4]:
nltk.download('maxent_ne_chunker_tab')

[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker_tab.zip.


True

In [5]:
# Step 1: Collect tweets (In this case, load from CSV)
def load_tweets(file_path, num_samples=10000):
    df = pd.read_csv(file_path, encoding='latin-1', header=None,
                     names=['target', 'id', 'date', 'flag', 'user', 'text'])
    return df.sample(n=num_samples, random_state=42)


In [6]:

# Step 2: Pre-process tweets
def preprocess_tweet(text):
    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    # Remove special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [15]:

# Step 3: Apply sentiment analysis
def analyze_sentiment(text):
    sia = SentimentIntensityAnalyzer()
    return sia.polarity_scores(text)['compound']

# Step 4: Apply named entity recognition
def extract_entities(text):
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)
    tree = ne_chunk(pos_tags)
    entities = []
    for subtree in tree:
        if isinstance(subtree, nltk.Tree):
            entities.append((subtree.label(), ' '.join([token for token, pos in subtree.leaves()])))
    return entities

# Step 5: Cluster tweets
def cluster_tweets(texts, n_clusters=5):
    vectorizer = TfidfVectorizer(max_features=1000)
    X = vectorizer.fit_transform(texts)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(X)
    return kmeans.labels_

# Step 6: Visualize analysis
def visualize_analysis(df):
    # Sentiment distribution
    plt.figure(figsize=(10, 5))
    sns.histplot(df['sentiment'], bins=50, kde=True)
    plt.title('Sentiment Distribution')
    plt.xlabel('Sentiment Score')
    plt.savefig('sentiment_distribution.png')
    plt.close()

    # Top entities
    entity_counts = df['entities'].explode().value_counts()
    if not entity_counts.empty:
        entity_counts = entity_counts.head(10)
        plt.figure(figsize=(12, 6))
        entity_counts.plot(kind='bar')
        plt.title('Top 10 Named Entities')
        plt.xlabel('Entity')
        plt.ylabel('Frequency')
        plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
        plt.tight_layout()
        plt.savefig('named_entities.png')
    else:
        print("No named entities found to plot.")  # Inform user if plotting is skipped


    # Cluster visualization (using t-SNE for dimensionality reduction)
    from sklearn.manifold import TSNE
    vectorizer = TfidfVectorizer(max_features=1000)
    X = vectorizer.fit_transform(df['processed_text'])
    tsne = TSNE(n_components=2, random_state=42)
    tsne_results = tsne.fit_transform(X.toarray())

    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(tsne_results[:, 0], tsne_results[:, 1], c=df['cluster'], cmap='viridis')
    plt.colorbar(scatter)
    plt.title('Tweet Clusters Visualization')
    plt.xlabel('t-SNE 1')
    plt.ylabel('t-SNE 2')
    plt.savefig('tweet_clusters.png')
    plt.close()


In [16]:
# Step 1: Load tweets
df = load_tweets('https://github.com/prathameshks/College-Code-sem7/raw/refs/heads/main/ML-Honors%20AIML/Practical%204/data.csv', num_samples=1000)


In [17]:
df.head()

Unnamed: 0,target,id,date,flag,user,text
80956,0,1752531368,Sat May 09 22:11:06 PDT 2009,NO_QUERY,TrishGoger,so.. cold..
109622,0,1824611419,Sun May 17 02:08:36 PDT 2009,NO_QUERY,xeeko,sick! cold! cough! and fever! just at the mis...
148312,0,1883094112,Fri May 22 07:59:57 PDT 2009,NO_QUERY,iamsuebee,@_theX . . . And yet you never come to Beantow...
79883,0,1752047759,Sat May 09 20:55:31 PDT 2009,NO_QUERY,JKlover,@JenAlaniz Wasn't it though! I love the 90's! ...
267821,0,1989271965,Mon Jun 01 00:56:56 PDT 2009,NO_QUERY,SuperDunner,"@DirtyDiva sorry, but @jeffrubenstein didn't w..."


In [18]:
# Step 2: Pre-process tweets
df['processed_text'] = df['text'].apply(preprocess_tweet)


In [19]:
df.head()

Unnamed: 0,target,id,date,flag,user,text,processed_text
80956,0,1752531368,Sat May 09 22:11:06 PDT 2009,NO_QUERY,TrishGoger,so.. cold..,so cold
109622,0,1824611419,Sun May 17 02:08:36 PDT 2009,NO_QUERY,xeeko,sick! cold! cough! and fever! just at the mis...,sick cold cough and fever just at the mist of ...
148312,0,1883094112,Fri May 22 07:59:57 PDT 2009,NO_QUERY,iamsuebee,@_theX . . . And yet you never come to Beantow...,and yet you never come to beantown smh haha
79883,0,1752047759,Sat May 09 20:55:31 PDT 2009,NO_QUERY,JKlover,@JenAlaniz Wasn't it though! I love the 90's! ...,wasnt it though i love the 90s and the 80s so ...
267821,0,1989271965,Mon Jun 01 00:56:56 PDT 2009,NO_QUERY,SuperDunner,"@DirtyDiva sorry, but @jeffrubenstein didn't w...",sorry but didnt want any of my hugs


In [20]:
# Step 3: Apply sentiment analysis
df['sentiment'] = df['processed_text'].apply(analyze_sentiment)


In [21]:
df.head()

Unnamed: 0,target,id,date,flag,user,text,processed_text,sentiment
80956,0,1752531368,Sat May 09 22:11:06 PDT 2009,NO_QUERY,TrishGoger,so.. cold..,so cold,0.0
109622,0,1824611419,Sun May 17 02:08:36 PDT 2009,NO_QUERY,xeeko,sick! cold! cough! and fever! just at the mis...,sick cold cough and fever just at the mist of ...,-0.5106
148312,0,1883094112,Fri May 22 07:59:57 PDT 2009,NO_QUERY,iamsuebee,@_theX . . . And yet you never come to Beantow...,and yet you never come to beantown smh haha,0.1779
79883,0,1752047759,Sat May 09 20:55:31 PDT 2009,NO_QUERY,JKlover,@JenAlaniz Wasn't it though! I love the 90's! ...,wasnt it though i love the 90s and the 80s so ...,-0.6103
267821,0,1989271965,Mon Jun 01 00:56:56 PDT 2009,NO_QUERY,SuperDunner,"@DirtyDiva sorry, but @jeffrubenstein didn't w...",sorry but didnt want any of my hugs,0.5882


In [22]:
# Step 4: Apply named entity recognition
df['entities'] = df['processed_text'].apply(extract_entities)

In [23]:
df.head()

Unnamed: 0,target,id,date,flag,user,text,processed_text,sentiment,entities
80956,0,1752531368,Sat May 09 22:11:06 PDT 2009,NO_QUERY,TrishGoger,so.. cold..,so cold,0.0,[]
109622,0,1824611419,Sun May 17 02:08:36 PDT 2009,NO_QUERY,xeeko,sick! cold! cough! and fever! just at the mis...,sick cold cough and fever just at the mist of ...,-0.5106,[]
148312,0,1883094112,Fri May 22 07:59:57 PDT 2009,NO_QUERY,iamsuebee,@_theX . . . And yet you never come to Beantow...,and yet you never come to beantown smh haha,0.1779,[]
79883,0,1752047759,Sat May 09 20:55:31 PDT 2009,NO_QUERY,JKlover,@JenAlaniz Wasn't it though! I love the 90's! ...,wasnt it though i love the 90s and the 80s so ...,-0.6103,[]
267821,0,1989271965,Mon Jun 01 00:56:56 PDT 2009,NO_QUERY,SuperDunner,"@DirtyDiva sorry, but @jeffrubenstein didn't w...",sorry but didnt want any of my hugs,0.5882,[]


In [24]:
# Step 5: Cluster tweets
df['cluster'] = cluster_tweets(df['processed_text'])

In [25]:
df.head()

Unnamed: 0,target,id,date,flag,user,text,processed_text,sentiment,entities,cluster
80956,0,1752531368,Sat May 09 22:11:06 PDT 2009,NO_QUERY,TrishGoger,so.. cold..,so cold,0.0,[],1
109622,0,1824611419,Sun May 17 02:08:36 PDT 2009,NO_QUERY,xeeko,sick! cold! cough! and fever! just at the mis...,sick cold cough and fever just at the mist of ...,-0.5106,[],2
148312,0,1883094112,Fri May 22 07:59:57 PDT 2009,NO_QUERY,iamsuebee,@_theX . . . And yet you never come to Beantow...,and yet you never come to beantown smh haha,0.1779,[],4
79883,0,1752047759,Sat May 09 20:55:31 PDT 2009,NO_QUERY,JKlover,@JenAlaniz Wasn't it though! I love the 90's! ...,wasnt it though i love the 90s and the 80s so ...,-0.6103,[],1
267821,0,1989271965,Mon Jun 01 00:56:56 PDT 2009,NO_QUERY,SuperDunner,"@DirtyDiva sorry, but @jeffrubenstein didn't w...",sorry but didnt want any of my hugs,0.5882,[],3


In [26]:
# Step 6: Visualize analysis
visualize_analysis(df)

print("Analysis complete. Visualization images saved.")

No named entities found to plot.
Analysis complete. Visualization images saved.
