In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
import gensim
from sklearn.cluster import KMeans

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
def clean_tweet(tweet):
    # Remove URLs
    tweet = re.sub(r"http\S+|www\S+|https\S+", "", tweet, flags=re.MULTILINE)
    tweet = re.sub(r'\S+@\S+', '', tweet)

    tweet = re.sub(r'#\w+', '', tweet)
    tweet = re.sub(r'@\w+', '', tweet)
    tweet = re.sub(r'[^\w\s]', ' ', tweet)
    tweet = tweet.lower()

    tweet = re.sub('[()!?]', ' ', tweet)
    tweet = re.sub(r'  ', ' ', tweet)
    tweet = re.sub(r'   ', ' ', tweet)
    tweet = re.sub("\d+\s|\s\d+", " ", tweet)
  
    tokens = tweet.split()
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    cleaned_tweet = ' '.join(tokens)
    
    return cleaned_tweet


In [5]:
df = pd.read_csv('drive/MyDrive/PJAIT/ZUM/Twitter_Data.csv')
df = df.rename(columns={"Text":"tweet"})
df["tweet"] = df["tweet"].apply(lambda x: clean_tweet(x))

df["is_long"] = df["tweet"].apply(lambda x:0 if (len(x) <10) | (' ' not in x) else 1)
df = df[df["is_long"]==1].reset_index(drop=True).drop(columns="is_long")


In [7]:
sentences = df['tweet'].apply(lambda x: x.split())

model = gensim.models.Word2Vec(sentences, min_count=1, vector_size=100)

word_vectors = np.zeros((len(sentences), 100))
for i, sentence in enumerate(sentences):
    for word in sentence:
        if word in model.wv:
            word_vectors[i] += model.wv[word]

# Normalize the word vectors
word_vectors /= np.linalg.norm(word_vectors, axis=1, keepdims=True)

# Remove rows with NaN values
word_vectors = word_vectors[~np.isnan(word_vectors).any(axis=1)]

# Perform K-Means clustering
k = 2  # Number of clusters
kmeans = KMeans(n_clusters=k)
clusters = kmeans.fit_predict(word_vectors)

df = df[~np.isnan(word_vectors).any(axis=1)]
df['Cluster'] = clusters



In [8]:
df["Cluster"] = df["Cluster"].apply(lambda x:"group_1" if x == 0 else "group_2")

In [9]:
df.groupby('Cluster').count()

Unnamed: 0_level_0,tweet
Cluster,Unnamed: 1_level_1
group_1,90565
group_2,54410


In [11]:
df.head(10)

Unnamed: 0,tweet,Cluster
0,guides field apart get competency certificatio...,group_2
1,exactly liberal logic seem forget even rbg sai...,group_1
2,trampled integrity broken fun things never don...,group_1
3,written problems address kenyans english people,group_1
4,see thing president cant add seats supreme cou...,group_1
5,idk like thing vas yun jin sing like performan...,group_1
6,passion distributed systems kubernetes databas...,group_2
7,day dream time,group_2
8,dream lover,group_2
9,incredible moment,group_2


In [12]:
df.to_csv('drive/MyDrive/PJAIT/ZUM/Twitter_Clustered_Data.csv',index=False,index_label=False)