In [100]:
import pandas as pd
import numpy as np
import gensim
from zipfile import ZipFile 
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cluster import KMeans,DBSCAN, SpectralClustering, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from  sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

In [101]:
file = "data.zip"
  
# ouvrir le fichier zip en mode lecture
with ZipFile(file, 'r') as zip: 
    # extraire tous les fichiers vers un autre répertoire
    zip.extractall('data')

In [102]:
df_tweets = pd.read_csv('data/out.csv')
df_tweets 

Unnamed: 0,lang,possibly_sensitive,text,created_at,label
0,en,False,package would cost problably ukraine gdp,2023-01-15 21:01:27,0
1,en,False,rt syrian solidarity ukrainians strong horrid ...,2023-01-15 21:01:27,-1
2,fr,False,rt apoth ose guerre tableau peintre russe vass...,2023-01-15 21:01:27,0
3,en,False,ukrainians dying far greater numbers fantasy i...,2023-01-15 21:01:27,1
4,en,False,rt death toll jan russian cruise missile strik...,2023-01-15 21:01:27,-1
...,...,...,...,...,...
908,en,False,rt russian troops makiivka party new years eve...,2023-01-15 20:59:11,1
909,en,False,rt polish pm morawiecki traveling berlin tomor...,2023-01-15 20:59:11,1
910,en,False,done nothing russia would taken ukraine days w...,2023-01-15 20:59:11,1
911,en,False,rt sleepy joe japanese prime minister fumio ki...,2023-01-15 20:59:11,0


In [103]:
df_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 913 entries, 0 to 912
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   lang                913 non-null    object
 1   possibly_sensitive  913 non-null    bool  
 2   text                911 non-null    object
 3   created_at          913 non-null    object
 4   label               913 non-null    int64 
dtypes: bool(1), int64(1), object(3)
memory usage: 29.5+ KB


In [104]:
X= df_tweets['text']
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()

In [105]:
X_train_counts = count_vect.fit_transform(X.values.astype('U'))
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts).todense()

In [106]:
# Using DBSCAN
dbscan_est = DBSCAN(eps=0.5, min_samples=2)
dbscan_est.fit_predict(np.asarray(X_train_tfidf))
#Storing the labels formed by the DBSCAN
labels = dbscan_est.labels_
#Calculating "the number of clusters"
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print("Number of clusters obtained using DBSCAN:",n_clusters_)

Number of clusters obtained using DBSCAN: 103


In [107]:
#Computing "the Silhouette Score"
print("Silhouette Coefficient: %0.3f"
      % silhouette_score(np.asarray(X_train_tfidf), labels))

Silhouette Coefficient: 0.433


In [108]:
# Using K-means with n_clusters equal to the number of clusters found by the algorithm DBSCAN
K = 245
kmeans_est = KMeans(n_clusters=K, random_state=0)
kmeans_est.fit_predict(np.asarray(X_train_tfidf))



array([ 36,  37,  98, 238,  10, 132, 150, 147,  24,   1,  70,   2,  93,
       142,  11,  10, 138, 132,   1,   1,  36, 154, 132,  87,  59, 237,
       240,  72,  73,   4, 100, 136,  69,  72, 132,  43,  45, 162,  13,
       151,  98, 154,  84, 132, 228,   2,  54, 135, 197,  10,  10,  15,
        10,   7, 231, 131,  29,   7, 126, 214,  12, 115,  82,  38,   7,
        28, 240,   4,   5, 232,  32, 143, 201, 132,  24, 171, 117,   1,
       106, 103,  63,   0, 227, 130,  24, 105,  21,  64,  47, 109,  71,
       120, 145,  21,  44, 122, 191, 207,  41,   5, 116,  41,  44,  21,
       235,  95, 132,  31,  76, 132,  20,   4, 162, 174, 223, 165, 165,
        10, 172,  38, 143,   5,  60,   2,   5, 212, 235,   0,   5,  42,
        65, 165, 145, 240, 139, 114, 118,  92,   9, 230,   0, 142, 132,
        21, 167, 244, 241, 126,  70, 142,  24, 240,  10,  59, 125, 121,
       197, 145, 187, 143, 106,  13,  74, 240,   4,  29, 232,  11,  72,
         2,  66,  93, 124,  40,  18, 222,  77, 211, 135,   0,   

In [110]:
# Calculate Silhoutte Score
score = silhouette_score(np.asarray(X_train_tfidf), kmeans_est.labels_)
print("Silhouette Coeficient", score)

Silhouette Coeficient 0.41412518493501765


In [111]:
# Using Hierarchical clustering
HCA_est = AgglomerativeClustering(n_clusters=K)
HCA_est.fit_predict(np.asarray(X_train_tfidf))
#Computing "the Silhouette Score"
print("Silhouette Coefficient: %0.3f"
      % silhouette_score(np.asarray(X_train_tfidf), HCA_est.labels_))

Silhouette Coefficient: 0.467
