In [1]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

In [2]:
tweets = pd.read_csv('datasets/cleaned_tweets.csv')

In [3]:
tweets.head()

Unnamed: 0,tweets,id,name,location,coordinates,created_at,favorite_count,geo,place,source,clean_tweets
0,@Jim_Howard_13 @RepAndyBiggsAZ @RandPaul just ...,1.260339e+18,corporationsgonewild 🌊,any town USA,,2020-05-12,1.0,,,Twitter Web App,jim howard repandybiggsaz randpaul go drink cl...
1,Latest study of #Trump's proposed #COVID19 dru...,1.260334e+18,Darius,"Berkeley, CA",,2020-05-12,0.0,,,Twitter for Android,latest study trump proposed covid drug hydroxy...
2,Restock!!\n\nClorox Fraganzia Multi-Purpose Cl...,1.260322e+18,Find 😷 Essentials & Save 💰 Shopping Online,,,2020-05-12,0.0,,,Twitter Web App,restock clorox fraganzia multi purpose cleaner...
3,RT @ABC7: Searching high and low for #Clorox d...,1.260316e+18,jeaned62803,,,2020-05-12,0.0,,,Twitter for iPhone,rt abc searching high low clorox disinfecting ...
4,😊\n[...] Alec #Baldwin returns as President #T...,1.260315e+18,Desnot,Paris Bastille,,2020-05-12,0.0,,,Twitter Web App,alec baldwin returns president trump drinking ...


In [4]:
tweets.dtypes

tweets             object
id                float64
name               object
location           object
coordinates        object
created_at         object
favorite_count    float64
geo                object
place              object
source             object
clean_tweets       object
dtype: object

In [4]:
tweets.isnull().sum()

tweets               0
id                   0
name                 0
location          1821
coordinates       6134
created_at           0
favorite_count       0
geo               6134
place             6008
source               0
clean_tweets         8
dtype: int64

In [5]:
tweets.dropna(subset = ['clean_tweets'] , inplace=True)

In [6]:
tweets.isnull().sum()

tweets               0
id                   0
name                 0
location          1814
coordinates       6126
created_at           0
favorite_count       0
geo               6126
place             6000
source               0
clean_tweets         0
dtype: int64

In [7]:
cvec = CountVectorizer(stop_words=None , ngram_range=(1, 2))

cvec_matrix = cvec.fit_transform(tweets['clean_tweets'])

cvec_term_df = pd.DataFrame(cvec_matrix.toarray(), columns=cvec.get_feature_names())
cvec_term_df.head()

Unnamed: 0,aa,aaah,aaah also,aaal,aaal bw,aahvhvxtp,aahvhvxtp handsanitizer,aarhpsi,aaronblake,aaronblake would,...,zyejvigc,zyvtvrsmvz,zyxjptipzn,zyyn,zyyn dwumr,zz,zz nrw,zzhpqwjkt,zzl,zznpnia
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
tfidf = TfidfVectorizer(stop_words=None , ngram_range=(1, 2))

tfidf_matrix = tfidf.fit_transform(tweets['clean_tweets'])

tfidf_term_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names())
tfidf_term_df.head()

Unnamed: 0,aa,aaah,aaah also,aaal,aaal bw,aahvhvxtp,aahvhvxtp handsanitizer,aarhpsi,aaronblake,aaronblake would,...,zyejvigc,zyvtvrsmvz,zyxjptipzn,zyyn,zyyn dwumr,zz,zz nrw,zzhpqwjkt,zzl,zznpnia
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
tfidf_term_df.sum().sort_values(ascending=False).head(20)

co                   157.268333
https                157.126803
https co             156.344705
rt                   140.830209
clorox                98.662066
alcohol               70.108089
isopropyl             65.144618
disinfectant          62.769171
toiletpaper           60.059064
isopropyl alcohol     56.918925
facemask              54.316415
lysol                 49.262571
hand                  48.427888
sanitizer             46.119049
trump                 43.415741
covid                 42.656648
handsanitizer         42.482825
hand sanitizer        38.047538
wipes                 37.139828
realdonaldtrump       34.061699
dtype: float64

In [10]:
tfidf_term_df.drop(columns=['co', 'https', 'https co', 'rt'], inplace=True)

In [138]:
tfidf_term_df.sum().sort_values(ascending=False).head(10) / tfidf_term_df.shape[0] * 100

clorox               1.596215
alcohol              1.134252
isopropyl            1.053949
disinfectant         1.015518
toiletpaper          0.971672
isopropyl alcohol    0.920869
facemask             0.878764
lysol                0.797000
hand                 0.783496
sanitizer            0.746142
dtype: float64

In [12]:
cvec_term_df.drop(columns=['co', 'https', 'https co', 'rt'], inplace=True)

In [137]:
cvec_term_df.sum().sort_values(ascending=False).head(60) / cvec_term_df.shape[0] * 100

clorox               15.806504
alcohol              12.053066
isopropyl            10.791134
disinfectant         10.419026
facemask              9.318880
isopropyl alcohol     8.768808
toiletpaper           8.477593
hand                  8.380521
sanitizer             7.765734
lysol                 7.474519
trump                 6.827374
covid                 6.795017
hand sanitizer        6.180230
handsanitizer         6.099337
wipes                 4.918298
realdonaldtrump       4.675619
people                4.400582
face                  4.238796
mask                  4.141725
bleach                3.381330
coronavirus           3.284258
masks                 3.154829
new                   2.960686
get                   2.895972
need                  2.847436
toilet                2.798900
paper                 2.701828
like                  2.459149
today                 2.426792
take                  2.394435
use                   2.345899
bt                    2.232648
call    

In [None]:
# https://github.com/TungPhung/Twitter-Natural-Disaster-Mapping/blob/master/ProcessingandModeling.ipynb

In [142]:
dbscan_cvec = DBSCAN()

In [143]:
important_words = ['clorox', 'alcohol', 'isopropyl', 'disinfectant', 'face mask', 'isopropyl alcohol',
                   'toilet paper', 'hand', 'sanitizer', 'hand sanitizer', 'lysol', 'handsanitizer',
                   'wipes', 'face', 'mask', 'bleach', 'masks', 'new', 'get', 'need', 'toilet', 'paper',
                   'today', 'go' , 'toiletpaper', 'spray', 'stock', 'oz', 'us', 'available']

In [144]:
dbscan_cvec.fit(cvec_term_df)

DBSCAN(algorithm='auto', eps=0.5, leaf_size=30, metric='euclidean',
       metric_params=None, min_samples=5, n_jobs=None, p=None)

In [145]:
# checking the silhouette score
silhouette_score(cvec_term_df, dbscan_cvec.labels_)

-0.05342333876367848

In [146]:
dbscan_cvec.fit(cvec_term_df[important_words])

DBSCAN(algorithm='auto', eps=0.5, leaf_size=30, metric='euclidean',
       metric_params=None, min_samples=5, n_jobs=None, p=None)

In [147]:
dbscan_cvec.labels_

array([ 0,  1,  2, ...,  1, 72,  1])

In [148]:
tweets['commodity_tweet'] = dbscan_cvec.labels_

In [149]:
# checking the silhouette score
silhouette_score(cvec_term_df[important_words], dbscan_cvec.labels_)

0.8276351962894265

In [122]:
tweets_dbscan = tweets.groupby(['tweets'])['commodity_tweet'].mean()

In [123]:
pd.DataFrame(tweets_dbscan).sort_values('commodity_tweet', ascending=True).head(40)

Unnamed: 0_level_0,commodity_tweet
tweets,Unnamed: 1_level_1
"Looking for hand sanitizer? Desert Essence has lemongrass probiotic hand sanitizer in stock, 8 fl oz for 9.99. And… https://t.co/JdrKj00Me1",-1
Restock!! \n\nClorox 4 in 1 Disinfectant Sanitizer\n\nStarting at $7.19\n\n*location based and may not be available to al… https://t.co/xUr7t9xEAa,-1
The last two stores I went into had toilet paper. Could the toilet paper crisis of 2020 be almost over?… https://t.co/lkfGyw0M7N,-1
@Lysol so when does the average shopper get to see your disinfectant sprays and wipes back on the shelves? I'm tir… https://t.co/2CkcFvtACL,-1
"RT @SirFreebie75061: My body has absorbed so much soap and sanitizer that when I pee, it cleans the toilet.\n\n#coronavirus #WashYourHands #h…",-1
mamaya available na.. 🤗✨🤭☺️\nKids Premium Neoprene Mask\nretail - 35\n3pcs for 100 pesos only ‼️\n\nwith tissue pocket\n2… https://t.co/9OphdIJmiA,-1
RT @SinmaraDesign: NEW!! 📢 Mask collection! https://t.co/0YlPmz8p3o via @zazzle #mask #masks #facemask #facemasks #stayhealthy #clothfacem…,-1
Check out Lot 6 - 11oz Free Shipping #Clorox https://t.co/6vZ4twLPny via @eBay,-1
The lockdown is not sustainable. Open up...\n\nbut not until the following are available:\n\n-Good masks\n-Hand sanitize… https://t.co/49NqLRE4ud,-1
Restock!! \n\nClorox 4 in 1 Disinfectant Sanitizer\n\nStarting at $7.19\n\n*location based and may not be available to al… https://t.co/Vxp18OAej3,-1


In [124]:
geo_dbscan = tweets.groupby(['geo'])['commodity_tweet'].mean()
pd.DataFrame(geo_dbscan).sort_values('commodity_tweet', ascending=True).head(40)

Unnamed: 0_level_0,commodity_tweet
geo,Unnamed: 1_level_1
"{'type': 'Point', 'coordinates': [33.94283183, -118.33342164]}",-1.0
"{'type': 'Point', 'coordinates': [43.5154, -79.62653]}",-1.0
"{'type': 'Point', 'coordinates': [34.0564, -118.2445]}",-1.0
"{'type': 'Point', 'coordinates': [6.45306, 3.39583]}",-1.0
"{'type': 'Point', 'coordinates': [44.95905, -92.95983]}",-1.0
"{'type': 'Point', 'coordinates': [33.76443, -84.35005]}",-1.0
"{'type': 'Point', 'coordinates': [28.56006986, -81.82137357]}",-1.0
"{'type': 'Point', 'coordinates': [32.8695093, -96.6797152]}",-1.0
"{'type': 'Point', 'coordinates': [45.4201767, -122.78649566]}",-1.0
"{'type': 'Point', 'coordinates': [46.50105096, -84.34907913]}",-1.0


In [40]:
ss = StandardScaler()
tfidf_transformed = ss.fit_transform(tfidf_term_df)

In [41]:
tfidf_trans = DBSCAN()
tfidf_trans.fit(tfidf_term_df[important_words])

DBSCAN(algorithm='auto', eps=0.5, leaf_size=30, metric='euclidean',
       metric_params=None, min_samples=5, n_jobs=None, p=None)

In [42]:
# checking the silhouette score
silhouette_score(tfidf_term_df, tfidf_trans.labels_)

-0.007261363257047598

In [43]:
tfidf_trans.labels_

array([0, 0, 0, ..., 0, 0, 0])

In [126]:
tweets['source'].value_counts(normalize=True)

Twitter for iPhone      0.333279
Twitter for Android     0.257240
Twitter Web App         0.241223
Twitter for iPad        0.035431
Instagram               0.026209
                          ...   
Healthcare Aggregate    0.000162
SPTK: PutnamDV          0.000162
Primal for Twitter      0.000162
crowdeath_ebooks        0.000162
Talon (Plus)            0.000162
Name: source, Length: 147, dtype: float64

In [127]:
.333279 + .257240

0.590519

In [135]:
tweets.groupby(['geo'])['commodity_tweet'].mean().sort_values()

geo
{'type': 'Point', 'coordinates': [33.94283183, -118.33342164]}     -1.000000
{'type': 'Point', 'coordinates': [43.5154, -79.62653]}             -1.000000
{'type': 'Point', 'coordinates': [34.0564, -118.2445]}             -1.000000
{'type': 'Point', 'coordinates': [6.45306, 3.39583]}               -1.000000
{'type': 'Point', 'coordinates': [44.95905, -92.95983]}            -1.000000
{'type': 'Point', 'coordinates': [33.76443, -84.35005]}            -1.000000
{'type': 'Point', 'coordinates': [28.56006986, -81.82137357]}      -1.000000
{'type': 'Point', 'coordinates': [32.8695093, -96.6797152]}        -1.000000
{'type': 'Point', 'coordinates': [45.4201767, -122.78649566]}      -1.000000
{'type': 'Point', 'coordinates': [46.50105096, -84.34907913]}      -1.000000
{'type': 'Point', 'coordinates': [44.96629517, -93.07325315]}      -1.000000
{'type': 'Point', 'coordinates': [42.1296, -87.8311]}              -1.000000
{'type': 'Point', 'coordinates': [41.883222, -87.632496]}           1.00