In [None]:
import pandas as pd
import snscrape.modules.twitter as sntwitter
import itertools
import time
import re

In [None]:
# Python Dictionary of Countries mapped to their Continents.
# Key order is kept as-is because the scraping loop iterates over this dictionary.
countriesDict = {
    "India": "Asia",
    "Japan": "Asia",
    "South Korea": "Asia",
    "Israel": "Asia",
    "United States": "North America",
    "Trinidad and Tobago": "North America",
    "El Salvador": "North America",
    "Mexico": "North America",
    "Canada": "North America",
    "Brazil": "South America",
    "Argentina": "South America",  # was misspelled "South Amercia"
    "Peru": "South America",
    "Zimbabwe": "Africa",
    "Kenya": "Africa",
    "Nigeria": "Africa",
    "Germany": "Europe",
    "United Kingdom": "Europe",
    "Italy": "Europe",
    "Spain": "Europe",
    "Switzerland": "Europe",
    "South Africa": "Africa",
    "France": "Europe",
    "Australia": "Australia",
    "New Zealand": "Australia",
    "Papua New Guinea": "Australia",
    "Egypt": "Africa",
    "Mauritius": "Africa",
}

# Maximum number of tweets fetched per keyword for each country
num_tweets_per_tag = 150

In [None]:
# Keywords
# Each entry is searched separately, so a repeated entry would fetch the same
# tweets twice; "#bitcoinprice" used to appear two times and is now listed once.
keywords = [
            #Mentions of Bitcoin
            "#bitcoin","#bitcoins","#btc","#BTC","#bitcoinnews","#bitcoinprice", "btc","bitcoins","bitcoin news",
            "#BTCUSD","#BTCUSDT","Bitcoin","Satoshi Nakamoto","$btc"

]

In [None]:
# This is the main method used to scrape Twitter data (tweets) using SNScrape
def scrape_data(countryName, countriesDict=countriesDict, withinRange = 1000, num_tweets_per_tag=num_tweets_per_tag):
    """Scrape tweets for every entry of ``keywords`` near one country and save them to CSV.

    One search is issued per keyword with the filters
    ``near:"<countryName>" within:<withinRange>km lang:en since:2021-09-28 until:2021-12-28``
    and at most ``num_tweets_per_tag`` results are kept per keyword. The combined
    result is written to ``Ethereum_<countryName>.csv`` and returned.

    Parameters
    ----------
    countryName : str
        Country used as the ``near:`` location, stored in the ``country`` column
        and used in the CSV file name.
    countriesDict : dict, optional
        Country -> continent mapping. Retained only so existing callers keep
        working; it is no longer used to build the query (see the note in the loop).
    withinRange : int, optional
        Value of the ``within:`` filter, in kilometres.
    num_tweets_per_tag : int, optional
        Maximum number of tweets kept per keyword; also the row count below
        which a "lower than expected" message is printed.

    Returns
    -------
    pandas.DataFrame
        Columns ``username, verified, followersCount, content, date, country,
        replyCount, retweetCount, likeCount, url``. Empty (with those columns)
        when no tweet was found.
    """
    start = time.time()
    output_columns = ["username","verified","followersCount", "content", "date", "country", "replyCount", "retweetCount", "likeCount", "url"]

    frames = []
    for word in keywords:
        # The location filter must be the country itself. The previous lookup
        # countriesDict[countryName] returned the continent, so every country
        # of a continent ended up running the very same search.
        query = f'{word} near:"{countryName}" within:{withinRange}km lang:en since:2021-09-28 until:2021-12-28'
        try:
            tweets = itertools.islice(sntwitter.TwitterSearchScraper(query).get_items(), num_tweets_per_tag)
            frame = pd.DataFrame(tweets)
        except Exception as e:
            # Best effort: a failing keyword must not abort the remaining ones,
            # but report which keyword failed and why instead of hiding it.
            print(f"An error occurred while scraping {word!r} for {countryName}: {e!r}\n")
            continue
        if not frame.empty:
            frames.append(frame)

    # Concatenate once; DataFrame.append inside the loop was quadratic and no
    # longer exists in pandas 2.0. ignore_index avoids repeated row labels.
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    if len(df) < num_tweets_per_tag:
        print(f"Number of tweets for {countryName} is lower than expected! df shape: {df.shape}")

    if df.empty:
        # Nothing scraped: there is no 'user' column to unpack, so return an
        # empty frame with the expected schema instead of raising KeyError.
        df_ = pd.DataFrame(columns=output_columns)
    else:
        # Flatten the nested user record into plain columns
        df['username'] =  df['user'].apply(lambda x: x['username'])
        df['verified'] = df['user'].apply(lambda x:x['verified'])
        df['followersCount'] = df['user'].apply(lambda x:x['followersCount'])
        df['country'] = countryName
        df_ = df[output_columns]

    df_.to_csv(f'Ethereum_{countryName}.csv', index = False)
    print(f"Shape of df for {countryName}: {df_.shape}, Time taken: {((time.time() - start)/60):.1f} mins")
    return df_

In [None]:
# Per-country results: maps a country name to the DataFrame scraped for it
countriesDf = dict()

In [None]:
# Scrape every country listed in countriesDict. Countries that already have an
# entry in countriesDf are skipped, so this cell can be re-run without
# repeating finished work.
# A few countries get an explicit search radius (km); all others use the default.
range_by_country = {'Singapore': 50, 'Mexico': 500, 'Canada': 100}
default_range = 500

for country in countriesDict:
    if country not in countriesDf:
        withinRange = range_by_country.get(country, default_range)
        countriesDf[country] = scrape_data(country, withinRange=withinRange)

Number of tweets for India is lower than expected! df shape: (20, 27)
Shape of df for India: (20, 10), Time taken: 0.8 mins
Number of tweets for Japan is lower than expected! df shape: (29, 27)
Shape of df for Japan: (29, 10), Time taken: 0.5 mins
Number of tweets for South Korea is lower than expected! df shape: (32, 27)
Shape of df for South Korea: (32, 10), Time taken: 0.5 mins
Number of tweets for Israel is lower than expected! df shape: (44, 27)
Shape of df for Israel: (44, 10), Time taken: 0.4 mins
Shape of df for United States: (451, 10), Time taken: 1.8 mins
Shape of df for Trinidad and Tobago: (468, 10), Time taken: 0.9 mins
Shape of df for El Salvador: (547, 10), Time taken: 1.0 mins
Shape of df for Mexico: (573, 10), Time taken: 0.9 mins
Number of tweets for Canada is lower than expected! df shape: (65, 27)
Shape of df for Canada: (65, 10), Time taken: 0.3 mins
Number of tweets for Brazil is lower than expected! df shape: (2, 27)
Shape of df for Brazil: (2, 10), Time taken: 

In [None]:
# Display the dictionary of per-country DataFrames collected so far
countriesDf

{'India':           username  verified  followersCount  \
 0         gnojgnoj     False            1193   
 1       Johnarvic5     False              86   
 2       jammaligad     False             781   
 3      CutiePieYay     False              79   
 4         gnojgnoj     False            1193   
 5      JRochelle08     False              36   
 6      JRochelle08     False              36   
 7        jhongdman     False              34   
 8   magdugokitkath     False             115   
 9    JayAr95122486     False             230   
 10  pdjasminetagui     False             655   
 0         gnojgnoj     False            1192   
 1     chaneloween3     False              31   
 2          atejo12     False              72   
 3     Vetz83137424     False               2   
 4          atejo12     False              72   
 5          atejo12     False              72   
 6       TristanBal     False            6017   
 7         ivahrYaj     False             116   
 8   countr

In [None]:
# Report how many tweets were collected for each country
for name, frame in countriesDf.items():
    n_rows = frame.shape[0]
    print(f"{name}: {n_rows}")

India: 20
Japan: 29
South Korea: 32
Israel: 44
United States: 451
Trinidad and Tobago: 468
El Salvador: 547
Mexico: 573
Canada: 65
Brazil: 2
Argentina: 1050
Peru: 2
Zimbabwe: 6
Kenya: 18
Nigeria: 49
Germany: 192
United Kingdom: 278
Italy: 237
Spain: 265
Switzerland: 339
South Africa: 41
France: 397
Australia: 333
New Zealand: 328
Papua New Guinea: 328
Egypt: 49
Mauritius: 35


In [None]:
# Combine the per-country frames into a single DataFrame.
# One pd.concat call replaces the repeated DataFrame.append calls, which were
# quadratic and no longer exist in pandas 2.0. ignore_index gives unique row labels.
country_frames = list(countriesDf.values())
# pd.concat raises on an empty list, so fall back to an empty frame
df = pd.concat(country_frames, ignore_index=True) if country_frames else pd.DataFrame()

print(df.shape)

(6178, 10)


In [None]:
# Cleaning Data
df_indexes_v2 = []
user_dict = {}  # username -> row position of that user's most recent kept tweet

# Pull the columns out once instead of calling .iloc on every iteration
contents = df["content"].tolist()
usernames = df["username"].tolist()
dates = df["date"].tolist()

for i, tweet in enumerate(contents):

    # To remove tweets that have more hashtags than normal text
    word_list = tweet.split()
    num_tags = sum(word.startswith('#') for word in word_list)
    num_normal = len(word_list) - num_tags
    if num_tags > num_normal:
        continue

    # To choose only the latest tweet from a user to prevent multiple tweets from same user.
    # Compare the tweet dates explicitly: simply keeping the last row seen only
    # picks the latest tweet if the rows happen to be in chronological order.
    # On equal dates the later row wins, as before.
    user = usernames[i]
    previous = user_dict.get(user)
    if previous is None or dates[i] >= dates[previous]:
        user_dict[user] = i

df_indexes_v2.extend(user_dict.values())

# .copy() makes df_v2 an independent frame, so adding columns to it later does
# not raise SettingWithCopyWarning
df_v2 = df.iloc[df_indexes_v2].copy()
print(f'Shape of df after cleaning: {df_v2.shape}')

Shape of df after cleaning: (966, 10)


In [None]:
# Extract the hashtags of each tweet (without the leading '#') into a new column.
# assign() returns a new DataFrame, which avoids the SettingWithCopyWarning
# raised when a column is set directly on a frame obtained by slicing another one.
pattern = r'#(\w+)'
df_v2 = df_v2.assign(hashtags=df_v2['content'].apply(lambda x: re.findall(pattern, x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [None]:
# Shuffling tweets in version 2 of the dataframe, and saving to a CSV file
shuffle_seed = 42  # fixed seed so re-running the notebook yields the same row order
df_v2 = df_v2.drop_duplicates(subset='content')
# frac=1 samples every row, i.e. a full shuffle; the old index is discarded
df_v2 = df_v2.sample(frac=1, random_state=shuffle_seed).reset_index(drop=True)
print(df_v2.shape)
df_v2.to_csv("cz4034_scraped__ethereum_data.csv", encoding = "utf-8-sig", index=False)

(965, 11)


In [None]:
# Look up the tweet text stored under index label 3
df_v2.loc[3, 'content']

3    @Jihoz_Axie And also please don't send that et...
3    @Orion__Research @ethereum @Orion__Trading @Or...
3                                       @hugo_eth Done
3    Update, for the record \n#ETH https://t.co/RGE...
3                    What will be bigger? #ETH or #LTC
3    Meet our Founder: Stephen Garbett\n\nSteve is ...
3                         @PerrySaylorWEB3 Sol and eth
3                                    @hugo_eth Les go!
3    Soon to be released. The Cryptopose wish you a...
3    @CryptoTitanGo @morales_eth @0xPolygon Not enough
Name: content, dtype: object

In [None]:
# Show each distinct country present in the cleaned DataFrame
unique_countries = df_v2['country'].unique()
print(unique_countries)

['Israel' 'Argentina' 'Mexico' 'France' 'Switzerland'
 'Trinidad and Tobago' 'Canada' 'Peru' 'United States' 'Papua New Guinea'
 'Mauritius' 'El Salvador' 'United Kingdom' 'Australia' 'Spain'
 'South Africa' 'Germany' 'South Korea' 'Egypt' 'Japan' 'Italy']


In [None]:
# Count the distinct tweet texts per country and print the result
tweets_per_country = df_v2.groupby('country')['content'].nunique()
print(tweets_per_country)

country
Argentina              574
Australia                2
Canada                  22
Egypt                    3
France                 110
Germany                  6
Israel                  16
Italy                    3
Mauritius               10
Mexico                 157
Nigeria                  1
Papua New Guinea        19
Peru                     1
South Africa             2
South Korea              1
Spain                    5
Switzerland              4
Trinidad and Tobago      6
United Kingdom          22
United States            2
Name: content, dtype: int64


In [None]:
# Display the cleaned and shuffled DataFrame
df_v2

Unnamed: 0,username,verified,followersCount,content,date,country,replyCount,retweetCount,likeCount,url,hashtags
6,gnojgnoj,False,1193,@wirexapp smart contract + high gas fee = #Eth...,2021-10-28 08:16:34+00:00,Israel,0,0,0,https://twitter.com/gnojgnoj/status/1453636799...,[Ethereum]
1,Johnarvic5,False,86,@taegiveaway @Walker_eth Done https://t.co/n4J...,2021-12-23 13:03:22+00:00,Israel,0,0,0,https://twitter.com/Johnarvic5/status/14740026...,[]
2,jammaligad,False,781,Guys this plant cost 0.019 eth 💀💀💀 https://t.c...,2021-12-18 20:46:43+00:00,Israel,1,0,4,https://twitter.com/jammaligad/status/14723073...,[]
3,CutiePieYay,False,79,@Jihoz_Axie And also please don't send that et...,2021-12-17 16:58:13+00:00,Israel,0,0,0,https://twitter.com/CutiePieYay/status/1471887...,[]
7,JRochelle08,False,36,@Shibaholderx I'm going to flood this until yo...,2021-11-14 06:24:30+00:00,Israel,0,0,1,https://twitter.com/JRochelle08/status/1459769...,[]
...,...,...,...,...,...,...,...,...,...,...,...
30,Alpha_SMF,False,2062,Just buy BTC and ETH aye?!??,2021-12-04 13:32:02+00:00,Papua New Guinea,1,0,2,https://twitter.com/Alpha_SMF/status/146712453...,[]
50,emiliovisual,False,867,@XolosCrew @fabricadeartecu @iamDCinvestor @fu...,2021-11-29 14:36:24+00:00,Papua New Guinea,0,0,2,https://twitter.com/emiliovisual/status/146532...,[]
0,LittleDope85,False,1083,• “Ops..” 🤣 #BTC #Ethereum #Cardano #MATIC #XR...,2021-12-13 18:23:14+00:00,Egypt,0,1,1,https://twitter.com/LittleDope85/status/147045...,"[BTC, Ethereum, Cardano, MATIC, XRP, dogecoin,..."
1,giovanniljc,False,248,The best time to buy #Bitcoin is during the st...,2021-12-04 18:38:59+00:00,Egypt,1,0,1,https://twitter.com/giovanniljc/status/1467201...,"[Bitcoin, Ethereum, cryptocrash, Crypto, crypt..."
