In [41]:
from common_funcs import *
from pmaw import PushshiftAPI
import datetime as dt
import praw

# Data Acquisition
I will use PushshiftAPI via the Python package "pmaw" to scrape Reddit. I will scrape subreddits for multiple countries. I am choosing countries that spsan the range of pro-Ukraine sentiment. Some countries are very pro-Ukraine (US, most of Europe) while others are officially neutral (Israel, UAE), others lean pro-Russia (India), and then there's Russia itself. Ideally, I would scrape posts with high scores because high scores indicate a particular post resonates with people. However, if I want up-to-date score info, I need to use Reddit's API which is currently blocking the Russia subreddit. Reddit alleges the Russia subreddit contains significant misinformation and so it has imposed several restrictions which have the effect of making it harder to scrape. PushshiftAPI, however, is not blocking Russia but does not contain up-to-date scores. I have decided for the purposes of this project to prioritize posts regardless of score so I can capture data from the Russia subreddit.

## Instantiate a Reddit search vehicle

In [175]:
# Create a list of search terms. The API will search for comments that contain at least one term in the below
# list. Searches on PushshiftAPI are not case sensitive.

search_terms = 'Putin, Russia, Ukraine, Zelensky, Invasion, Kyiv, Kiev, Kharkiv, Mariupol, Lviv, NATO'
search_terms = search_terms.split(',')
search_terms = '|'.join(search_terms).replace(' ', '')
search_terms

'Putin|Russia|Ukraine|Zelensky|Invasion|Kyiv|Kiev|Kharkiv|Mariupol|Lviv|NATO'

In [179]:
# Set time bounds. I will collect posts between Feb 23 (one day before the invasion) 
# and March 30 (approximately one month).

after = int(dt.datetime(2022, 2, 23, 0, 0).timestamp())
before = int(dt.datetime(2022, 4, 2, 0, 0).timestamp())

In [191]:
def scrape_subreddit(subreddit, limit, api=PushshiftAPI(), restart=False):
    
    # Retrieve comments from subreddit
    comments = api.search_comments(q=search_terms, subreddit=subreddit, limit=limit, before=before, after=after,
                          safe_exit=True)

    print(f'Retrieved {len(comments)} comments from {subreddit.capitalize()} subreddit')

    comments_df = pd.DataFrame(comments)
    comments_df.to_csv(f'data/{subreddit}_comments.csv')

    subs = api.search_submissions(q=search_terms, subreddit=subreddit, limit=limit, before=before, after=after,
                              safe_exit=True)

    print(f'Retrieved {len(subs)} subs from {subreddit.capitalize()} subreddit')
    print('\n')

    subs_df = pd.DataFrame(subs)
    subs_df.to_csv(f'data/{subreddit}_subs.csv')  

In [183]:
scrape_subreddit('israel', 100000)

Retrieved 1099 comments from Reddit
Retrieved 156 subs from Reddit


In [184]:
scrape_subreddit('uae', 100000)

Retrieved 86 comments from Reddit
Retrieved 22 subs from Reddit


There doesn't seem to be a whole lot of posts from Israel and UAE so I will increase the number of countries and loop through them.

In [189]:
countries = 'india, pakistan, saudiarabia, switzerland, germany, spain, france, poland, polska, belarus, ukraine, russia'.split(', ')
countries

['india',
 'pakistan',
 'saudiarabia',
 'switzerland',
 'germany',
 'spain',
 'france',
 'poland',
 'polska',
 'belarus',
 'ukraine',
 'russia']

In [190]:
for country in countries:
    scrape_subreddit(country, 100000)

Retrieved 9945 comments from India subreddit
Retrieved 1100 subs from India subreddit
Retrieved 1956 comments from Pakistan subreddit
Retrieved 194 subs from Pakistan subreddit
Retrieved 382 comments from Saudiarabia subreddit
Retrieved 58 subs from Saudiarabia subreddit
Retrieved 1487 comments from Switzerland subreddit
Retrieved 114 subs from Switzerland subreddit
Retrieved 1972 comments from Germany subreddit
Retrieved 410 subs from Germany subreddit
Retrieved 120 comments from Spain subreddit
Retrieved 24 subs from Spain subreddit
Retrieved 6730 comments from France subreddit
Retrieved 745 subs from France subreddit
Retrieved 5907 comments from Poland subreddit
Retrieved 855 subs from Poland subreddit
Retrieved 3480 comments from Polska subreddit
Retrieved 401 subs from Polska subreddit
Retrieved 1941 comments from Belarus subreddit
Retrieved 357 subs from Belarus subreddit
Retrieved 100000 comments from Ukraine subreddit
Retrieved 42911 subs from Ukraine subreddit
Retrieved 1204 c

Since this was pretty easy so far, let's scrape some more countries. Let's start by looking at a list of countries that have provided aid to Ukraine of any kind according to [Wikipedia](https://en.wikipedia.org/wiki/List_of_foreign_aid_to_Ukraine_during_the_Russo-Ukrainian_War). We'll also look at subreddits for countries selected by GDP from each region of the world. We will also include data on whether a country abstained at the UN vote on the Ukraine conflict.

In [197]:
aid_countries_table = pd.read_html('https://en.wikipedia.org/wiki/List_of_foreign_aid_to_Ukraine_during_the_Russo-Ukrainian_War')

In [198]:
aid_countries_table[1].head()

Unnamed: 0,Country,Military aid,Financial aid(including non-earmarked for humanitarian purposes),Humanitarian aid(including earmarked funds)
0,Albania,Minister of Defense Niko Peleshi said on 18 Ma...,,
1,Argentina,,,The Argentine Government has sent the followin...
2,Australia,The Australian Government approved military ai...,,AU$30 million on 20 March 2022:[10] AU$10 mill...
3,Austria,Chancellor Karl Nehammer approved military aid...,,
4,Azerbaijan,,,President Ilham Aliyev approved aid for Ukrain...


In [199]:
# It's going to be too difficult to translate aid into a $USD amount so I will just divide countries into
# those that have given aid and those that haven't.

aid_countries = aid_countries_table[1].Country
aid_countries.head()

0       Albania
1     Argentina
2     Australia
3       Austria
4    Azerbaijan
Name: Country, dtype: object

In [408]:
aid_countries.to_pickle('data/aid_countries')

In [265]:
# Which countries abstained at the UN vote on the Ukraine conflict?
# https://www.tellerreport.com/news/2022-03-03-a-total-of-39-countries-oppose-and-abstain-from-the-un-general-assembly-s-resolution-condemning-russia.SyyQqhpxq.html

abstained = 'Algeria Angola Armenia Bangladesh Bolivia Brunji CentralAfrica China CongoRepublic Cuba ElSalvador EquatorGuinea India Iran Iraq Kazakhstan Kirgis Laos Madagascar Mali Mongolia Mozambique Namibia Pakistan Senegal SouthAfrica SouthSudan SriLanka Sudan Tajikistan Uganda Tanzania Vietnam Zimbabwe'.split()
abstained[:6]

['Algeria', 'Angola', 'Armenia', 'Bangladesh', 'Bolivia', 'Brunji']

In [200]:
# Get all countries in the world and their GDP data
all_countries_table = pd.read_html('https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)')

In [228]:
all_countries = all_countries_table[2]
all_countries.head()

Unnamed: 0_level_0,Country/Territory,Region,IMF[1],IMF[1],United Nations[12],United Nations[12],World Bank[13][14],World Bank[13][14]
Unnamed: 0_level_1,Country/Territory,Region,Estimate,Year,Estimate,Year,Estimate,Year
0,United States,Americas,22939580.0,2021,20893746.0,2020,20936600.0,2020
1,China,Asia,16862979.0,[n 2]2021,14722801.0,[n 3]2020,14722731.0,2020
2,Japan,Asia,5103110.0,2021,5057759.0,2020,4975415.0,2020
3,Germany,Europe,4230172.0,2021,3846414.0,2020,3806060.0,2020
4,United Kingdom,Europe,3108416.0,2021,2764198.0,2020,2707744.0,2020


In [235]:
# Select the 5 countries for each region of the world with the 5 highest GDPs because presumably higher GDPs
# mean a country's opinion matters more on the world stage in some sense.

top_GDP_countries = all_countries.sort_values(by=('IMF[1]', 'Estimate'), ascending=False).groupby(('Region', 'Region')).head(5)
top_GDP_countries

Unnamed: 0_level_0,Country/Territory,Region,IMF[1],IMF[1],United Nations[12],United Nations[12],World Bank[13][14],World Bank[13][14]
Unnamed: 0_level_1,Country/Territory,Region,Estimate,Year,Estimate,Year,Estimate,Year
0,United States,Americas,22939580.0,2021,20893746.0,2020,20936600.0,2020
1,China,Asia,16862979.0,[n 2]2021,14722801.0,[n 3]2020,14722731.0,2020
2,Japan,Asia,5103110.0,2021,5057759.0,2020,4975415.0,2020
3,Germany,Europe,4230172.0,2021,3846414.0,2020,3806060.0,2020
4,United Kingdom,Europe,3108416.0,2021,2764198.0,2020,2707744.0,2020
5,India,Asia,2946061.0,2021,2664749.0,2020,2622984.0,2020
6,France,Europe,2940428.0,2021,2630318.0,2020,2603004.0,2020
7,Italy,Europe,2120232.0,2021,1888709.0,2020,1886445.0,2020
8,Canada,Americas,2015983.0,2021,1644037.0,2020,1643408.0,2020
9,South Korea,Asia,1823852.0,2021,1637896.0,2020,1630525.0,2020


In [249]:
# Truncate table to make it more manageable. Remove multi-index column labels.

top_GDP_countries = top_GDP_countries.iloc[:, :3]
top_GDP_countries.columns = top_GDP_countries.columns.get_level_values(1)
top_GDP_countries.columns = ['Country', 'Region', 'GDP']
top_GDP_countries

Unnamed: 0,Country,Region,GDP
0,United States,Americas,22939580.0
1,China,Asia,16862979.0
2,Japan,Asia,5103110.0
3,Germany,Europe,4230172.0
4,United Kingdom,Europe,3108416.0
5,India,Asia,2946061.0
6,France,Europe,2940428.0
7,Italy,Europe,2120232.0
8,Canada,Americas,2015983.0
9,South Korea,Asia,1823852.0


In [261]:
top_GDP_countries['aid'] = top_GDP_countries['Country'].map(lambda x: True if x in list(aid_countries) else False)

In [263]:
top_GDP_countries['abstained'] = top_GDP_countries.Country.map(lambda x: True if x.replace(' ', '') in list(abstained) else False)

It appears some countries that abstained do not rank highly on GDP. I want to include these in the subreddit search though so I will select 10 countries that abstained that have the highest GDPs.

In [289]:
all_countries_df = all_countries.sort_values(by=('IMF[1]', 'Estimate'), ascending=False)
all_countries_df.columns = all_countries_df.columns.get_level_values(1)
all_countries_df = all_countries_df.iloc[:, :3]
all_countries_df.columns = ['Country', 'Region', 'GDP']
all_countries_df

Unnamed: 0,Country,Region,GDP
0,United States,Americas,22939580.0
1,China,Asia,16862979.0
2,Japan,Asia,5103110.0
3,Germany,Europe,4230172.0
4,United Kingdom,Europe,3108416.0
...,...,...,...
200,Sint Maarten,Americas,
203,American Samoa,Oceania,
208,Cook Islands,Oceania,
209,Anguilla,Americas,


In [272]:
abstained_df = pd.DataFrame(abstained, columns=['Country'])
abstained_df.head()

Unnamed: 0,Country
0,Algeria
1,Angola
2,Armenia
3,Bangladesh
4,Bolivia


In [407]:
all_countries_df['Country'] = all_countries_df.Country.str.replace(' ', '')
all_countries_df.to_pickle('data/all_countries_GDP')
all_countries_df.head()

Unnamed: 0,Country,Region,GDP
0,UnitedStates,Americas,22939580.0
1,China,Asia,16862979.0
2,Japan,Asia,5103110.0
3,Germany,Europe,4230172.0
4,UnitedKingdom,Europe,3108416.0


In [298]:
abstained__GDP_df = abstained_df.merge(all_countries_df, on='Country').sort_values('GDP', ascending=False)
abstained__GDP_df['abstained'] = True
abstained__GDP_df

Unnamed: 0,Country,Region,GDP,abstained
5,China,Asia,16862979.0,True
8,India,Asia,2946061.0,True
9,Iran,Asia,1081383.0,True
20,SouthAfrica,Africa,435145.0,True
27,Vietnam,Asia,368002.0,True
3,Bangladesh,Asia,355689.0,True
18,Pakistan,Asia,261726.0,True
10,Iraq,Asia,201472.0,True
11,Kazakhstan,Asia,194024.0,True
0,Algeria,Africa,163812.0,True


In [305]:
#Now select 5 countries from each region like before. See if they gave aid.

abstained_top_GDP_df = abstained__GDP_df.groupby('Region').head(5)
abstained_top_GDP_df['aid'] = abstained_top_GDP_df.Country.map(lambda x: True if x.replace(' ', '') in list(aid_countries) else False)
abstained_top_GDP_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abstained_top_GDP_df['aid'] = abstained_top_GDP_df.Country.map(lambda x: True if x.replace(' ', '') in list(aid_countries) else False)


Unnamed: 0,Country,Region,GDP,abstained,aid
5,China,Asia,16862979.0,True,True
8,India,Asia,2946061.0,True,True
9,Iran,Asia,1081383.0,True,False
20,SouthAfrica,Africa,435145.0,True,False
27,Vietnam,Asia,368002.0,True,False
3,Bangladesh,Asia,355689.0,True,False
0,Algeria,Africa,163812.0,True,False
1,Angola,Africa,70339.0,True,False
26,Tanzania,Africa,69238.0,True,False
25,Uganda,Africa,43243.0,True,False


In [314]:
# Now append to the main list. Remove spacing in country name because that will be the subreddit name.
# Change the name of US to United States of America because that is the name of the subreddit.

subreddit_countries_df = pd.concat([top_GDP_countries, abstained_top_GDP_df])
subreddit_countries_df.drop_duplicates(inplace=True)
subreddit_countries_df['Country'] = subreddit_countries_df.Country.str.replace(' ', '')
subreddit_countries_df.loc[0, 'Country'] = 'UnitedStatesOfAmerica'
subreddit_countries_df

Unnamed: 0,Country,Region,GDP,aid,abstained
0,UnitedStatesOfAmerica,Americas,22939580.0,True,False
1,China,Asia,16862979.0,True,True
2,Japan,Asia,5103110.0,True,False
3,Germany,Europe,4230172.0,True,False
4,UnitedKingdom,Europe,3108416.0,True,False
5,India,Asia,2946061.0,True,True
6,France,Europe,2940428.0,True,False
7,Italy,Europe,2120232.0,True,False
8,Canada,Americas,2015983.0,True,False
9,SouthKorea,Asia,1823852.0,True,False


In [318]:
# create list of countries to scrape. Check first if already scraped. Then scrape.

for country in subreddit_countries_df.Country:
    if country.lower() not in countries:
        scrape_subreddit(country, 100000)

Retrieved 19 comments from Unitedstatesofamerica subreddit
Retrieved 20 subs from Unitedstatesofamerica subreddit


Retrieved 5769 comments from China subreddit
Retrieved 523 subs from China subreddit


Retrieved 1050 comments from Japan subreddit
Retrieved 128 subs from Japan subreddit


Retrieved 11305 comments from Unitedkingdom subreddit
Retrieved 947 subs from Unitedkingdom subreddit


Retrieved 13248 comments from Italy subreddit
Retrieved 158 subs from Italy subreddit


Retrieved 16273 comments from Canada subreddit
Retrieved 773 subs from Canada subreddit


Retrieved 10 comments from Southkorea subreddit
Retrieved 2 subs from Southkorea subreddit


Retrieved 46 comments from Brazil subreddit
Retrieved 17 subs from Brazil subreddit


Retrieved 3217 comments from Australia subreddit
Retrieved 254 subs from Australia subreddit


Retrieved 1438 comments from Mexico subreddit
Retrieved 83 subs from Mexico subreddit


Retrieved 2619 comments from Indonesia subreddit
Retrieved 87 subs

The last things to do are add a democracy score for each country, then combine all data from each csv into one df and export that as a pickle to pick it up in the next notebook.

In [330]:
scraped_countries = pd.concat([pd.Series(countries), subreddit_countries_df.Country])
scraped_countries = scraped_countries.str.lower().drop_duplicates().append(pd.Series(['israel', 'uae']))

In [395]:
reddit_df = pd.DataFrame()
for country in scraped_countries:
    
    country_df = pd.read_csv(f'data/{country}_comments.csv', index_col=0)
    country_df['country'] = country
    country_df['comments_or_subs'] = 'comments'
    reddit_df = reddit_df.append(country_df)
    
    country_df = pd.read_csv(f'data/{country}_subs.csv', index_col=0)
    country_df['comments_or_subs'] = 'subs'
    country_df['country'] = country
    reddit_df = reddit_df.append(country_df)

display(reddit_df)

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,all_awardings,archived,associated_award,author,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,author_flair_text_color,...,crosspost_parent,crosspost_parent_list,poll_data,call_to_action,category,event_end,event_is_live,event_start,collections,discussion_type
0,[],False,,remote79,,,[],,,,...,,,,,,,,,,
1,[],False,,Falls_stuff,,,[],,,,...,,,,,,,,,,
2,[],False,,nram88,#d3d6da,,"[{'e': 'text', 't': 'poor customer '}]",817b9b08-b34b-11e6-ab9c-0e53646228d0,poor customer,dark,...,,,,,,,,,,
3,[],False,,Eth1cs_Gr4dient,,,[],,,,...,,,,,,,,,,
4,[],False,,sleepnaught,,,[],,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17,[],,,RamblingMan2,,,[],,,,...,,,,,,,,,,
18,[],,,yourfairymonzter,,,[],,,,...,,,,,,,,,,
19,[],,,inthelu2,,,[],,,,...,,,,,,,,,,
20,[],,,drsheikh-in,,,[],,,,...,,,,,,,,,,


In [385]:
subreddit_countries_df.Country = subreddit_countries_df.Country.str.lower().str.replace(' ', '')

In [403]:
merged_reddit_df = reddit_df.merge(subreddit_countries_df, how = 'left', left_on='country', right_on='Country')
merged_reddit_df.drop('Country', axis=1, inplace=True)

Now we get the democracy score from Wikipedia.

In [354]:
democracy_score_table = pd.read_html('https://en.wikipedia.org/wiki/Democracy_Index')
democracy_score_table = democracy_score_table[6].sort_values('2021', ascending=False)

In [360]:
democracy_score_table.Country = democracy_score_table.Country.str.replace(' ', '').str.lower()
democracy_score_table.head()

Unnamed: 0,Region,2021 rank,Country,Regime type,2021,2020,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010,2008,2006
16,Western Europe,1,norway,Full democracy,9.75,9.81,9.87,9.87,9.87,9.93,9.93,9.93,9.93,9.93,9.8,9.8,9.68,9.55
91,Asia and Australasia,2,newzealand,Full democracy,9.37,9.25,9.26,9.26,9.26,9.26,9.26,9.26,9.26,9.26,9.26,9.26,9.19,9.01
6,Western Europe,3,finland,Full democracy,9.27,9.2,9.25,9.14,9.03,9.03,9.03,9.03,9.03,9.06,9.06,9.19,9.25,9.25
19,Western Europe,4,sweden,Full democracy,9.26,9.26,9.39,9.39,9.39,9.39,9.45,9.73,9.73,9.73,9.5,9.5,9.88,9.88
10,Western Europe,5,iceland,Full democracy,9.18,9.37,9.58,9.58,9.58,9.5,9.58,9.58,9.65,9.65,9.65,9.65,9.65,9.71


In [404]:
# I will use the 2021 democracy scores
reddit_df_final = merged_reddit_df.merge(democracy_score_table[['Country', '2021']], how='left', left_on='country', right_on='Country')
reddit_df_final.rename(columns = {'2021':'democracy_score'}, inplace = True)

In [406]:
reddit_df_final.to_pickle('data/reddit_df_final')

Now over to the next notebook to do some cleaning.