### setup

In [1]:
# Load the lab_black IPython extension for this kernel session
# (presumably so executed cells are auto-formatted with Black — TODO confirm).
%load_ext lab_black

In [2]:
import datetime as dt
from urllib.request import urlopen

import altair as alt
import pageviewapi
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Widen pandas' display limits so tables are shown without truncated
# columns, rows or cell text.
display_opts = pd.options.display
display_opts.max_columns = 100
display_opts.max_rows = 1000
display_opts.max_colwidth = None

In [4]:
# Date window as compact YYYYMMDD strings: a fixed start and the current
# local date at the time the cell runs.
begin = "20210101"
today = f"{dt.datetime.today():%Y%m%d}"

---

### get website content


In [5]:
data_list = []

# Download the CSIS "Russian PMCs" page. The response is read inside a
# context manager so the HTTP connection is closed as soon as the body has
# been fetched, and the timeout keeps a stalled server from hanging the
# notebook indefinitely. `html` holds the raw page bytes for the parser.
with urlopen("https://russianpmcs.csis.org/", timeout=30) as response:
    html = response.read()

In [6]:
# Parse the downloaded page with Python's built-in HTML parser.
soup = BeautifulSoup(markup=html, features="html.parser")

In [7]:
# Start the list of place names with the text of every <h2> case-header title.
nations = soup.find_all(
    "h2", class_="case__header-title text--barlow-sc text--semibold"
)
countries = [heading.text for heading in nations]

In [8]:
# Add the place names found in the <h3> overview titles.
# Note: this extends the existing list, so re-running the cell on its own
# appends the same names a second time.
nations = soup.find_all("h3", class_="case__overview-title text--barlow-sc")
countries.extend(heading.text for heading in nations)

In [9]:
# Start the list of arrival dates with the text of every <p> case-header
# arrival line.
dates = soup.find_all("p", class_="case__header-arrival text--barlow")
year = [tag.text for tag in dates]

In [10]:
# Add the text of every overview "meta" <div> to the arrival dates.
# Note: this extends the existing list, so re-running the cell on its own
# appends the same values a second time.
dates = soup.find_all("div", class_="case__overview-meta text--barlow-sc")
year.extend(tag.text for tag in dates)

In [11]:
# One row per scraped place; "manual" = "no" marks rows that came from the
# website rather than from the hand-entered list.
scraped_columns = {"places": countries, "year": year}
df = pd.DataFrame(scraped_columns)
df["manual"] = "no"

In [12]:
# Remove the literal "Date of Arrival: " label from the scraped text.
# The values remain strings here, whereas the hand-entered years added later
# are integers, so the combined column ends up holding both types.
df["year"] = df["year"].str.replace("Date of Arrival: ", "", regex=False)

In [13]:
# Hand-entered (place, year) pairs that supplement the scraped table. Keeping
# each pair on one line guards against the two lists drifting out of step.
manual_entries = [
    ("Senegal", 2014),
    ("Mali", 2019),
    ("Nigeria", 2019),
    ("Chad", 2017),
    ("South Sudan", 2017),
    ("Egypt", 2017),
    ("Republic of the Congo", 2019),
    ("Democratic Republic of the Congo", 2014),
    ("Burundi", 2018),
    ("Botswana", 2018),
    ("South Africa", 2018),
    ("Eswatini", 2015),
    ("Iraq", 2017),
    ("Yemen", 2018),
    ("Afghanistan", 2018),
    ("Tajikistan", 2017),
    ("Equatorial Guinea", 2020),
    ("Comoros", 2018),
    ("Indonesia", 2016),
    ("Sri Lanka", 2014),
]
missing_places = [place for place, _ in manual_entries]
missing_years = [arrival for _, arrival in manual_entries]

In [14]:
# Table of the hand-entered places, flagged with "manual" = "yes" so they can
# be told apart from the scraped rows.
manual_columns = {"places": missing_places, "year": missing_years}
missing_df = pd.DataFrame(manual_columns)
missing_df["manual"] = "yes"

In [15]:
# Stack the hand-entered rows beneath the scraped ones. Each frame keeps its
# own row labels here, so labels that exist in both frames appear twice.
frames = [df, missing_df]
df = pd.concat(frames)

In [18]:
# Discard the scraped rows labelled 3 and 4.
#
# The concatenated frame still carries each source frame's own 0..n labels,
# so labels 3 and 4 also belong to two hand-entered rows ("Chad" and
# "South Sudan"). A plain drop(index=[3, 4]) silently removed those as well,
# so the drop is restricted to rows that are not hand-entered.
scraped_labels_to_drop = [3, 4]
has_dropped_label = df.index.isin(scraped_labels_to_drop)
is_manual_row = df["manual"].eq("yes")

# The hand-entered rows that share those labels are re-appended at the end
# instead of being left in place. Every other row therefore keeps exactly the
# position it had before this change, and the hard-coded row labels used when
# country codes are filled in further down still point at the same places.
df = pd.concat(
    [df[~has_dropped_label], df[has_dropped_label & is_manual_row]],
    ignore_index=True,
)

In [19]:
# Display the combined table of scraped and hand-entered places.
df

Unnamed: 0,places,year,manual
0,Ukraine,2014,no
1,Syria,2015,no
2,Libya,2015,no
3,Sudan,2017,no
4,Central African Republic,2018,no
5,Madagascar,2018,no
6,Mozambique,2019,no
7,Venezuela,2017,no
8,Senegal,2014,yes
9,Mali,2019,yes


In [28]:
# Load the local population file, skipping its first three lines
# (presumably a metadata preamble above the real header — TODO confirm).
pop = pd.read_csv("data/raw/worldpop.csv", skiprows=3)

In [36]:
# Keep only the identifying columns and the 2020 value.
pop_slim = pop.loc[:, ["Country Name", "Country Code", "Indicator Name", "2020"]]

In [37]:
# Short column names, assigned by position to the four selected columns.
pop_slim.columns = "name code indicator pop".split()

In [38]:
# Keep the total-population rows only, then rename the name column to
# "places" (the key used by the places table) and drop the indicator label.
pop_slim = (
    pop_slim.loc[lambda frame: frame["indicator"] == "Population, total"]
    .rename(columns={"name": "places"})
    .drop(columns=["indicator"])
)

In [41]:
# Attach country code and population by exact place name; places without a
# matching name keep their row and get missing values.
df = df.merge(right=pop_slim, how="left", on="places")

In [42]:
# Places whose name found no match in the population table.
df.loc[df["pop"].isna()]

Unnamed: 0,places,year,manual,code,pop
1,Syria,2015,no,,
4,Central African Republic,2018,no,,
7,Venezuela,2017,no,,
11,Egypt,2017,yes,,
12,Republic of the Congo,2019,yes,,
13,Democratic Republic of the Congo,2014,yes,,
19,Yemen,2018,yes,,


In [44]:
# Three-letter codes of the places that failed the name-based match.
missing_pop = "SYR CAF VEN EGY COG COD YEM".split()

In [45]:
# Look the unmatched places up by code to see how the population table
# spells their names.
pop_slim.loc[pop_slim["code"].isin(missing_pop)]

Unnamed: 0,places,code,pop
34,Central African Republic,CAF,4829764.0
43,"Congo, Dem. Rep.",COD,89561404.0
44,"Congo, Rep.",COG,5518092.0
67,"Egypt, Arab Rep.",EGY,102334403.0
227,Syrian Arab Republic,SYR,17500657.0
254,"Venezuela, RB",VEN,28435943.0
262,"Yemen, Rep.",YEM,29825968.0


In [59]:
# Fill in the country code by hand for the rows that failed the name match.
# Keys are row labels of `df` as it stands after the name-based merge; they
# depend on the exact row order produced by the earlier cells.
# NOTE(review): "Central African Republic" is spelled the same way in the
# population table yet did not match, so the scraped name presumably differs
# in some invisible way (e.g. whitespace) — TODO confirm.
manual_codes = {
    1: "SYR",
    4: "CAF",
    7: "VEN",
    11: "EGY",
    12: "COG",
    13: "COD",
    19: "YEM",
}
for row_label, iso3 in manual_codes.items():
    df.loc[row_label, "code"] = iso3

In [81]:
# Display the places table after the missing country codes were filled in.
df

Unnamed: 0,places,year,manual,code
0,Ukraine,2014,no,UKR
1,Syria,2015,no,SYR
2,Libya,2015,no,LBY
3,Sudan,2017,no,SDN
4,Central African Republic,2018,no,CAF
5,Madagascar,2018,no,MDG
6,Mozambique,2019,no,MOZ
7,Venezuela,2017,no,VEN
8,Senegal,2014,yes,SEN
9,Mali,2019,yes,MLI


In [66]:
# Re-attach population, this time keyed on the completed country codes.
#
# `df` still carries the partly empty "pop" column from the earlier
# name-based merge. Left in place, it collides with pop_slim's "pop" and the
# result gets "pop_x"/"pop_y" instead of a single "pop" column, so it is
# dropped first. errors="ignore" keeps the cell re-runnable if the column is
# already gone. `df` itself is not modified.
# Both frames also have "places": the merge suffixes them, and the population
# table's spelling ("places_y") is discarded.
df_pop = (
    df.drop(columns="pop", errors="ignore")
    .merge(pop_slim, on="code", how="left")
    .drop(columns="places_y")
)

### bring in continents

In [83]:
# Country-to-continent lookup table, read from a gist pinned to one revision.
continents_url = "https://gist.githubusercontent.com/stevewithington/20a69c0b6d2ff846ea5d35e5fc47f26c/raw/13716ceb2f22b5643ce5e7039643c86a0e0c6da6/country-and-continent-codes-list-csv.csv"
continents = pd.read_csv(continents_url)

In [84]:
# Lower-case every column name.
continents = continents.rename(columns=str.lower)

In [109]:
# Preview the first rows of the continent lookup table.
continents.head()

Unnamed: 0,continent_name,continent_code,country_name,two_letter_country_code,three_letter_country_code,country_number
0,Asia,AS,"Afghanistan, Islamic Republic of",AF,AFG,4.0
1,Europe,EU,"Albania, Republic of",AL,ALB,8.0
2,Antarctica,AN,Antarctica (the territory South of 60 deg S),AQ,ATA,10.0
3,Africa,AF,"Algeria, People's Democratic Republic of",DZ,DZA,12.0
4,Oceania,OC,American Samoa,AS,ASM,16.0


In [110]:
# Reduce the lookup to continent name plus three-letter code, with the code
# column renamed to "code" so it can serve as the merge key.
continents_slim = continents.rename(
    columns={"three_letter_country_code": "code"}
)[["continent_name", "code"]]

In [113]:
# Add the continent name for each place via its country code.
# NOTE(review): a left merge repeats a place once per matching lookup row, so
# any code listed more than once in the continent table would duplicate that
# place — TODO confirm the codes used here are unique in the lookup.
df_pop_continent = df_pop.merge(right=continents_slim, how="left", on="code")

In [114]:
# Write the finished table to disk; the row index is written as the first
# (unnamed) column.
df_pop_continent.to_csv("data/processed/wagner_places.csv", index=True)

### count by year for bubbles

In [90]:
# Number of places per arrival year, as a two-column table ("year", "count").
#
# Scraped years are strings while the hand-entered ones are integers, so
# grouping on the raw column treats "2014" and 2014 as different keys and
# yields two partial counts for the same year. Normalise to stripped text
# first so each year forms exactly one group.
year_labels = df_pop["year"].astype(str).str.strip()
yearcount = df_pop.groupby(year_labels).size().reset_index(name="count")

In [91]:
# Constant vertical position so every bubble sits on one horizontal line.
yearcount.loc[:, "y"] = 1

In [107]:
# Store the year as text so it is charted as a discrete label.
yearcount = yearcount.astype({"year": str})

In [108]:
# One bubble per year along a single row; bubble area encodes the number of
# places recorded for that year.
(
    alt.Chart(yearcount)
    .mark_circle(opacity=0.8, fill="#00d4d8")
    .encode(
        x=alt.X("year:O", axis=alt.Axis(labelAngle=0, title="", tickSize=0)),
        y=alt.Y(
            "y",
            scale=alt.Scale(zero=False),
            axis=alt.Axis(tickCount=0, title=""),
        ),
        size=alt.Size("count"),
    )
    .properties(width=400, height=50)
)