# Update Societies

### Objective:
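Update the societies that are already in the database with their Correlates of War (COW) country codes, and add COW societies that are not yet in the database.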

# Do not rerun
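This notebook was run on April 25, 2020. Re-running it would execute the `UPDATE` and `INSERT` statements against the database a second time.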

In [1]:
# Import packages
import requests
import pandas as pd

In [3]:
# Import classes and functions needed for this analysis from config module
# These are only available on my computer
from config import dbaccess, validator, society, error_dict_to_string

# Create an instance of the DBAccess class for running queries.
# Used below through db.run_query(...) and db.build_insert_query(...).
db = dbaccess.DBAccess()

# Create an instance of the Validator class for validating data prior to inserting/updating database.
# Used below through val.integer_out_of_bounds(...) and passed to society.Society.validate_societies(...).
val = validator.Validator()


## Data Gathering
### Colonization Data

In [4]:
# Load the colonization data and preview five randomly chosen rows
colonial_data_file = 'colonial_data.csv'
icow = pd.read_csv(colonial_data_file)
icow.sample(5)

Unnamed: 0,State,Name,ColRuler,IndFrom,IndDate,IndViol,IndType,SecFrom,SecDate,SecViol,Into,IntoDate,COWsys,GWsys,Notes
78,346,Bosnia and Herzegovina,640,345,199203,1,3,345,199203,1,-9,-9,199204,199204,Acquired by Austria from Turkey in 1878 Treaty...
44,230,Spain,-9,-9,147901,1,1,-9,-9,-9,-9,-9,181601,181601,-9
178,712,Mongolia,710,710,192108,1,3,-9,-9,-9,-9,-9,192103,192103,-9
54,271,Wuerttemburg,-9,-9,180601,1,4,-9,-9,-9,255,187012,181601,181601,Formed by Napoleonic partition of Holy Roman E...
114,451,Sierra Leone,200,200,196104,0,2,-9,-9,-9,-9,-9,196104,196104,-9


### Country Codes
Read in and de-duplicate the country codes used by The Correlates of War Project. Rename columns.

In [5]:
# Load the COW country codes, drop fully duplicated rows, and rename the
# columns in a single chained expression
cow_column_names = {'StateAbb': 'abbreviation', 'CCode': 'cow_code', 'StateNme': 'country_name'}
country_codes = (
    pd.read_csv('cow_country_codes.csv')
    .drop_duplicates()
    .rename(columns=cow_column_names)
)
country_codes.sample(5)

Unnamed: 0,abbreviation,cow_code,country_name
56,BAV,245,Bavaria
213,BNG,771,Bangladesh
108,UKR,369,Ukraine
32,BRA,140,Brazil
112,AZE,373,Azerbaijan


Add "Federal Republic of Central America" with `cow_code` of 89. This value appears in the `IndFrom` column of the colonization data for Guatemala, Honduras, El Salvador, Nicaragua, and Costa Rica, but isn't in the `country_codes` dataset.

In [6]:
# Row for the Federal Republic of Central America, which is absent from country_codes
frca = {'abbreviation': 'FRC',
        'cow_code': 89,
        'country_name': 'Federal Republic of Central America'}

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat with a one-row DataFrame is the supported equivalent.
# The guard keeps this cell idempotent: re-running it does not add a second copy of the row.
if not country_codes['cow_code'].eq(frca['cow_code']).any():
    country_codes = pd.concat([country_codes, pd.DataFrame([frca])], ignore_index=True)

#### Match COW Countries with `society` Table
Many countries already exist in the `society` database table, based on United Nations data. Match COW Project countries with those in the database and execute an UPDATE query to set their `cow_code` field to the corresponding value in the `country_codes` dataframe. Add countries in the COW data that are not in the database to `society`.

In [7]:
# Fetch the id and common name of every society currently in the database
societies_sql = 'SELECT society_id, common_name FROM society'
query = db.run_query(societies_sql)
societies = pd.DataFrame.from_dict(query['data'])
societies.sample(5)

Unnamed: 0,society_id,common_name
94,95,Guyana
118,119,Laos
40,41,Cayman Islands
84,85,Gibraltar
47,48,Colombia


Find entries in `country_codes` that are not in the `societies` dataframe (i.e., not in the `society` database table).

In [9]:
# Build the "not in the database" filter once, then show its shape and its rows
known_common_names = list(societies['common_name'].unique())
unmatched = country_codes[~country_codes['country_name'].isin(known_common_names)]
print(unmatched.shape)
unmatched

(41, 3)


Unnamed: 0,abbreviation,cow_code,country_name
0,USA,2,United States of America
2,BHM,31,Bahamas
11,SLU,56,St. Lucia
12,SVG,57,St. Vincent and the Grenadines
13,AAB,58,Antigua & Barbuda
14,SKN,60,St. Kitts and Nevis
37,NTH,210,Netherlands
47,HAN,240,Hanover
48,BAV,245,Bavaria
50,GFR,260,German Federal Republic


Fifteen of these 41 unmatched countries are already in `societies`, but `country_name` in `country_codes` is different from `common_name` in `societies`; the remaining 26 are not in the database. Update `country_name` in `country_codes` dataframe to match `common_name`. Create two dataframes, one for countries already in the database that need to be updated with COW country codes and one for countries that need to be added to the database.

In [14]:
# COW country_name -> common_name spelling used in the society table
update_cow = {
    'United States of America': 'USA',
    'Bahamas': 'The Bahamas',
    'St. Lucia': 'Saint Lucia',
    'St. Vincent and the Grenadines': 'Saint Vincent and the Grenadines',
    'Antigua & Barbuda': 'Antigua and Barbuda',
    'St. Kitts and Nevis': 'Saint Kitts and Nevis',
    'Netherlands': 'The Netherlands',
    'Czech Republic': 'Czechia',
    'Cape Verde': 'Cabo Verde',
    'Sao Tome and Principe': 'São Tomé and Príncipe',
    'Ivory Coast': 'Côte d’Ivoire',
    'Democratic Republic of the Congo': 'DRC',
    'Swaziland': 'Eswatini',
    'East Timor': 'Timor-Leste',
    'Federated States of Micronesia': 'Micronesia'
}
# Rewrite only the country_name column; names without a mapping are left untouched
country_codes['country_name'] = country_codes['country_name'].replace(update_cow)

# --- Split country_codes into two dataframes ---
# 1) Societies already in the database: the inner merge keeps only the names
#    present in both country_codes and societies, and brings society_id along.
existing_societies = pd.merge(
    country_codes,
    societies,
    how='inner',
    left_on='country_name',
    right_on='common_name',
)

# 2) Societies still to be added: every country_codes row whose name did not
#    survive the merge. Copy so later edits do not touch country_codes.
matched_names = list(existing_societies['country_name'].unique())
is_matched = country_codes['country_name'].isin(matched_names)
new_societies = country_codes[~is_matched].copy()

In [15]:
# Row counts for each group (26 new and 192 existing when this was run)
print('New:', len(new_societies))
print('Existing:', len(existing_societies))

New: 26
Existing: 192


#### Update `society` Table
`1)` Update `society` table in database: Set `cow_code` for records in `existing_societies`.

`2)` Create necessary information for records in `new_societies` and add to `society` table.

##### Update Existing Societies with COW Country Code

In [17]:
# Count the rows whose cow_code the validator flags as outside the 0-999 range
out_of_bounds = val.integer_out_of_bounds(existing_societies['cow_code'], 0, 999)
print('Number of errors:', existing_societies[out_of_bounds].shape[0])

Number of errors: 0


In [18]:
# Build an UPDATE statement for each row, then execute them
def build_update_statement(row):
    """Return the SQL UPDATE statement that sets cow_code for one society.

    row must support item access for 'cow_code' and 'society_id'
    (for example a DataFrame row); both values are converted with str().
    """
    cow_code = str(row['cow_code'])
    society_id = str(row['society_id'])
    return f'UPDATE society SET cow_code = {cow_code} WHERE society_id = {society_id}'

# Attach one UPDATE statement to every row of existing_societies
existing_societies['update_statement'] = existing_societies.apply(build_update_statement, axis=1)

# Run the statements one at a time, printing a running position and
# summing the 'rows' value reported by each query result
print('Attempting update of', existing_societies.shape[0], 'rows')
records_updated = 0
for position, statement in enumerate(existing_societies['update_statement'], start=1):
    result = db.run_query(statement)
    records_updated += result['rows']
    print(position, end=' ')

print('\n', records_updated, 'records updated')

#print( 'This has already been run and 192 records were updated.' )

Attempting update of 192 rows
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 
 192 records updated


In [19]:
# Confirmation check: fetch five randomly ordered society rows and display them
random_rows_sql = 'SELECT * FROM society ORDER BY RAND() LIMIT 5'
query = db.run_query(random_rows_sql)
peek = pd.DataFrame.from_dict(query['data'])
peek.sample(5)

Unnamed: 0,society_id,common_name,official_name,capital,society_type_id,un_m49,iso_alpha3,un_region,cow_code,still_exists
2,2,Albania,Republic of Albania,Tirana,1,8,ALB,39,339,0
4,72,Falkland Islands,Falkland Islands,Stanley,1,238,FLK,5,0,0
3,240,Vanuatu,Republic of Vanuatu,Port Vila,1,548,VUT,54,935,0
1,42,Central African Republic,Central African Republic,Bangui,1,140,CAF,17,482,0
0,126,Lithuania,Republic of Lithuania,Vilnius,1,440,LTU,154,368,0


##### Add New Societies
Many of these societies had multiple different names and constitutional types (such as duchies, kingdoms, etc). For now, add to the database in barebones format with common name, official name (duplicated from common name), society type ID of 99 ("Uncategorized"), and COW Code. These will be updated later.

In [24]:
# Drop abbreviation column and rename country_name to common_name.
# Rebinding the chained result (instead of inplace=True) together with
# errors='ignore' makes this cell safe to re-run: a second run no longer
# raises KeyError because 'abbreviation' has already been removed.
new_societies = (
    new_societies
    .drop(columns=['abbreviation'], errors='ignore')
    .rename(columns={'country_name': 'common_name'})
)

# Add official_name as duplicate of common_name, and set society_type_id to 99
new_societies['official_name'] = new_societies['common_name']
new_societies['society_type_id'] = 99

new_societies.sample(5)

Unnamed: 0,cow_code,common_name,official_name,society_type_id
50,260,German Federal Republic,German Federal Republic,99
66,327,Papal States,Papal States,99
48,245,Bavaria,Bavaria,99
217,89,Federal Republic of Central America,Federal Republic of Central America,99
47,240,Hanover,Hanover,99


In [25]:
# Validate new_societies before insertion and print the validator's report as text
validation_errors = society.Society.validate_societies(new_societies, db, val)
print(error_dict_to_string(validation_errors))

No errors


In [26]:
# Insert the new societies; print the expected row count first so it can be
# compared with the count reported back by the query
print('Expecting to insert:', new_societies.shape[0], 'rows')
insert_query = db.build_insert_query('society', new_societies)
insert_new_societies = db.run_query(insert_query)
print(insert_new_societies['rows'], 'records inserted')

Expecting to insert: 26 rows
26 records inserted


<a id='references'></a>
## References

<ul>
<li>Paul R. Hensel (2018). "ICOW Colonial History Data Set, version 1.1." Available at <a href='http://www.paulhensel.org/icowcol.html' target='_new'>http://www.paulhensel.org/icowcol.html</a></li>
<li><a href='http://www.correlatesofwar.org/data-sets/downloadable-files/cow-country-codes' target='_new'>Correlates of War country codes</a></li>
</ul>