# Stat 139 Project

## Data Cleaning and Preparation
### by Kendrick Lo (last revised 25 Nov)

In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)

import math
import warnings
warnings.filterwarnings('ignore')

In [2]:
# response (country of event, year)
df = pd.read_csv('attacks_country_year.csv')

In [3]:
df.shape

(42373, 2)

In [4]:
df.dtypes

country_txt    object
iyear           int64
dtype: object

In [5]:
df.head()  # note duplicate country entries

Unnamed: 0,country_txt,iyear
0,Egypt,2011
1,Iraq,2011
2,Iraq,2011
3,Nigeria,2011
4,Iraq,2011


In [6]:
# check list of unique country entries
list(set(df.country_txt))

['Canada',
 'Turkmenistan',
 'Cambodia',
 'West Bank and Gaza Strip',
 'Ethiopia',
 'Argentina',
 'Bolivia',
 'Cameroon',
 'Burkina Faso',
 'Bahrain',
 'Saudi Arabia',
 'Guatemala',
 'Guinea',
 'Jordan',
 'Spain',
 'Liberia',
 'Netherlands',
 'Jamaica',
 'Tanzania',
 'New Zealand',
 'Yemen',
 'Pakistan',
 'Albania',
 'United Arab Emirates',
 'India',
 'Azerbaijan',
 'Kenya',
 'Tajikistan',
 'Turkey',
 'Afghanistan',
 'Bangladesh',
 'Mauritania',
 'France',
 'Rwanda',
 'Somalia',
 'Peru',
 'Laos',
 'Norway',
 'Montenegro',
 'Republic of the Congo',
 'China',
 'Armenia',
 'Dominican Republic',
 'Ukraine',
 'Ghana',
 'Libya',
 'Indonesia',
 'Central African Republic',
 'United States',
 'Sweden',
 'Australia',
 'Mali',
 'Russia',
 'Bulgaria',
 'Portugal',
 'South Africa',
 'Nicaragua',
 'Malaysia',
 'Senegal',
 'Mozambique',
 'Uganda',
 'Hungary',
 'Niger',
 'Bosnia-Herzegovina',
 'Brazil',
 'Kuwait',
 'Bahamas',
 'Ivory Coast',
 'Nigeria',
 'Ecuador',
 'Czech Republic',
 'Belarus',
 'Ira

In [7]:
# looks OK, let's look at sorted list
sorted(list(set(df.country_txt)))

['Afghanistan',
 'Albania',
 'Algeria',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Belarus',
 'Belgium',
 'Belize',
 'Bhutan',
 'Bolivia',
 'Bosnia-Herzegovina',
 'Brazil',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Croatia',
 'Cyprus',
 'Czech Republic',
 'Democratic Republic of the Congo',
 'Denmark',
 'Djibouti',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'Eritrea',
 'Estonia',
 'Ethiopia',
 'France',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Guatemala',
 'Guinea',
 'Guinea-Bissau',
 'Honduras',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 'Iran',
 'Iraq',
 'Ireland',
 'Israel',
 'Italy',
 'Ivory Coast',
 'Jamaica',
 'Japan',
 'Jordan',
 'Kazakhstan',
 'Kenya',
 'Kosovo',
 'Kuwait',
 'Kyrgyzstan',
 'Laos',
 'Lebanon',
 'Liberia',
 'Libya',
 'Macedonia',
 'Madagascar',
 'Malaysia',
 'Maldives',
 'Mali

In [8]:
len(sorted(list(set(df.country_txt))))

129

In [9]:
data = pd.DataFrame({'country' : sorted(list(set(df.country_txt))), 
                  2011: [0] * len(sorted(list(set(df.country_txt)))),
                  2012: [0] * len(sorted(list(set(df.country_txt)))),
                  2013: [0] * len(sorted(list(set(df.country_txt)))),
                  2014: [0] * len(sorted(list(set(df.country_txt)))),
                 }).set_index('country')
data

Unnamed: 0_level_0,2011,2012,2013,2014
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,0,0,0,0
Albania,0,0,0,0
Algeria,0,0,0,0
Argentina,0,0,0,0
Armenia,0,0,0,0
Australia,0,0,0,0
Austria,0,0,0,0
Azerbaijan,0,0,0,0
Bahamas,0,0,0,0
Bahrain,0,0,0,0


In [10]:
for i in range(df.shape[0]):
    name = df.country_txt[i]
    year = df.iyear[i]
    prevcnt = data.get_value(name, year)
    data.set_value(name, year, prevcnt + 1)

In [11]:
data

Unnamed: 0_level_0,2011,2012,2013,2014
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,422,1469,1442,1820
Albania,0,0,1,2
Algeria,15,41,22,13
Argentina,1,2,2,0
Armenia,0,0,1,0
Australia,0,0,1,7
Austria,1,0,1,1
Azerbaijan,0,0,0,3
Bahamas,0,0,1,0
Bahrain,1,26,52,40


<div class="alert alert-info">
Note that if no events were recorded for a country, we enter zero.  This is accounted for in the average.
</div>

In [12]:
data['avg_attacks'] = data.mean(axis=1)

In [13]:
data

Unnamed: 0_level_0,2011,2012,2013,2014,avg_attacks
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Afghanistan,422,1469,1442,1820,1288.25
Albania,0,0,1,2,0.75
Algeria,15,41,22,13,22.75
Argentina,1,2,2,0,1.25
Armenia,0,0,1,0,0.25
Australia,0,0,1,7,2.00
Austria,1,0,1,1,0.75
Azerbaijan,0,0,0,3,0.75
Bahamas,0,0,1,0,0.25
Bahrain,1,26,52,40,29.75


In [14]:
# make sure we got everything
(sum(data[2011]) + sum(data[2012]) + sum(data[2013]) + sum(data[2014]))==df.shape[0]

True

In [15]:
data = data.drop([2011, 2012, 2013, 2014], axis=1)
data

Unnamed: 0_level_0,avg_attacks
country,Unnamed: 1_level_1
Afghanistan,1288.25
Albania,0.75
Algeria,22.75
Argentina,1.25
Armenia,0.25
Australia,2.00
Austria,0.75
Azerbaijan,0.75
Bahamas,0.25
Bahrain,29.75


In [16]:
data.to_csv("Y_avg.csv")

## Visualizations

![](world.png)

![](overfifty.png)

## Sex Ratio

<div class="alert alert-info">
Note that we are dropping any rows that we do not have data for in our response variable (y).
</div>

In [17]:
data2 = pd.read_csv('Y_avg.csv')

In [18]:
# (region, 2010, 2015)
sr = pd.read_csv('data/sex_ratio.csv')
sr

Unnamed: 0,region,2010,2015,Unnamed: 3
0,Afghanistan,109.000,108.828,
1,AFRICA,100.013,100.595,
2,Albania,96.332,97.214,
3,Algeria,103.123,103.037,
4,Angola,97.437,97.555,
5,Argentina,101.258,101.403,
6,Armenia,87.471,90.307,
7,Aruba,93.366,94.970,
8,ASIA,105.879,106.355,
9,Australia,101.152,101.796,


In [19]:
data2["sexratio"]=float('NaN')

In [20]:
for i in range(data2.shape[0]):
    name = data2.country[i]
    for j in range(sr.shape[0]):
        if sr.region[j]==name:
            data2.sexratio[i] = (sr["2010"][j] + sr["2015"][j])/2.0
            break

data2

Unnamed: 0,country,avg_attacks,sexratio
0,Afghanistan,1288.25,108.9140
1,Albania,0.75,96.7730
2,Algeria,22.75,103.0800
3,Argentina,1.25,101.3305
4,Armenia,0.25,88.8890
5,Australia,2.00,101.4740
6,Austria,0.75,102.1110
7,Azerbaijan,0.75,94.7420
8,Bahamas,0.25,96.3805
9,Bahrain,29.75,142.3155


In [21]:
data2[data2.sexratio.isnull()]

Unnamed: 0,country,avg_attacks,sexratio
16,Bosnia-Herzegovina,1.25,
24,Central African Republic,30.0,
31,Czech Republic,1.75,
32,Democratic Republic of the Congo,43.0,
35,Dominican Republic,0.5,
59,Ivory Coast,6.0,
65,Kosovo,3.75,
72,Macedonia,1.0,
97,Republic of the Congo,0.25,
105,South Sudan,14.25,


In [22]:
# manual insertion
data2.sexratio[16] = (sr["2010"][23] + sr["2015"][23])/2.0  # Bosnia and Herzegovina
data2.sexratio[24] = (sr["2010"][35] + sr["2015"][35])/2.0  # Central African Republic
data2.sexratio[31] = (sr["2010"][51] + sr["2015"][51])/2.0  # Czech Republic
data2.sexratio[32] = (sr["2010"][51] + sr["2015"][51])/2.0  # Democratic Republic Congo
data2.sexratio[35] = (sr["2010"][56] + sr["2015"][56])/2.0  # Dominican Republic
data2.sexratio[59] = (sr["2010"][47] + sr["2015"][47])/2.0  # Ivory Coast
data2.sexratio[65] = (sr["2010"][177] + sr["2015"][177])/2.0  # Kosovo NOTE:  Using value for SERBIA
data2.sexratio[72] = (sr["2010"][200] + sr["2015"][200])/2.0  # Macedonia
data2.sexratio[97] = (sr["2010"][200] + sr["2015"][200])/2.0  # Republic of the Congo
data2.sexratio[105] = (sr["2010"][193] + sr["2015"][193])/2.0  # South Sudan  NOTE:  Using value for SUDAN
data2.sexratio[112] = (sr["2010"][40] + sr["2015"][40])/2.0  # Taiwan  NOTE:  Using value for CHINA
data2.sexratio[126] = (sr["2010"][151] + sr["2015"][151])/2.0  # West Bank and Gaza
data2.sexratio[127] = (sr["2010"][226] + sr["2015"][226])/2.0  # Yemen

In [23]:
print sr["region"][23]
print sr["region"][35]
print sr["region"][51]
print sr["region"][53]
print sr["region"][56]
print sr["region"][47]
print sr["region"][177]
print sr["region"][200]
print sr["region"][45]
print sr["region"][193]
print sr["region"][40]
print sr["region"][151]
print sr["region"][226]

Bosnia and Herzegovina
Central African Rep.
Czech Rep.
Congo, Dem. Rep.
Dominican Rep.
Cote d'Ivoire
Serbia
Macedonia, FYR
Congo, Rep.
Sudan
China
West Bank and Gaza
Yemen, Rep.


<div class="alert alert-warning">
We have imputed the missing value for Kosovo, by using the data for Serbia. <br>
We have imputed the missing value for South Sudan, by using the data for Sudan. <br>
We have imputed the missing value for Taiwan, by using the data for China.
</div>

In [24]:
## check if we got all of them
data2[data2.sexratio.isnull()]

Unnamed: 0,country,avg_attacks,sexratio


## Literacy Rate

<div class="alert alert-info">
Note that we are expanding the years in which we use the data (2005-11).
</div>

In [25]:
# (literacy, 2005, 2006, 2007, 2008, 2009, 2010, 2011)
lit = pd.read_csv('data/literacy.csv')
lit

Unnamed: 0,literacy,2005,2006,2007,2008,2009,2010,2011
0,Afghanistan,,,,,,,
1,Albania,,,,98.831207,,,98.791190
2,Algeria,,91.779641,,,,,
3,Andorra,,,,,,,
4,Angola,,,,,,,73.036460
5,Anguilla,,,,,,,
6,Antigua and Barbuda,,,,,,,
7,Argentina,,,,,,,99.215080
8,Armenia,,,,,,,99.756760
9,Aruba,,,,,,99.138412,


<div class="alert alert-info">
Note that we do not include missing values in calculating the row average.
</div>

In [26]:
lit["avg"]=float('NaN')
for i in range(lit.shape[0]):
    runningtot = 0
    totalvals = 0
    for j in range(1, 8, 1):
        if not math.isnan(lit.iloc[i][j]):
            runningtot += lit.iloc[i][j]
            totalvals += 1
    if totalvals>0:
        lit = lit.set_value(i, "avg", runningtot/(totalvals*1.0))

lit

Unnamed: 0,literacy,2005,2006,2007,2008,2009,2010,2011,avg
0,Afghanistan,,,,,,,,
1,Albania,,,,98.831207,,,98.791190,98.811199
2,Algeria,,91.779641,,,,,,91.779641
3,Andorra,,,,,,,,
4,Angola,,,,,,,73.036460,73.036460
5,Anguilla,,,,,,,,
6,Antigua and Barbuda,,,,,,,,
7,Argentina,,,,,,,99.215080,99.215080
8,Armenia,,,,,,,99.756760,99.756760
9,Aruba,,,,,,99.138412,,99.138412


In [27]:
data2["literacy"]=float('NaN')

In [28]:
for i in range(data2.shape[0]):
    name = data2.country[i]
    for j in range(lit.shape[0]):
        if lit.literacy[j]==name:
            data2.literacy[i] = lit.avg[j]
            break

data2

Unnamed: 0,country,avg_attacks,sexratio,literacy
0,Afghanistan,1288.25,108.9140,
1,Albania,0.75,96.7730,98.811199
2,Algeria,22.75,103.0800,91.779641
3,Argentina,1.25,101.3305,99.215080
4,Armenia,0.25,88.8890,99.756760
5,Australia,2.00,101.4740,
6,Austria,0.75,102.1110,
7,Azerbaijan,0.75,94.7420,99.975648
8,Bahamas,0.25,96.3805,
9,Bahrain,29.75,142.3155,98.163596


In [29]:
data2[data2.literacy.isnull()]

Unnamed: 0,country,avg_attacks,sexratio,literacy
0,Afghanistan,1288.25,108.914,
5,Australia,2.0,101.474,
6,Austria,0.75,102.111,
8,Bahamas,0.25,96.3805,
12,Belgium,0.75,101.8855,
13,Belize,0.25,102.4845,
16,Bosnia-Herzegovina,1.25,95.9685,
23,Canada,1.75,102.8465,
24,Central African Republic,30.0,98.391,
31,Czech Republic,1.75,106.265,


In [30]:
# manual insertion
data2.literacy[16] = lit.avg[24]  # Bosnia and Herzegovina
data2.literacy[24] = lit.avg[37]  # Central African Republic
data2.literacy[24] = lit.avg[43]  # Democratic Republic of the Congo
data2.literacy[35] = lit.avg[55]  # Dominican Republic
data2.literacy[59] = lit.avg[47]  # Ivory Coast
data2.literacy[72] = lit.avg[111]  # Macedonia
data2.literacy[126] = lit.avg[204]  # West Bank and Gaza
data2.literacy[127] = lit.avg[206]  # Yemen

In [31]:
print lit.literacy[24]
print lit.literacy[37]
print lit.literacy[43]
print lit.literacy[55]
print lit.literacy[47]
print lit.literacy[111]
print lit.literacy[204]
print lit.literacy[206]

Bosnia and Herzegovina
Central African Rep.
Congo, Dem. Rep.
Dominican Rep.
Cote d'Ivoire
Macedonia, FYR
West Bank and Gaza
Yemen, Rep.


In [32]:
## we didn't get them all
data2[data2.literacy.isnull()]

Unnamed: 0,country,avg_attacks,sexratio,literacy
0,Afghanistan,1288.25,108.914,
5,Australia,2.0,101.474,
6,Austria,0.75,102.111,
8,Bahamas,0.25,96.3805,
12,Belgium,0.75,101.8855,
13,Belize,0.25,102.4845,
23,Canada,1.75,102.8465,
31,Czech Republic,1.75,106.265,
32,Democratic Republic of the Congo,43.0,106.265,
33,Denmark,0.25,103.19,


<div class="alert alert-warning">
Note that we may be able to impute a handful of these values in a manner similar to what we did for the previous section, but this one has a lot more elements without data.  So we may want to consider whether we want to keep this or not, or find a better source of data.
</div>

## territorial change

<div class="alert alert-warning">
These cleaning steps are based on algorithm provided separately. <br><p>

1. If gainer & loser are both NOT (-9), we record one entry for (gainer, gaintype) and one entry for (loser, losetype), and add gaintype/losetype value (if not -9) to the value of conflict to obtain a score. If one of gainer & loser is (-9), we only have one entry, but we add gaintype/losetype value to the value of conflict in the same way. Only entries 1950 and onwards contribute to the score for each entity. We accumulate all such scores for each entity in its respective counter. (Step 1 and Step 2 of algorithm combined). <b>NOTE: we are adding +1 to each of gaintype and losetype when computing the score for each entity, per entry.</b>
</div>

In [33]:
confraw = pd.read_csv('data/conflict.csv')
confraw

Unnamed: 0,year,gainer,gaintype,loser,losetype,conflict
0,1816,160,1,230,0,0
1,1816,200,0,790,1,1
2,1816,200,0,-9,1,0
3,1817,220,0,200,0,0
4,1817,365,1,-9,1,1
5,1818,2,1,200,0,0
6,1818,155,1,230,0,1
7,1818,200,0,2,1,0
8,1818,200,0,-9,1,1
9,1818,200,0,-9,1,0


In [34]:
print min(confraw.gainer)
print max(confraw.gainer)
print min(confraw.loser)
print max(confraw.loser)
print min(confraw.gaintype)
print max(confraw.gaintype)
print min(confraw.losetype)
print max(confraw.losetype)
print min(confraw.conflict)
print max(confraw.conflict)

0
990
-9
7693
-9
1
-9
1
0
1


In [35]:
# set up counter for scores by entity code, as dictionary
scoredict = {}

In [36]:
# increment score of each entity (2 per row unless blank), row-by-row, ____type + conflict
for i in range(confraw.shape[0]):
    # only use records >= 1950 for score (but keep all records for history of entities)
    if confraw.year[i]>=1950 and confraw.gainer[i]>0 and confraw.gaintype[i]>=0:
        if confraw.gainer[i] in scoredict:
            scoredict[confraw.gainer[i]] += confraw.gaintype[i] + confraw.conflict[i] + 1
        else:
            scoredict[confraw.gainer[i]] = confraw.gaintype[i] + confraw.conflict[i] + 1
    if confraw.year[i]>=1950 and confraw.loser[i]>0 and confraw.losetype[i]>=0:
        if confraw.loser[i] in scoredict:
            scoredict[confraw.loser[i]] += confraw.losetype[i] + confraw.conflict[i] + 1
        else:
            scoredict[confraw.loser[i]] = confraw.losetype[i] + confraw.conflict[i] + 1

In [37]:
# take a look at dictionary
scoredict.items()[0:10]

[(2, 12),
 (516, 2),
 (517, 2),
 (520, 4),
 (522, 2),
 (530, 5),
 (531, 3),
 (540, 3),
 (541, 2),
 (31, 2)]

<div class="alert alert-warning">
2. We now process the file `entities`. For each unique entity number, we capture the latest (in time) entry, many of which may represent the entity being subsumed under another entity number. We record all such mappings from one entity (which may no longer exist) to another (usually a currently existing country). We also "self-map" current entities in the `COWcountrycodes`, and record that as a mapping.
</div>

In [38]:
entities = pd.read_csv('data/entities.csv')
entities

Unnamed: 0,number,name,begin,end,polstat
0,3,Alaska,1816,1867,Became colony of 365
1,3,Alaska,1867,1959,Became colony of 2
2,3,Alaska,1959,1993,Became part of 2
3,4,Hawaii,1898,1960,Became colony of 2
4,4,Hawaii,1960,1993,Became part of 2
5,5,Virgin Islands,1816,1917,Became colony of 390
6,5,Virgin Islands,1917,1993,Became colony of 2
7,6,Puerto Rico,1816,1821,Became part of 1070
8,6,Puerto Rico,1821,1898,Became colony of 230
9,6,Puerto Rico,1898,1952,Became colony of 2


In [39]:
# read COW codes -- note duplicate codes
cow = pd.read_csv('data/cowcodes.csv')

# mapping for entity code names, keep unique
cowcode = {}
for i in range(cow.shape[0]):
    if cow.CCode[i] not in cowcode:
        cowcode[cow.CCode[i]] = cow.StateNme[i]

sorted(cowcode.items())

[(2, 'United States of America'),
 (20, 'Canada'),
 (31, 'Bahamas'),
 (40, 'Cuba'),
 (41, 'Haiti'),
 (42, 'Dominican Republic'),
 (51, 'Jamaica'),
 (52, 'Trinidad and Tobago'),
 (53, 'Barbados'),
 (54, 'Dominica'),
 (55, 'Grenada'),
 (56, 'St. Lucia'),
 (57, 'St. Vincent and the Grenadines'),
 (58, 'Antigua & Barbuda'),
 (60, 'St. Kitts and Nevis'),
 (70, 'Mexico'),
 (80, 'Belize'),
 (90, 'Guatemala'),
 (91, 'Honduras'),
 (92, 'El Salvador'),
 (93, 'Nicaragua'),
 (94, 'Costa Rica'),
 (95, 'Panama'),
 (100, 'Colombia'),
 (101, 'Venezuela'),
 (110, 'Guyana'),
 (115, 'Suriname'),
 (130, 'Ecuador'),
 (135, 'Peru'),
 (140, 'Brazil'),
 (145, 'Bolivia'),
 (150, 'Paraguay'),
 (155, 'Chile'),
 (160, 'Argentina'),
 (165, 'Uruguay'),
 (200, 'United Kingdom'),
 (205, 'Ireland'),
 (210, 'Netherlands'),
 (211, 'Belgium'),
 (212, 'Luxembourg'),
 (220, 'France'),
 (221, 'Monaco'),
 (223, 'Liechtenstein'),
 (225, 'Switzerland'),
 (230, 'Spain'),
 (232, 'Andorra'),
 (235, 'Portugal'),
 (240, 'Hanover'),

In [40]:
# we take advantage of the fact that the latest event for an entity is the last entry
# for that entity number. We also need only keep the entity number in the first column,
# and the entity referenced in the final column
# we store mapping in a dictionary of {first entity number: last referenced entity number}

def extractentity(s):
    # given a string s, extract the entity number referenced at the end of the string
    suffix = s
    for i in range(len(s)):
        try:
            a = int(suffix)
            return int(suffix)
        except:
            suffix = suffix[1:]
       
entitymap = {}

for i in range(entities.shape[0]):
    if i==(entities.shape[0]-1) or entities.number[i]!=entities.number[i+1]:
        # last entry for that entity number
        entitymap[int(entities.number[i])] = extractentity(entities.polstat[i])
        
terractivitylist = sorted(entitymap.items())
terractivitylist[0:10]  # print first 10 items

[(3, 2),
 (4, 2),
 (5, 2),
 (6, 2),
 (7, 2),
 (10, 2),
 (11, 200),
 (20, 200),
 (21, 20),
 (30, 200)]

In [41]:
# get list of all unique entities in the last element of the tuple
absorbingentities = list(set(entitymap.values()))

# add up all territorial change scores (scoredict) for each entity 
# in the first element of the tuple, but assigned to the counter
# associated with the last element of the tuple -- thus all of
# the territorial activity associated with the history of the
# final entity is reflected in the cumulative score 

# we also replace with country name

terrcounter = {}
for item in terractivitylist:
    
    (old, new) = item
    errorflag = False
    
    while True:
        if new in cowcode:
            break
        else:
            # second and third+ order transitions A -> [B+] -> C
            if new in entitymap:
                new = entitymap[new] # loop again
            else:
               print "no code found for %i -- record skipped" % new
               errorflag = True
               break
    
    if not errorflag:
        if (cowcode[new] in terrcounter) and (old in scoredict):
            terrcounter[cowcode[new]] += scoredict[old]
        elif (old in scoredict):
            terrcounter[cowcode[new]] = scoredict[old]

# add scores for countries in COW codes in as well
for i in range(cow.shape[0]):
    if (cowcode[cow.CCode[i]] in terrcounter) and (cow.CCode[i] in scoredict):
        terrcounter[cowcode[cow.CCode[i]]] += scoredict[cow.CCode[i]]
    elif (cow.CCode[i] in scoredict):
        terrcounter[cowcode[cow.CCode[i]]] = scoredict[cow.CCode[i]]
    else:
        terrcounter[cowcode[cow.CCode[i]]] = 0  # no events past 1950

no code found for 263 -- record skipped
no code found for 9999 -- record skipped
no code found for 9999 -- record skipped
no code found for 9999 -- record skipped
no code found for 5630 -- record skipped
no code found for 5630 -- record skipped
no code found for 5630 -- record skipped


In [42]:
sorted(terrcounter.items())

[('Afghanistan', 0),
 ('Albania', 0),
 ('Algeria', 3),
 ('Andorra', 0),
 ('Angola', 3),
 ('Antigua & Barbuda', 2),
 ('Argentina', 2),
 ('Armenia', 2),
 ('Australia', 8),
 ('Austria', 4),
 ('Austria-Hungary', 0),
 ('Azerbaijan', 2),
 ('Baden', 0),
 ('Bahamas', 2),
 ('Bahrain', 2),
 ('Bangladesh', 5),
 ('Barbados', 2),
 ('Bavaria', 0),
 ('Belarus', 2),
 ('Belgium', 12),
 ('Belize', 2),
 ('Benin', 4),
 ('Bhutan', 0),
 ('Bolivia', 0),
 ('Bosnia and Herzegovina', 3),
 ('Botswana', 4),
 ('Brazil', 0),
 ('Brunei', 2),
 ('Bulgaria', 0),
 ('Burkina Faso', 5),
 ('Burundi', 6),
 ('Cambodia', 2),
 ('Cameroon', 11),
 ('Canada', 0),
 ('Cape Verde', 2),
 ('Central African Republic', 2),
 ('Chad', 4),
 ('Chile', 2),
 ('China', 24),
 ('Colombia', 2),
 ('Comoros', 4),
 ('Congo', 2),
 ('Costa Rica', 0),
 ('Croatia', 3),
 ('Cuba', 0),
 ('Cyprus', 5),
 ('Czech Republic', 10),
 ('Czechoslovakia', 20),
 ('Democratic Republic of the Congo', 2),
 ('Denmark', 0),
 ('Djibouti', 2),
 ('Dominica', 2),
 ('Dominican

In [43]:
len(terrcounter)

217

<div class="alert alert-warning">
3. We have mapped the country name to the entity codes, and summed the 1950+ territorial activity counts. We now add this data to our dataframe.
</div>

In [44]:
data2["territory"]=float('NaN')

for i in range(data2.shape[0]):
    name = data2.country[i]
    if name in terrcounter:
        data2.territory[i] = terrcounter[name]

data2

Unnamed: 0,country,avg_attacks,sexratio,literacy,territory
0,Afghanistan,1288.25,108.9140,,0
1,Albania,0.75,96.7730,98.811199,0
2,Algeria,22.75,103.0800,91.779641,3
3,Argentina,1.25,101.3305,99.215080,2
4,Armenia,0.25,88.8890,99.756760,2
5,Australia,2.00,101.4740,,8
6,Austria,0.75,102.1110,,4
7,Azerbaijan,0.75,94.7420,99.975648,2
8,Bahamas,0.25,96.3805,,2
9,Bahrain,29.75,142.3155,98.163596,2


In [45]:
data2[data2.territory.isnull()]

Unnamed: 0,country,avg_attacks,sexratio,literacy,territory
16,Bosnia-Herzegovina,1.25,95.9685,99.6976,
97,Republic of the Congo,0.25,104.862,,
102,Serbia,0.75,103.615,99.27081,
124,United States,14.25,102.675,,
126,West Bank and Gaza Strip,60.25,105.2455,99.181879,


In [46]:
# manual insertion
data2.territory[16] = terrcounter['Bosnia and Herzegovina']  
data2.territory[97] = terrcounter['Congo']  
data2.territory[102] = terrcounter['Yugoslavia']  # Serbia mapped under Yugoslavia in 'Entities'
data2.territory[124] = terrcounter['United States of America'] 
data2.territory[126] = terrcounter['Egypt']  # West Bank and Gaza mapped under Egypt in 'Entities'

In [47]:
# check that we have them all
data2[data2.territory.isnull()]

Unnamed: 0,country,avg_attacks,sexratio,literacy,territory


<div class="alert alert-warning">
Note that we used the value from EGYPT for West Bank and Gaza, since that was last reflected in entities (via "Palestine"). However, as you know this is disputed.
</div>

## religious adherence

<div class="alert alert-info">
Note that we are treating (1- less than 0.1% as 0.999)
</div>

In [48]:
relig = pd.read_csv('data/nonrelig.csv')
relig

Unnamed: 0,Country,aff
0,Afghanistan,0.999
1,Albania,0.986
2,Algeria,0.982
3,American Samoa,0.993
4,Andorra,0.912
5,Angola,0.949
6,Anguilla,0.960
7,Antigua and Barbuda,0.983
8,Argentina,0.878
9,Armenia,0.987


In [49]:
data2["proprelig"]=float('NaN')

for i in range(data2.shape[0]):
    name = data2.country[i]
    for j in range(relig.shape[0]):
        if relig.Country[j]==name:
            data2.proprelig[i] = relig.aff[j]*1.0
            break

data2

Unnamed: 0,country,avg_attacks,sexratio,literacy,territory,proprelig
0,Afghanistan,1288.25,108.9140,,0,0.999
1,Albania,0.75,96.7730,98.811199,0,0.986
2,Algeria,22.75,103.0800,91.779641,3,0.982
3,Argentina,1.25,101.3305,99.215080,2,0.878
4,Armenia,0.25,88.8890,99.756760,2,0.987
5,Australia,2.00,101.4740,,8,0.758
6,Austria,0.75,102.1110,,4,0.865
7,Azerbaijan,0.75,94.7420,99.975648,2,0.999
8,Bahamas,0.25,96.3805,,2,0.969
9,Bahrain,29.75,142.3155,98.163596,2,0.981


In [50]:
data2[data2.proprelig.isnull()]

Unnamed: 0,country,avg_attacks,sexratio,literacy,territory,proprelig
48,Guinea-Bissau,0.25,98.7455,73.16962,2,
72,Macedonia,1.0,104.862,98.67001,2,
84,Myanmar,11.75,95.9895,96.0959,2,
126,West Bank and Gaza Strip,60.25,105.2455,99.181879,48,


In [51]:
# manual insertion
data2.proprelig[48] = relig.aff[85]  # Guinea-Bissau
data2.proprelig[72] = relig.aff[166]  # Macedonia
data2.proprelig[84] = relig.aff[32]  # Myanmar
data2.proprelig[126] = relig.aff[156]  # West Bank and Gaza Strip

In [52]:
print relig.Country[85]
print relig.Country[166]
print relig.Country[32]
print relig.Country[156]

Guinea Bissau
Republic of Macedonia
Burma (Myanmar)
Palestinian territories


In [53]:
# check we got them all
data2[data2.proprelig.isnull()]

Unnamed: 0,country,avg_attacks,sexratio,literacy,territory,proprelig


## alcohol consumption

In [54]:
alc = pd.read_csv('data/alcohol.csv')
alc

Unnamed: 0,Country,2005,2008
0,Afghanistan,0.02,0.03
1,Albania,6.68,7.29
2,Algeria,0.96,0.69
3,Andorra,15.48,10.17
4,Angola,5.40,5.57
5,Antigua and Barbuda,7.22,8.17
6,Argentina,10.00,9.35
7,Armenia,11.35,13.66
8,Australia,10.02,10.21
9,Austria,13.24,12.40


<div class="alert alert-info">
Note that we do not include missing values in calculating the row average.
</div>

In [55]:
data2["alcohol"]=float('NaN')

for i in range(data2.shape[0]):
    name = data2.country[i]
    for j in range(alc.shape[0]):
        if alc.Country[j]==name:
            if math.isnan(alc["2005"][j]):
                data2.alcohol[i] = alc["2008"][j]
            elif math.isnan(alc["2008"][j]):
                data2.alcohol[i] = alc["2005"][j]
            elif (math.isnan(alc["2008"][j]) and math.isnan(alc["2005"][j])):
                data2.alcohol[i] = float('NaN')
            else:
                data2.alcohol[i] = (alc["2005"][j] + alc["2008"][j])/2.0
            
            break

data2

Unnamed: 0,country,avg_attacks,sexratio,literacy,territory,proprelig,alcohol
0,Afghanistan,1288.25,108.9140,,0,0.999,0.025
1,Albania,0.75,96.7730,98.811199,0,0.986,6.985
2,Algeria,22.75,103.0800,91.779641,3,0.982,0.825
3,Argentina,1.25,101.3305,99.215080,2,0.878,9.675
4,Armenia,0.25,88.8890,99.756760,2,0.987,12.505
5,Australia,2.00,101.4740,,8,0.758,10.115
6,Austria,0.75,102.1110,,4,0.865,12.820
7,Azerbaijan,0.75,94.7420,99.975648,2,0.999,11.970
8,Bahamas,0.25,96.3805,,2,0.969,8.705
9,Bahrain,29.75,142.3155,98.163596,2,0.981,3.925


In [56]:
data2[data2.alcohol.isnull()]

Unnamed: 0,country,avg_attacks,sexratio,literacy,territory,proprelig,alcohol
16,Bosnia-Herzegovina,1.25,95.9685,99.6976,3,0.975,
32,Democratic Republic of the Congo,43.0,106.265,,2,0.982,
59,Ivory Coast,6.0,103.416,67.52731,2,0.92,
65,Kosovo,3.75,103.615,,0,0.984,
72,Macedonia,1.0,104.862,98.67001,2,0.986,
75,Maldives,3.0,101.476,99.301007,2,0.999,
81,Montenegro,0.5,100.811,99.3173,0,0.968,
84,Myanmar,11.75,95.9895,96.0959,2,0.995,
97,Republic of the Congo,0.25,104.862,,2,0.91,
105,South Sudan,14.25,102.1215,,3,0.995,


In [57]:
# manual insertion
data2.alcohol[16] = (alc["2005"][21] + alc["2008"][21])/2.0  # Bosnia and Herzegovina
data2.alcohol[32] = (alc["2005"][38] + alc["2008"][38])/2.0  # Democratic Republic Congo
data2.alcohol[59] = (alc["2005"][42] + alc["2008"][42])/2.0  # Ivory Coast
data2.alcohol[65] = (alc["2005"][147] + alc["2008"][147])/2.0  # Kosovo NOTE:  Using value for SERBIA
data2.alcohol[72] = (alc["2005"][99] + alc["2008"][99])/2.0  # Macedonia
data2.alcohol[84] = (alc["2005"][113] + alc["2008"][113])/2.0  # Myanmar
data2.alcohol[97] = (alc["2005"][39] + alc["2008"][39])/2.0  # Republic of the Congo
data2.alcohol[105] = (alc["2005"][159] + alc["2008"][159])/2.0  # South Sudan  NOTE:  Using value for SUDAN
data2.alcohol[112] = (alc["2005"][35] + alc["2008"][35])/2.0  # Taiwan  NOTE:  Using value for CHINA

In [58]:
print alc.Country[113]
print alc.Country[21]
print alc.Country[38]
print alc.Country[39]
print alc.Country[42]
print alc.Country[147]
print alc.Country[99]
print alc.Country[159]
print alc.Country[35]

Myanmar [Burma]
Bosnia and Herzegovina
Congo [DRC]
Congo [Republic]
C�te d'Ivoire
Serbia
Macedonia [FYROM]
Sudan
China


In [59]:
# here's what is left
data2[data2.alcohol.isnull()]

Unnamed: 0,country,avg_attacks,sexratio,literacy,territory,proprelig,alcohol
75,Maldives,3.0,101.476,99.301007,2,0.999,
81,Montenegro,0.5,100.811,99.3173,0,0.968,
126,West Bank and Gaza Strip,60.25,105.2455,99.181879,48,0.999,


<div class="alert alert-warning">
These three still have missing values. <br>

Note we imputed values for Kosova, South Sudan, and Taiwan using data from SERBIA, SUDAN, and CHINA, respectively.
</div>

## gdp per capita

In [60]:
gdpcp = pd.read_csv('data/gdpcp.csv')
gdpcp

Unnamed: 0,Country,2011,2012,2013,2014
0,Aruba,25353.787540,,,
1,Andorra,41630.052580,39666.369210,42806.522550,
2,Afghanistan,622.379654,690.842629,661.969562,658.981813
3,Angola,4744.985104,5084.346486,5295.210896,5423.609735
4,Albania,4437.811725,4256.016702,4458.073207,4619.211258
5,Arab World,6900.381094,7442.144593,7540.460043,7412.521238
6,United Arab Emirates,39778.489710,41587.512810,44506.763590,44204.318600
7,Argentina,13439.941460,14436.600090,14623.478690,12568.569820
8,Armenia,3417.171836,3343.437635,3486.145636,3619.776319
9,American Samoa,,,,


<div class="alert alert-info">
Note that we do not include missing values in calculating the row average.
</div>

In [61]:
gdpcp["avg"] = float('NaN')
for i in range(gdpcp.shape[0]):
    runningtot = 0
    totalvals = 0
    for j in range(1, 5, 1):
        if not math.isnan(gdpcp.iloc[i][j]):
            runningtot += gdpcp.iloc[i][j]
            totalvals += 1
    if totalvals>0:
        gdpcp = gdpcp.set_value(i, "avg", runningtot/(totalvals*1.0))

gdpcp

Unnamed: 0,Country,2011,2012,2013,2014,avg
0,Aruba,25353.787540,,,,25353.787540
1,Andorra,41630.052580,39666.369210,42806.522550,,41367.648113
2,Afghanistan,622.379654,690.842629,661.969562,658.981813,658.543414
3,Angola,4744.985104,5084.346486,5295.210896,5423.609735,5137.038055
4,Albania,4437.811725,4256.016702,4458.073207,4619.211258,4442.778223
5,Arab World,6900.381094,7442.144593,7540.460043,7412.521238,7323.876742
6,United Arab Emirates,39778.489710,41587.512810,44506.763590,44204.318600,42519.271177
7,Argentina,13439.941460,14436.600090,14623.478690,12568.569820,13767.147515
8,Armenia,3417.171836,3343.437635,3486.145636,3619.776319,3466.632856
9,American Samoa,,,,,


In [62]:
data2["gdpcp"] = float('NaN')

for i in range(data2.shape[0]):
    name = data2.country[i]
    for j in range(gdpcp.shape[0]):
        if gdpcp.Country[j]==name:
            data2.gdpcp[i] = gdpcp.avg[j]
            break
            
data2

Unnamed: 0,country,avg_attacks,sexratio,literacy,territory,proprelig,alcohol,gdpcp
0,Afghanistan,1288.25,108.9140,,0,0.999,0.025,658.543414
1,Albania,0.75,96.7730,98.811199,0,0.986,6.985,4442.778223
2,Algeria,22.75,103.0800,91.779641,3,0.982,0.825,5470.406003
3,Argentina,1.25,101.3305,99.215080,2,0.878,9.675,13767.147515
4,Armenia,0.25,88.8890,99.756760,2,0.987,12.505,3466.632856
5,Australia,2.00,101.4740,,8,0.758,10.115,64751.357082
6,Austria,0.75,102.1110,,4,0.865,12.820,50279.914147
7,Azerbaijan,0.75,94.7420,99.975648,2,0.999,11.970,7569.818690
8,Bahamas,0.25,96.3805,,2,0.969,8.705,
9,Bahrain,29.75,142.3155,98.163596,2,0.981,3.925,23637.291640


In [63]:
data2[data2.gdpcp.isnull()]

Unnamed: 0,country,avg_attacks,sexratio,literacy,territory,proprelig,alcohol,gdpcp
8,Bahamas,0.25,96.3805,,2,0.969,8.705,
16,Bosnia-Herzegovina,1.25,95.9685,99.6976,3,0.975,9.615,
32,Democratic Republic of the Congo,43.0,106.265,,2,0.982,3.345,
37,Egypt,182.25,100.6855,86.429147,48,0.999,0.345,
54,Iran,10.0,103.2615,97.576977,5,0.999,1.025,
59,Ivory Coast,6.0,103.416,67.52731,2,0.92,6.475,
67,Kyrgyzstan,0.75,100.3085,99.75185,2,0.996,4.905,
68,Laos,0.25,99.9755,83.931063,2,0.991,6.86,
72,Macedonia,1.0,104.862,98.67001,2,0.986,8.725,
97,Republic of the Congo,0.25,104.862,,2,0.91,4.33,


In [64]:
# manual insertion
data2.gdpcp[8] = gdpcp.avg[21]  # Bahamas
data2.gdpcp[16] = gdpcp.avg[22]  # Bosnia...
data2.gdpcp[32] = gdpcp.avg[245]  # Dem Rep Congo
data2.gdpcp[37] = gdpcp.avg[63]  # Egypt
data2.gdpcp[54] = gdpcp.avg[102]  # Iran
data2.gdpcp[59] = gdpcp.avg[39]  # Ivory Coast
data2.gdpcp[67] = gdpcp.avg[112]  # Kyrgyzstan
data2.gdpcp[68] = gdpcp.avg[120]  # Laos
data2.gdpcp[72] = gdpcp.avg[147]  # Macedonia
data2.gdpcp[97] = gdpcp.avg[41]  # Rep. of Congo
data2.gdpcp[98] = gdpcp.avg[190]  # Russia
data2.gdpcp[111] = gdpcp.avg[215]  # Syria
data2.gdpcp[125] = gdpcp.avg[236]  # Venezuela
data2.gdpcp[126] = gdpcp.avg[240]  # West Bank and Gaza
data2.gdpcp[127] = gdpcp.avg[243]  # Yemen

In [65]:
print gdpcp.Country[21]
print gdpcp.Country[22]
print gdpcp.Country[245]
print gdpcp.Country[63]
print gdpcp.Country[102]
print gdpcp.Country[39]
print gdpcp.Country[112]
print gdpcp.Country[120]
print gdpcp.Country[147]
print gdpcp.Country[41]
print gdpcp.Country[190]
print gdpcp.Country[215]
print gdpcp.Country[236]
print gdpcp.Country[240]
print gdpcp.Country[243]

Bahamas, The
Bosnia and Herzegovina
Congo, Dem. Rep.
Egypt, Arab Rep.
Iran, Islamic Rep.
Cote d'Ivoire
Kyrgyz Republic
Lao PDR
Macedonia, FYR
Congo, Rep.
Russian Federation
Syrian Arab Republic
Venezuela, RB
West Bank and Gaza
Yemen, Rep.


In [66]:
# Here's what's left
data2[data2.gdpcp.isnull()]

Unnamed: 0,country,avg_attacks,sexratio,literacy,territory,proprelig,alcohol,gdpcp
103,Somalia,424.5,98.313,,8,0.999,0.5,
111,Syria,207.75,103.233,95.28453,24,0.98,1.46,
112,Taiwan,0.5,107.964,,3,0.873,5.735,


<div class="alert alert-warning">
These still have missing values. <br>
Not sure if we should use "China" for Taiwan here again, at least because it is economic data.
</div>

## gdp growth

In [67]:
gdpgr = pd.read_csv('data/gdpgr.csv')
gdpgr

Unnamed: 0,Country,2011,2012,2013,2014
0,Aruba,,,,
1,Andorra,-4.802675,-1.760010,-0.063514,
2,Afghanistan,6.113685,14.434741,1.934143,1.998533
3,Angola,3.918597,5.155441,6.800058,3.901265
4,Albania,2.545359,1.623699,1.417423,1.900000
5,Arab World,3.604995,6.259919,3.155573,2.247172
6,United Arab Emirates,4.885271,4.677892,5.200000,3.605468
7,Argentina,8.386451,0.801760,2.885352,0.469257
8,Armenia,4.700000,7.200000,3.500000,3.400000
9,American Samoa,,,,


<div class="alert alert-info">
Note that we do not include missing values in calculating the row average.
</div>

In [68]:
gdpgr["avg"] = float('NaN')
for i in range(gdpgr.shape[0]):
    runningtot = 0
    totalvals = 0
    for j in range(1, 5, 1):
        if not math.isnan(gdpgr.iloc[i][j]):
            runningtot += gdpgr.iloc[i][j]
            totalvals += 1
    if totalvals>0:
        gdpgr = gdpgr.set_value(i, "avg", runningtot/(totalvals*1.0))
        
gdpgr

Unnamed: 0,Country,2011,2012,2013,2014,avg
0,Aruba,,,,,
1,Andorra,-4.802675,-1.760010,-0.063514,,-2.208733
2,Afghanistan,6.113685,14.434741,1.934143,1.998533,6.120276
3,Angola,3.918597,5.155441,6.800058,3.901265,4.943840
4,Albania,2.545359,1.623699,1.417423,1.900000,1.871620
5,Arab World,3.604995,6.259919,3.155573,2.247172,3.816915
6,United Arab Emirates,4.885271,4.677892,5.200000,3.605468,4.592158
7,Argentina,8.386451,0.801760,2.885352,0.469257,3.135705
8,Armenia,4.700000,7.200000,3.500000,3.400000,4.700000
9,American Samoa,,,,,


In [69]:
data2["gdpgr"] = float('NaN')

for i in range(data2.shape[0]):
    name = data2.country[i]
    for j in range(gdpgr.shape[0]):
        if gdpgr.Country[j]==name:
            data2.gdpgr[i] = gdpgr.avg[j]
            break

data2

Unnamed: 0,country,avg_attacks,sexratio,literacy,territory,proprelig,alcohol,gdpcp,gdpgr
0,Afghanistan,1288.25,108.9140,,0,0.999,0.025,658.543414,6.120276
1,Albania,0.75,96.7730,98.811199,0,0.986,6.985,4442.778223,1.871620
2,Algeria,22.75,103.0800,91.779641,3,0.982,0.825,5470.406003,3.250002
3,Argentina,1.25,101.3305,99.215080,2,0.878,9.675,13767.147515,3.135705
4,Armenia,0.25,88.8890,99.756760,2,0.987,12.505,3466.632856,4.700000
5,Australia,2.00,101.4740,,8,0.758,10.115,64751.357082,2.759036
6,Austria,0.75,102.1110,,4,0.865,12.820,50279.914147,1.121005
7,Azerbaijan,0.75,94.7420,99.975648,2,0.999,11.970,7569.818690,2.515650
8,Bahamas,0.25,96.3805,,2,0.969,8.705,22040.151197,
9,Bahrain,29.75,142.3155,98.163596,2,0.981,3.925,23637.291640,3.883306


In [70]:
data2[data2.gdpgr.isnull()]

Unnamed: 0,country,avg_attacks,sexratio,literacy,territory,proprelig,alcohol,gdpcp,gdpgr
8,Bahamas,0.25,96.3805,,2,0.969,8.705,22040.151197,
16,Bosnia-Herzegovina,1.25,95.9685,99.6976,3,0.975,9.615,4667.492729,
32,Democratic Republic of the Congo,43.0,106.265,,2,0.982,3.345,398.723228,
37,Egypt,182.25,100.6855,86.429147,48,0.999,0.345,3046.943898,
54,Iran,10.0,103.2615,97.576977,5,0.999,1.025,6677.546833,
59,Ivory Coast,6.0,103.416,67.52731,2,0.92,6.475,1376.603933,
67,Kyrgyzstan,0.75,100.3085,99.75185,2,0.996,4.905,1213.3586,
68,Laos,0.25,99.9755,83.931063,2,0.991,6.86,1551.792854,
72,Macedonia,1.0,104.862,98.67001,2,0.986,8.725,5110.086461,
97,Republic of the Congo,0.25,104.862,,2,0.91,4.33,3246.899329,


In [71]:
# manual insertion
data2.gdpgr[8] = gdpgr.avg[21]  # Bahamas
data2.gdpgr[16] = gdpgr.avg[22]  # Bosnia...
data2.gdpgr[32] = gdpgr.avg[245]  # Dem Rep Congo
data2.gdpgr[37] = gdpgr.avg[63]  # Egypt
data2.gdpgr[54] = gdpgr.avg[102]  # Iran
data2.gdpgr[59] = gdpgr.avg[39]  # Ivory Coast
data2.gdpgr[67] = gdpgr.avg[112]  # Kyrgyzstan
data2.gdpgr[68] = gdpgr.avg[120]  # Laos
data2.gdpgr[72] = gdpgr.avg[147]  # Macedonia
data2.gdpgr[97] = gdpgr.avg[41]  # Rep. of Congo
data2.gdpgr[98] = gdpgr.avg[190]  # Russia
data2.gdpgr[111] = gdpgr.avg[215]  # Syria
data2.gdpgr[125] = gdpgr.avg[236]  # Venezuela
data2.gdpgr[126] = gdpgr.avg[240]  # West Bank and Gaza
data2.gdpgr[127] = gdpgr.avg[243]  # Yemen

In [72]:
print gdpgr.Country[21]
print gdpgr.Country[22]
print gdpgr.Country[245]
print gdpgr.Country[63]
print gdpgr.Country[102]
print gdpgr.Country[39]
print gdpgr.Country[112]
print gdpgr.Country[120]
print gdpgr.Country[147]
print gdpgr.Country[41]
print gdpgr.Country[190]
print gdpgr.Country[215]
print gdpgr.Country[236]
print gdpgr.Country[240]
print gdpgr.Country[243]

Bahamas, The
Bosnia and Herzegovina
Congo, Dem. Rep.
Egypt, Arab Rep.
Iran, Islamic Rep.
Cote d'Ivoire
Kyrgyz Republic
Lao PDR
Macedonia, FYR
Congo, Rep.
Russian Federation
Syrian Arab Republic
Venezuela, RB
West Bank and Gaza
Yemen, Rep.


In [73]:
# Here's what's left
data2[data2.gdpgr.isnull()]

Unnamed: 0,country,avg_attacks,sexratio,literacy,territory,proprelig,alcohol,gdpcp,gdpgr
103,Somalia,424.5,98.313,,8,0.999,0.5,,
111,Syria,207.75,103.233,95.28453,24,0.98,1.46,,
112,Taiwan,0.5,107.964,,3,0.873,5.735,,


<div class="alert alert-warning">
These still have missing values. <br>
Note sure if we should use "China" for Taiwan here, for economic data.
</div>

## democracy score (2011)

In [74]:
democ = pd.read_csv('data/demscore.csv')
democ

Unnamed: 0,country,2011
0,Abkhazia,
1,Afghanistan,
2,Akrotiri and Dhekelia,
3,Albania,9
4,Algeria,2
5,American Samoa,
6,Andorra,
7,Angola,-2
8,Anguilla,
9,Antigua and Barbuda,


In [75]:
data2["democ"] = float('NaN')

for i in range(data2.shape[0]):
    name = data2.country[i]
    for j in range(democ.shape[0]):
        if democ.country[j]==name:
            if math.isnan(democ["2011"][j]):
                data2.democ[i] = float('NaN')
            else:
                data2.democ[i] = democ["2011"][j]
            break

data2

Unnamed: 0,country,avg_attacks,sexratio,literacy,territory,proprelig,alcohol,gdpcp,gdpgr,democ
0,Afghanistan,1288.25,108.9140,,0,0.999,0.025,658.543414,6.120276,
1,Albania,0.75,96.7730,98.811199,0,0.986,6.985,4442.778223,1.871620,9
2,Algeria,22.75,103.0800,91.779641,3,0.982,0.825,5470.406003,3.250002,2
3,Argentina,1.25,101.3305,99.215080,2,0.878,9.675,13767.147515,3.135705,8
4,Armenia,0.25,88.8890,99.756760,2,0.987,12.505,3466.632856,4.700000,5
5,Australia,2.00,101.4740,,8,0.758,10.115,64751.357082,2.759036,10
6,Austria,0.75,102.1110,,4,0.865,12.820,50279.914147,1.121005,10
7,Azerbaijan,0.75,94.7420,99.975648,2,0.999,11.970,7569.818690,2.515650,-7
8,Bahamas,0.25,96.3805,,2,0.969,8.705,22040.151197,0.968758,
9,Bahrain,29.75,142.3155,98.163596,2,0.981,3.925,23637.291640,3.883306,-8


In [76]:
data2[data2.democ.isnull()]

Unnamed: 0,country,avg_attacks,sexratio,literacy,territory,proprelig,alcohol,gdpcp,gdpgr,democ
0,Afghanistan,1288.25,108.914,,0,0.999,0.025,658.543414,6.120276,
8,Bahamas,0.25,96.3805,,2,0.969,8.705,22040.151197,0.968758,
13,Belize,0.25,102.4845,,2,0.911,5.995,4636.851776,2.484792,
16,Bosnia-Herzegovina,1.25,95.9685,99.6976,3,0.975,9.615,4667.492729,0.868691,
24,Central African Republic,30.0,98.391,65.776528,2,0.99,3.26,413.319022,-6.905813,
31,Czech Republic,1.75,106.265,,10,0.236,16.46,20184.768415,0.611336,
32,Democratic Republic of the Congo,43.0,106.265,,2,0.982,3.345,398.723228,7.893147,
35,Dominican Republic,0.5,100.031,96.544841,0,0.891,6.345,5959.61148,4.392253,
51,Iceland,0.5,112.4065,,0,0.965,6.845,47519.86884,2.285653,
59,Ivory Coast,6.0,103.416,67.52731,2,0.92,6.475,1376.603933,6.133331,


In [77]:
# manual insertion
data2.democ[24] = democ["2011"][40]  # Central African Republic
data2.democ[32] = democ["2011"][49]  # Dem Rep Congo
data2.democ[35] = democ["2011"][62]  # Dominican Republic
data2.democ[59] = democ["2011"][53]  # Ivory Coast
data2.democ[72] = democ["2011"][132]  # Macedonia
data2.democ[97] = democ["2011"][50]  # Rep Congo

In [78]:
print democ.country[40]
print democ.country[49]
print democ.country[62]
print democ.country[53]
print democ.country[132]
print democ.country[50]

Central African Rep.
Congo, Dem. Rep.
Dominican Rep.
Cote d'Ivoire
Macedonia, FYR
Congo, Rep.


In [79]:
# see what's left
data2[data2.democ.isnull()]

Unnamed: 0,country,avg_attacks,sexratio,literacy,territory,proprelig,alcohol,gdpcp,gdpgr,democ
0,Afghanistan,1288.25,108.914,,0,0.999,0.025,658.543414,6.120276,
8,Bahamas,0.25,96.3805,,2,0.969,8.705,22040.151197,0.968758,
13,Belize,0.25,102.4845,,2,0.911,5.995,4636.851776,2.484792,
16,Bosnia-Herzegovina,1.25,95.9685,99.6976,3,0.975,9.615,4667.492729,0.868691,
31,Czech Republic,1.75,106.265,,10,0.236,16.46,20184.768415,0.611336,
51,Iceland,0.5,112.4065,,0,0.965,6.845,47519.86884,2.285653,
75,Maldives,3.0,101.476,99.301007,2,0.999,,7701.404796,6.901426,
77,Malta,0.25,105.205,98.26503,2,0.975,4.185,22100.07103,1.8,
117,Tunisia,14.0,101.6175,96.755223,4,0.998,1.17,4273.129471,2.224791,
126,West Bank and Gaza Strip,60.25,105.2455,99.181879,48,0.999,,2852.556448,4.147011,


<div class="alert alert-warning">
All these countries still have missing values.
</div>

## corruption score

In [80]:
corrup = pd.read_csv('data/corrup.csv')
corrup

Unnamed: 0,country,corrup,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,Afghanistan,12,,,
1,Albania,33,,,
2,Algeria,36,,,
3,Angola,19,,,
4,Argentina,34,,,
5,Armenia,37,,,
6,Australia,80,,,
7,Austria,72,,,
8,Azerbaijan,29,,,
9,Bahamas,71,,,


In [81]:
data2["corrup"] = float('NaN')

for i in range(data2.shape[0]):
    name = data2.country[i]
    for j in range(corrup.shape[0]):
        if corrup.country[j]==name:
            data2.corrup[i] = corrup["corrup"][j]
            break

data2

Unnamed: 0,country,avg_attacks,sexratio,literacy,territory,proprelig,alcohol,gdpcp,gdpgr,democ,corrup
0,Afghanistan,1288.25,108.9140,,0,0.999,0.025,658.543414,6.120276,,12
1,Albania,0.75,96.7730,98.811199,0,0.986,6.985,4442.778223,1.871620,9,33
2,Algeria,22.75,103.0800,91.779641,3,0.982,0.825,5470.406003,3.250002,2,36
3,Argentina,1.25,101.3305,99.215080,2,0.878,9.675,13767.147515,3.135705,8,34
4,Armenia,0.25,88.8890,99.756760,2,0.987,12.505,3466.632856,4.700000,5,37
5,Australia,2.00,101.4740,,8,0.758,10.115,64751.357082,2.759036,10,80
6,Austria,0.75,102.1110,,4,0.865,12.820,50279.914147,1.121005,10,72
7,Azerbaijan,0.75,94.7420,99.975648,2,0.999,11.970,7569.818690,2.515650,-7,29
8,Bahamas,0.25,96.3805,,2,0.969,8.705,22040.151197,0.968758,,71
9,Bahrain,29.75,142.3155,98.163596,2,0.981,3.925,23637.291640,3.883306,-8,49


In [82]:
data2[data2.corrup.isnull()]

Unnamed: 0,country,avg_attacks,sexratio,literacy,territory,proprelig,alcohol,gdpcp,gdpgr,democ,corrup
13,Belize,0.25,102.4845,,2,0.911,5.995,4636.851776,2.484792,,
16,Bosnia-Herzegovina,1.25,95.9685,99.6976,3,0.975,9.615,4667.492729,0.868691,,
59,Ivory Coast,6.0,103.416,67.52731,2,0.92,6.475,1376.603933,6.133331,0.0,
72,Macedonia,1.0,104.862,98.67001,2,0.986,8.725,5110.086461,2.079226,9.0,
75,Maldives,3.0,101.476,99.301007,2,0.999,,7701.404796,6.901426,,
97,Republic of the Congo,0.25,104.862,,2,0.91,4.33,3246.899329,4.301881,-4.0,
126,West Bank and Gaza Strip,60.25,105.2455,99.181879,48,0.999,,2852.556448,4.147011,,


In [83]:
# manual insertion
data2.corrup[16] = corrup["corrup"][18]  # Bosnia...
data2.corrup[59] = corrup["corrup"][36]  # Ivory Coast
data2.corrup[97] = corrup["corrup"][34]  # Rep of Congo

In [84]:
print corrup.country[18]
print corrup.country[36]
print corrup.country[34]

Bosnia and Herzegovina
C�te d�Ivoire
Congo Republic


In [85]:
# see what's left
data2[data2.corrup.isnull()]

Unnamed: 0,country,avg_attacks,sexratio,literacy,territory,proprelig,alcohol,gdpcp,gdpgr,democ,corrup
13,Belize,0.25,102.4845,,2,0.911,5.995,4636.851776,2.484792,,
72,Macedonia,1.0,104.862,98.67001,2,0.986,8.725,5110.086461,2.079226,9.0,
75,Maldives,3.0,101.476,99.301007,2,0.999,,7701.404796,6.901426,,
126,West Bank and Gaza Strip,60.25,105.2455,99.181879,48,0.999,,2852.556448,4.147011,,


<div class="alert alert-warning">
These still have missing values.
</div>

## disasters

In [86]:
disaster = pd.read_csv('data/disasters.csv')
disaster

Unnamed: 0,country,damage
0,Afghanistan,5
1,Afghanistan,1750000
2,Afghanistan,12810
3,Afghanistan,57
4,Algeria,793
5,Angola,0
6,Angola,90684
7,Argentina,0
8,Argentina,3000
9,Argentina,332


In [87]:
# group by country and sum
gbdisas = disaster.groupby('country')['damage'].sum()
gbdisas

country
Afghanistan              1986163
Albania                   230020
Algeria                     1374
Angola                   1925687
Argentina                 381716
Armenia                    76000
Australia                  85284
Austria                      200
Azerbaijan                 22499
Bahamas                    11004
Bahrain                        0
Bangladesh              11670151
Belarus                    42955
Belgium                       71
Benin                      99622
Bermuda                        0
Bhutan                     20016
Bolivia                  1251513
Bosnia-Hercegovenia      1010947
Botswana                    4210
Brazil                  34339023
Bulgaria                   46707
Burkina Faso             6882513
Burundi                    14017
Cambodia                 3741973
Cameroon                  321577
Canada                    120127
Cape Verde Is               2500
Central African Rep        95751
Chad                     2233462
  

In [88]:
type(gbdisas)

pandas.core.series.Series

In [89]:
gbdisas.shape[0]

176

In [90]:
data2["disasters"] = float('NaN')

for i in range(data2.shape[0]):
    name = data2.country[i]
    try:
        data2.disasters[i] = gbdisas[name]
    except:
        # not found
        pass

data2

Unnamed: 0,country,avg_attacks,sexratio,literacy,territory,proprelig,alcohol,gdpcp,gdpgr,democ,corrup,disasters
0,Afghanistan,1288.25,108.9140,,0,0.999,0.025,658.543414,6.120276,,12,1986163
1,Albania,0.75,96.7730,98.811199,0,0.986,6.985,4442.778223,1.871620,9,33,230020
2,Algeria,22.75,103.0800,91.779641,3,0.982,0.825,5470.406003,3.250002,2,36,1374
3,Argentina,1.25,101.3305,99.215080,2,0.878,9.675,13767.147515,3.135705,8,34,381716
4,Armenia,0.25,88.8890,99.756760,2,0.987,12.505,3466.632856,4.700000,5,37,76000
5,Australia,2.00,101.4740,,8,0.758,10.115,64751.357082,2.759036,10,80,85284
6,Austria,0.75,102.1110,,4,0.865,12.820,50279.914147,1.121005,10,72,200
7,Azerbaijan,0.75,94.7420,99.975648,2,0.999,11.970,7569.818690,2.515650,-7,29,22499
8,Bahamas,0.25,96.3805,,2,0.969,8.705,22040.151197,0.968758,,71,11004
9,Bahrain,29.75,142.3155,98.163596,2,0.981,3.925,23637.291640,3.883306,-8,49,0


In [91]:
data2[data2.disasters.isnull()]

Unnamed: 0,country,avg_attacks,sexratio,literacy,territory,proprelig,alcohol,gdpcp,gdpgr,democ,corrup,disasters
13,Belize,0.25,102.4845,,2,0.911,5.995,4636.851776,2.484792,,,
16,Bosnia-Herzegovina,1.25,95.9685,99.6976,3,0.975,9.615,4667.492729,0.868691,,39.0,
24,Central African Republic,30.0,98.391,65.776528,2,0.99,3.26,413.319022,-6.905813,-1.0,24.0,
27,China,14.25,107.964,99.64229,24,0.478,5.735,6606.141682,8.067403,-7.0,36.0,
31,Czech Republic,1.75,106.265,,10,0.236,16.46,20184.768415,0.611336,,51.0,
32,Democratic Republic of the Congo,43.0,106.265,,2,0.982,3.345,398.723228,7.893147,5.0,22.0,
34,Djibouti,0.25,101.374,,2,0.998,1.87,1637.081848,4.721977,2.0,34.0,
35,Dominican Republic,0.5,100.031,96.544841,0,0.891,6.345,5959.61148,4.392253,8.0,32.0,
38,Eritrea,0.25,98.622,90.13328,3,0.999,1.59,655.087299,4.683151,-7.0,18.0,
48,Guinea-Bissau,0.25,98.7455,73.16962,2,0.957,3.79,581.316253,2.416079,6.0,19.0,


In [92]:
# manual insertion
data2.disasters[16] = gbdisas['Bosnia-Hercegovenia']
data2.disasters[24] = gbdisas['Central African Rep']
data2.disasters[27] = gbdisas['China P Rep']
data2.disasters[31] = gbdisas['Czech Rep']
data2.disasters[32] = gbdisas['Congo']
data2.disasters[35] = gbdisas['Dominican Rep']
data2.disasters[54] = gbdisas['Iran Islam Rep']  
data2.disasters[59] = gbdisas["Cote d'Ivoire"]
data2.disasters[68] = gbdisas['Lao P Dem Rep']
data2.disasters[71] = gbdisas['Libyan Arab Jamah']
data2.disasters[72] = gbdisas['Macedonia FRY']
data2.disasters[80] = gbdisas['Moldova Rep']
data2.disasters[97] = gbdisas['Zaire/Congo Dem Rep']
data2.disasters[111] = gbdisas['Syrian Arab Rep']
data2.disasters[114] = gbdisas['Tanzania Uni Rep']
data2.disasters[126] = gbdisas['Palestine (West Bank)']

In [93]:
# see what's left
data2[data2.disasters.isnull()]

Unnamed: 0,country,avg_attacks,sexratio,literacy,territory,proprelig,alcohol,gdpcp,gdpgr,democ,corrup,disasters
13,Belize,0.25,102.4845,,2,0.911,5.995,4636.851776,2.484792,,,
34,Djibouti,0.25,101.374,,2,0.998,1.87,1637.081848,4.721977,2.0,34.0,
38,Eritrea,0.25,98.622,90.13328,3,0.999,1.59,655.087299,4.683151,-7.0,18.0,
48,Guinea-Bissau,0.25,98.7455,73.16962,2,0.957,3.79,581.316253,2.416079,6.0,19.0,
51,Iceland,0.5,112.4065,,0,0.965,6.845,47519.86884,2.285653,,79.0,
65,Kosovo,3.75,103.615,,0,0.984,11.65,3804.92157,3.268196,8.0,33.0,
66,Kuwait,0.25,158.0985,98.834028,8,0.999,0.1,49125.564277,5.911286,-7.0,44.0,
69,Lebanon,87.5,96.696,98.714789,0,0.997,2.265,9697.522758,1.776147,7.0,27.0,
116,Trinidad and Tobago,0.75,98.7505,99.56772,2,0.981,6.22,17690.506573,0.506658,10.0,38.0,


<div class="alert alert-warning">
These still have missing values.
</div>

## agricultural land

In [94]:
agric = pd.read_csv('data/agric.csv')
agric

Unnamed: 0,country,2011,2012,2013
0,Aruba,11.111111,11.111111,11.111111
1,Andorra,43.404255,42.978723,44.255319
2,Afghanistan,58.067580,58.067580,58.067580
3,Angola,47.316917,47.316917,47.477340
4,Albania,43.832117,43.843066,43.332117
5,Arab World,38.030748,38.353421,38.133138
6,United Arab Emirates,4.706938,4.565789,4.572967
7,Argentina,54.182607,54.538146,54.518049
8,Armenia,59.645241,59.114858,59.083246
9,American Samoa,24.500000,24.500000,24.500000


<div class="alert alert-info">
Note that we do not include missing values in calculating the row average.
</div>

In [95]:
agric["avg"] = float('NaN')
for i in range(agric.shape[0]):
    runningtot = 0
    totalvals = 0
    for j in range(1, 4, 1):
        if not math.isnan(agric.iloc[i][j]):
            runningtot += agric.iloc[i][j]
            totalvals += 1
    if totalvals>0:
        agric = agric.set_value(i, "avg", runningtot/(totalvals*1.0))
        
agric

Unnamed: 0,country,2011,2012,2013,avg
0,Aruba,11.111111,11.111111,11.111111,11.111111
1,Andorra,43.404255,42.978723,44.255319,43.546099
2,Afghanistan,58.067580,58.067580,58.067580,58.067580
3,Angola,47.316917,47.316917,47.477340,47.370391
4,Albania,43.832117,43.843066,43.332117,43.669100
5,Arab World,38.030748,38.353421,38.133138,38.172436
6,United Arab Emirates,4.706938,4.565789,4.572967,4.615231
7,Argentina,54.182607,54.538146,54.518049,54.412934
8,Armenia,59.645241,59.114858,59.083246,59.281115
9,American Samoa,24.500000,24.500000,24.500000,24.500000


In [96]:
data2["agric"] = float('NaN')

for i in range(data2.shape[0]):
    name = data2.country[i]
    for j in range(agric.shape[0]):
        if agric.country[j]==name:
            data2.agric[i] = agric.avg[j]
            break

data2

Unnamed: 0,country,avg_attacks,sexratio,literacy,territory,proprelig,alcohol,gdpcp,gdpgr,democ,corrup,disasters,agric
0,Afghanistan,1288.25,108.9140,,0,0.999,0.025,658.543414,6.120276,,12,1986163,58.067580
1,Albania,0.75,96.7730,98.811199,0,0.986,6.985,4442.778223,1.871620,9,33,230020,43.669100
2,Algeria,22.75,103.0800,91.779641,3,0.982,0.825,5470.406003,3.250002,2,36,1374,17.384815
3,Argentina,1.25,101.3305,99.215080,2,0.878,9.675,13767.147515,3.135705,8,34,381716,54.412934
4,Armenia,0.25,88.8890,99.756760,2,0.987,12.505,3466.632856,4.700000,5,37,76000,59.281115
5,Australia,2.00,101.4740,,8,0.758,10.115,64751.357082,2.759036,10,80,85284,52.578086
6,Austria,0.75,102.1110,,4,0.865,12.820,50279.914147,1.121005,10,72,200,38.367234
7,Azerbaijan,0.75,94.7420,99.975648,2,0.999,11.970,7569.818690,2.515650,-7,29,22499,57.694526
8,Bahamas,0.25,96.3805,,2,0.969,8.705,22040.151197,0.968758,,71,11004,
9,Bahrain,29.75,142.3155,98.163596,2,0.981,3.925,23637.291640,3.883306,-8,49,0,11.183393


In [97]:
data2[data2.agric.isnull()]

Unnamed: 0,country,avg_attacks,sexratio,literacy,territory,proprelig,alcohol,gdpcp,gdpgr,democ,corrup,disasters,agric
8,Bahamas,0.25,96.3805,,2,0.969,8.705,22040.151197,0.968758,,71.0,11004.0,
16,Bosnia-Herzegovina,1.25,95.9685,99.6976,3,0.975,9.615,4667.492729,0.868691,,39.0,1010947.0,
32,Democratic Republic of the Congo,43.0,106.265,,2,0.982,3.345,398.723228,7.893147,5.0,22.0,34914.0,
37,Egypt,182.25,100.6855,86.429147,48,0.999,0.345,3046.943898,2.080707,-2.0,37.0,436.0,
54,Iran,10.0,103.2615,97.576977,5,0.999,1.025,6677.546833,-0.768914,-7.0,27.0,527059.0,
59,Ivory Coast,6.0,103.416,67.52731,2,0.92,6.475,1376.603933,6.133331,0.0,32.0,28.0,
65,Kosovo,3.75,103.615,,0,0.984,11.65,3804.92157,3.268196,8.0,33.0,,
67,Kyrgyzstan,0.75,100.3085,99.75185,2,0.996,4.905,1213.3586,5.096657,7.0,27.0,11050.0,
68,Laos,0.25,99.9755,83.931063,2,0.991,6.86,1551.792854,8.009827,-7.0,25.0,1078003.0,
72,Macedonia,1.0,104.862,98.67001,2,0.986,8.725,5110.086461,2.079226,9.0,,18811.0,


In [98]:
# manual insertion
data2.agric[8] = agric.avg[21]  # Bahamas
data2.agric[16] = agric.avg[22]  # Bosnia...
data2.agric[32] = agric.avg[245]  # Dem Rep Congo
data2.agric[37] = agric.avg[63]  # Egypt
data2.agric[54] = agric.avg[102]  # Iran
data2.agric[59] = agric.avg[39]  # Ivory Coast
data2.agric[67] = agric.avg[112]  # Kyrgyzstan
data2.agric[68] = agric.avg[120]  # Laos
data2.agric[72] = agric.avg[147]  # Macedonia
data2.agric[97] = agric.avg[41]  # Rep. of Congo
data2.agric[98] = agric.avg[190]  # Russia
data2.agric[111] = agric.avg[215]  # Syria
data2.agric[125] = agric.avg[236]  # Venezuela
data2.agric[126] = agric.avg[240]  # West Bank and Gaza
data2.agric[127] = agric.avg[243]  # Yemen

In [99]:
print agric.country[21]
print agric.country[22]
print agric.country[245]
print agric.country[63]
print agric.country[102]
print agric.country[39]
print agric.country[112]
print agric.country[120]
print agric.country[147]
print agric.country[41]
print agric.country[190]
print agric.country[215]
print agric.country[236]
print agric.country[240]
print agric.country[243]

Bahamas, The
Bosnia and Herzegovina
Congo, Dem. Rep.
Egypt, Arab Rep.
Iran, Islamic Rep.
Cote d'Ivoire
Kyrgyz Republic
Lao PDR
Macedonia, FYR
Congo, Rep.
Russian Federation
Syrian Arab Republic
Venezuela, RB
West Bank and Gaza
Yemen, Rep.


In [100]:
## see what's left
data2[data2.agric.isnull()]

Unnamed: 0,country,avg_attacks,sexratio,literacy,territory,proprelig,alcohol,gdpcp,gdpgr,democ,corrup,disasters,agric
65,Kosovo,3.75,103.615,,0,0.984,11.65,3804.92157,3.268196,8,33,,
105,South Sudan,14.25,102.1215,,3,0.995,2.56,1192.090141,-0.356402,0,15,744489.0,
112,Taiwan,0.5,107.964,,3,0.873,5.735,,,10,61,37061.0,


<div class="alert alert-warning">
These still have missing values.
</div>

# Create output table

<div class="alert alert-warning">
Note: missing values are represented by NaN.
</div>

In [101]:
data2

Unnamed: 0,country,avg_attacks,sexratio,literacy,territory,proprelig,alcohol,gdpcp,gdpgr,democ,corrup,disasters,agric
0,Afghanistan,1288.25,108.9140,,0,0.999,0.025,658.543414,6.120276,,12,1986163,58.067580
1,Albania,0.75,96.7730,98.811199,0,0.986,6.985,4442.778223,1.871620,9,33,230020,43.669100
2,Algeria,22.75,103.0800,91.779641,3,0.982,0.825,5470.406003,3.250002,2,36,1374,17.384815
3,Argentina,1.25,101.3305,99.215080,2,0.878,9.675,13767.147515,3.135705,8,34,381716,54.412934
4,Armenia,0.25,88.8890,99.756760,2,0.987,12.505,3466.632856,4.700000,5,37,76000,59.281115
5,Australia,2.00,101.4740,,8,0.758,10.115,64751.357082,2.759036,10,80,85284,52.578086
6,Austria,0.75,102.1110,,4,0.865,12.820,50279.914147,1.121005,10,72,200,38.367234
7,Azerbaijan,0.75,94.7420,99.975648,2,0.999,11.970,7569.818690,2.515650,-7,29,22499,57.694526
8,Bahamas,0.25,96.3805,,2,0.969,8.705,22040.151197,0.968758,,71,11004,1.398601
9,Bahrain,29.75,142.3155,98.163596,2,0.981,3.925,23637.291640,3.883306,-8,49,0,11.183393


In [102]:
data2.to_csv("terror2.csv")