In [69]:
%matplotlib notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
from scipy.stats import linregress
import pycountry


# Path (relative to this notebook) of the CSV that feeds the whole analysis
file = '../cleanData/CleanFullDataSet.csv'

# Load the CSV into a DataFrame; low_memory=False makes pandas parse the file
# in one pass instead of in chunks, which avoids mixed-dtype inference
clean_data = pd.read_csv(filepath_or_buffer=file, low_memory=False)

In [126]:
# Pull the demographic and citizenship columns into their own frame
start_df = pd.DataFrame(
    clean_data[
        [
            'gender',
            'ageBroad',
            'Citizenship Country',
            'Citizenship Region',
            'Citizenship Intermediate Region',
            'Citizenship Sub-Region',
        ]
    ]
)
# Preview the first five rows
start_df.iloc[:5]

Unnamed: 0,gender,ageBroad,Citizenship Country,Citizenship Region,Citizenship Intermediate Region,Citizenship Sub-Region
0,Female,18--20,Colombia,Americas,South America,Latin America and the Caribbean
1,Female,18--20,Colombia,Americas,South America,Latin America and the Caribbean
2,Female,18--20,Colombia,Americas,South America,Latin America and the Caribbean
3,Female,18--20,Colombia,Americas,South America,Latin America and the Caribbean
4,Female,18--20,Colombia,Americas,South America,Latin America and the Caribbean


In [87]:
# Tally the non-null entries in every column to reveal missing data
start_df.notna().sum()

gender                             48773
ageBroad                           48773
Citizenship Country                48773
Citizenship Region                 48773
Citizenship Intermediate Region    11001
Citizenship Sub-Region             48773
dtype: int64

In [127]:
# Same selection as before, minus 'Citizenship Intermediate Region': that column
# has missing values and is not needed for this analysis
citizen_df = pd.DataFrame(
    clean_data[
        [
            'gender',
            'ageBroad',
            'Citizenship Country',
            'Citizenship Region',
            'Citizenship Sub-Region',
        ]
    ]
)
# Non-null entries per remaining column
citizen_df.notna().sum()

gender                    48773
ageBroad                  48773
Citizenship Country       48773
Citizenship Region        48773
Citizenship Sub-Region    48773
dtype: int64

In [128]:
# Preview the first five rows of the trimmed frame
citizen_df.iloc[:5]

Unnamed: 0,gender,ageBroad,Citizenship Country,Citizenship Region,Citizenship Sub-Region
0,Female,18--20,Colombia,Americas,Latin America and the Caribbean
1,Female,18--20,Colombia,Americas,Latin America and the Caribbean
2,Female,18--20,Colombia,Americas,Latin America and the Caribbean
3,Female,18--20,Colombia,Americas,Latin America and the Caribbean
4,Female,18--20,Colombia,Americas,Latin America and the Caribbean


In [129]:
# Give the two demographic columns presentation-friendly names;
# every other column keeps its existing label
citizen_df_renamed = citizen_df.rename(
    {'gender': 'Gender', 'ageBroad': 'Age Range'},
    axis='columns',
)
# Preview the first five rows with the new headers
citizen_df_renamed.iloc[:5]

Unnamed: 0,Gender,Age Range,Citizenship Country,Citizenship Region,Citizenship Sub-Region
0,Female,18--20,Colombia,Americas,Latin America and the Caribbean
1,Female,18--20,Colombia,Americas,Latin America and the Caribbean
2,Female,18--20,Colombia,Americas,Latin America and the Caribbean
3,Female,18--20,Colombia,Americas,Latin America and the Caribbean
4,Female,18--20,Colombia,Americas,Latin America and the Caribbean


In [130]:
# Number of distinct values in the citizenship-country column
citizen_df_renamed.loc[:, 'Citizenship Country'].nunique()

45

In [131]:
# Records per citizenship country, most frequent first
citizen_df_renamed.loc[:, 'Citizenship Country'].value_counts()

Philippines                         11365
unknown                              9136
Ukraine                              7761
Republic of Moldova                  5901
United States of America             3636
Cambodia                             1979
Indonesia                            1971
Belarus                              1463
Myanmar                              1250
Romania                               655
Ghana                                 544
Kyrgyzstan                            437
Bulgaria                              342
Haiti                                 339
Uzbekistan                            261
Viet Nam                              170
Guinea-Bissau                         145
Lao People's Democratic Republic      126
Colombia                              124
Nigeria                               108
Afghanistan                            97
Madagascar                             94
Sri Lanka                              85
Sierra Leone                      

In [132]:
# Records per citizenship region, most frequent first
citizen_df_renamed.loc[:, 'Citizenship Region'].value_counts()

Asia        17992
Europe      16144
unknown      9136
Americas     4136
Africa       1365
Name: Citizenship Region, dtype: int64

In [133]:
# Print out country column as Pandas Series
# print(citizen_df_renamed['Citizenship Country'])

In [134]:
# Print out country column as Pandas DataFrame
# print(citizen_df_renamed[['Citizenship Country']])

In [135]:
# Print out DataFrame with country and gender columns
#citizen_df_renamed[['Gender', 'Citizenship Country']].head()

In [136]:
# Region frequencies computed from the full data set
regions = clean_data.loc[:, 'Citizenship Region'].value_counts()
regions

Asia        17992
Europe      16144
unknown      9136
Americas     4136
Africa       1365
Name: Citizenship Region, dtype: int64

In [137]:
# Boolean row masks: female records, and records with Asian citizenship
is_female = citizen_df_renamed['Gender'].eq('Female')
is_asia = citizen_df_renamed['Citizenship Region'].eq('Asia')

# Rows satisfying both masks
female_asia = pd.DataFrame(citizen_df_renamed.loc[is_female & is_asia])

# Number of distinct values per column within that subset
female_asia.nunique(axis=0)

Gender                     1
Age Range                 10
Citizenship Country       15
Citizenship Region         1
Citizenship Sub-Region     4
dtype: int64

In [138]:
# Boolean row masks: male records, and records with Asian citizenship
is_male = citizen_df_renamed['Gender'].eq('Male')
is_asia = citizen_df_renamed['Citizenship Region'].eq('Asia')

# Rows satisfying both masks
male_asia = pd.DataFrame(citizen_df_renamed.loc[is_male & is_asia])

# Number of distinct values per column within that subset
male_asia.nunique(axis=0)

Gender                     1
Age Range                 10
Citizenship Country       10
Citizenship Region         1
Citizenship Sub-Region     3
dtype: int64

In [156]:
# All records with Asian citizenship, keeping only the descriptive columns
asia_df = citizen_df_renamed.loc[
    citizen_df_renamed['Citizenship Region'].eq('Asia'),
    ['Gender', 'Age Range', 'Citizenship Sub-Region', 'Citizenship Country'],
]
asia_df

Unnamed: 0,Gender,Age Range,Citizenship Sub-Region,Citizenship Country
1977,Female,18--20,Central Asia,Uzbekistan
1978,Female,18--20,Central Asia,Uzbekistan
1979,Female,18--20,Central Asia,Uzbekistan
1980,Female,18--20,Central Asia,Uzbekistan
1981,Female,18--20,Central Asia,Uzbekistan
...,...,...,...,...
46763,Female,30--38,Eastern Asia,China
46764,Female,30--38,Eastern Asia,China
46765,Female,30--38,Eastern Asia,China
46766,Female,30--38,Eastern Asia,China


In [155]:
# Split the European-citizenship records by gender
# (is_female and is_male are the masks built in the earlier Asia cells)
is_europe = citizen_df_renamed['Citizenship Region'].eq('Europe')
female_europe = pd.DataFrame(citizen_df_renamed.loc[is_female & is_europe])
male_europe = pd.DataFrame(citizen_df_renamed.loc[is_male & is_europe])

# Number of distinct values per column among the female European records
female_europe.nunique(axis=0)

Gender                    1
Age Range                 9
Citizenship Country       6
Citizenship Region        1
Citizenship Sub-Region    2
dtype: int64

In [154]:
# Number of distinct values per column among the male European records
male_europe.nunique(axis=0)

Gender                    1
Age Range                 9
Citizenship Country       4
Citizenship Region        1
Citizenship Sub-Region    1
dtype: int64

In [None]:
# Female records with citizenship in the Americas
# (is_female is the mask built in the earlier Asia cell)
is_americas = citizen_df_renamed['Citizenship Region'].eq('Americas')
female_americas = pd.DataFrame(citizen_df_renamed.loc[is_female & is_americas])

# Number of distinct values per column within that subset
female_americas.nunique(axis=0)

In [143]:
# All records with European citizenship, keeping only the descriptive columns
europe_df = citizen_df_renamed.loc[
    citizen_df_renamed['Citizenship Region'].eq('Europe'),
    ['Gender', 'Age Range', 'Citizenship Sub-Region', 'Citizenship Country'],
]
europe_df

Unnamed: 0,Gender,Age Range,Citizenship Sub-Region,Citizenship Country
11,Female,18--20,Eastern Europe,Republic of Moldova
12,Female,18--20,Eastern Europe,Republic of Moldova
13,Female,18--20,Eastern Europe,Republic of Moldova
14,Female,18--20,Eastern Europe,Republic of Moldova
15,Female,18--20,Eastern Europe,Republic of Moldova
...,...,...,...,...
44349,Male,9--17,Eastern Europe,Ukraine
44350,Male,9--17,Eastern Europe,Ukraine
44351,Male,9--17,Eastern Europe,Ukraine
44352,Male,9--17,Eastern Europe,Ukraine


In [144]:
# All records with citizenship in the Americas, keeping only the descriptive columns
americas_df = citizen_df_renamed.loc[
    citizen_df_renamed['Citizenship Region'].eq('Americas'),
    ['Gender', 'Age Range', 'Citizenship Sub-Region', 'Citizenship Country'],
]
americas_df

Unnamed: 0,Gender,Age Range,Citizenship Sub-Region,Citizenship Country
0,Female,18--20,Latin America and the Caribbean,Colombia
1,Female,18--20,Latin America and the Caribbean,Colombia
2,Female,18--20,Latin America and the Caribbean,Colombia
3,Female,18--20,Latin America and the Caribbean,Colombia
4,Female,18--20,Latin America and the Caribbean,Colombia
...,...,...,...,...
48768,Male,9--17,Northern America,United States of America
48769,Male,9--17,Northern America,United States of America
48770,Male,9--17,Northern America,United States of America
48771,Male,9--17,Northern America,United States of America


In [None]:
# Region labels and their record counts, paired by position.
# The numbers mirror the 'Citizenship Region' value_counts() output shown earlier.
regions = ['Americas', 'Africa', 'Asia', 'Europe', 'unknown']

# Keep the counts numeric: matplotlib treats string values as categories,
# so string "heights" would not produce bars of the intended size.
counts_per_region = [4136, 1365, 17992, 16144, 9136]

# One x position per region: 0, 1, ..., len(regions) - 1
x_axis = np.arange(len(regions))

In [None]:
# Bar chart of the number of records per citizenship region.
fig, ax = plt.subplots()

# Coerce the counts to int so the bars get numeric heights even if the
# counts were entered as strings (matplotlib treats strings as categories).
bar_heights = [int(count) for count in counts_per_region]
ax.bar(x_axis, bar_heights, color='m', align='center')

# Label each bar with its region name instead of the bare 0..n-1 positions
ax.set_xticks(x_axis)
ax.set_xticklabels(regions)
ax.set_xlabel('Citizenship Region')
ax.set_ylabel('Number of records')
ax.set_title('Records per Citizenship Region')
plt.show()

In [None]:
# Look up the string 'CO' in pycountry's country database and display the match.
# NOTE(review): 'CO' is presumably the ISO 3166 alpha-2 code for Colombia — confirm
# against the displayed result.
pycountry.countries.lookup('CO')