In [None]:
%matplotlib notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
from scipy.stats import linregress
import pycountry


# Relative location of the cleaned, combined data set analysed in this notebook.
file = '../cleanData/CleanFullDataSet.csv'

# Parse the CSV in one pass (low_memory=False) rather than in internal chunks,
# so each column gets a single inferred dtype.
clean_data = pd.read_csv(filepath_or_buffer=file, low_memory=False)

In [None]:
# Pull out gender, age range and every citizenship column, then preview the
# first two rows.
start_df = pd.DataFrame(
    clean_data[
        [
            'gender',
            'ageBroad',
            'Citizenship Country',
            'Citizenship Region',
            'Citizenship Intermediate Region',
            'Citizenship Sub-Region',
        ]
    ]
)
start_df.head(n=2)

In [None]:
# count() reports the number of non-null entries per column, so a column whose
# total is lower than the others contains missing values
start_df.count()

In [None]:
# Working frame for the analysis: gender, age range and the region /
# sub-region / country citizenship columns. 'Citizenship Intermediate Region'
# is left out because it has missing values and is not needed here.
citizen_df = pd.DataFrame(
    clean_data[
        [
            'gender',
            'ageBroad',
            'Citizenship Region',
            'Citizenship Sub-Region',
            'Citizenship Country',
        ]
    ]
)

# Confirm the remaining columns all have the same non-null count.
citizen_df.count()

In [None]:
# Give the two raw column names presentation-friendly labels. rename() returns
# a new frame, so citizen_df itself is left unchanged.
citizen_df_renamed = citizen_df.rename(
    columns={
        'gender': 'Gender',
        'ageBroad': 'Age Range',
    }
)
citizen_df_renamed.head(n=2)

In [None]:
# number of distinct citizenship countries (missing values are not counted by default)
citizen_df_renamed['Citizenship Country'].nunique()

In [None]:
# rows per citizenship country, most frequent first
citizen_df_renamed['Citizenship Country'].value_counts()

In [None]:
# rows per citizenship region, most frequent first; counted on the full
# clean_data frame rather than the trimmed citizen_df_renamed
regions = clean_data['Citizenship Region'].value_counts()
regions

In [None]:
# rows per citizenship sub-region, most frequent first; counted on the full
# clean_data frame rather than the trimmed citizen_df_renamed
sub_regions = clean_data['Citizenship Sub-Region'].value_counts()
sub_regions

#### ASIA - data exploration - region and gender

In [None]:
# Boolean row masks: one per gender, plus one for the Asia region.
is_female = citizen_df_renamed['Gender'].eq('Female')
is_male = citizen_df_renamed['Gender'].eq('Male')
is_asia = citizen_df_renamed['Citizenship Region'].eq('Asia')

# Combine the masks to get one frame per gender for Asian citizenship.
female_asia = pd.DataFrame(citizen_df_renamed.loc[is_female & is_asia])
male_asia = pd.DataFrame(citizen_df_renamed.loc[is_male & is_asia])

# Distinct values per column among the female rows.
female_asia.nunique()

In [None]:
# distinct values per column among the male rows with Asian citizenship
male_asia.nunique()

In [None]:
# Rows with Asian citizenship. The region column is dropped because it would
# hold the single value 'Asia' for every row.
asia_df = citizen_df_renamed.loc[
    citizen_df_renamed['Citizenship Region'].eq('Asia'),
    ['Gender', 'Age Range', 'Citizenship Sub-Region', 'Citizenship Country']
]
asia_df.head(n=2)

In [None]:
# work on an independent copy so that later changes to asia_df_copy cannot
# alter the original asia_df data
asia_df_copy = asia_df.copy()

In [None]:
#asia_df_copy.head()

In [None]:
# rows per citizenship country within Asia, most frequent first
country_counts = asia_df_copy['Citizenship Country'].value_counts()
country_counts

In [None]:
# distinct Asian citizenship countries as an array, in order of first
# appearance (not sorted)
country_list = asia_df_copy['Citizenship Country'].unique()
country_list

In [None]:
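# NOTE: disabled draft of a per-country bar chart. As written it cannot run:
# asia_countries has 18 labels while np.arange(1, 12000, 100) yields 120
# heights, so plt.bar would reject the mismatched lengths. The actual
# per-country totals are available in country_counts.
#asia_countries = ['Uzbekistan', 'Indonesia', 'Tajikistan', 'Kyrgyzstan', 'Cambodia',
       #"Lao People's Democratic Republic", 'Kazakhstan', 'Turkmenistan',
       #'Sri Lanka', 'Myanmar', 'Afghanistan', 'Nepal', 'Philippines',
       #'Bangladesh', 'Thailand', 'Republic of Korea', 'Viet Nam', 'China']

#x_axis = asia_countries

#y_axis = np.arange(1, 12000, 100)
#plt.bar(x_axis, y_axis, color = 'm')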

In [None]:
# rows per gender within Asia, most frequent first
asia_gender_counts = asia_df_copy['Gender'].value_counts()
asia_gender_counts

In [None]:
# rows per citizenship sub-region within Asia, most frequent first
asia_sub_counts = asia_df_copy['Citizenship Sub-Region'].value_counts()
asia_sub_counts

In [None]:
# Bucket the Asia rows by country, then gender, then age range.
grouped_asia_df = asia_df_copy.groupby(
    by=['Citizenship Country', 'Gender', 'Age Range']
)

# Printing a GroupBy object shows only the object's repr, not the grouped rows.
print(grouped_asia_df)

# Apply an aggregation to get something displayable: count() gives the number
# of non-null values per remaining column in each group. Preview the top rows.
grouped_asia_df.count().head()

#### EUROPE - data exploration  - region and gender

In [None]:
# Boolean row masks: one per gender, plus one for the Europe region.
is_female = citizen_df_renamed['Gender'].eq('Female')
is_male = citizen_df_renamed['Gender'].eq('Male')
is_europe = citizen_df_renamed['Citizenship Region'].eq('Europe')

# Combine the masks to get one frame per gender for European citizenship.
female_europe = pd.DataFrame(citizen_df_renamed.loc[is_female & is_europe])
male_europe = pd.DataFrame(citizen_df_renamed.loc[is_male & is_europe])

# Distinct values per column among the female rows.
female_europe.nunique()

In [None]:
# distinct values per column among the male rows with European citizenship
male_europe.nunique()

In [None]:
# Build the Europe subset in this cell before copying it. Without this,
# europe_df does not exist yet when the notebook is run top to bottom on a
# fresh kernel and the copy raises NameError. The region column is dropped
# because it would hold the single value 'Europe' for every row.
europe_df = citizen_df_renamed.loc[
    citizen_df_renamed['Citizenship Region'] == 'Europe',
    ['Gender', 'Age Range', 'Citizenship Sub-Region', 'Citizenship Country']
]

# Work on an independent copy so later changes cannot alter europe_df.
europe_df_copy = europe_df.copy()

In [None]:
# rows per citizenship sub-region within Europe, most frequent first
europe_sub_counts = europe_df_copy['Citizenship Sub-Region'].value_counts()
europe_sub_counts

In [None]:
# Rows with European citizenship. The region column is dropped because it
# would hold the single value 'Europe' for every row.
europe_df = citizen_df_renamed.loc[
    citizen_df_renamed['Citizenship Region'].eq('Europe'),
    ['Gender', 'Age Range', 'Citizenship Sub-Region', 'Citizenship Country']
]
europe_df.head()

In [None]:
# Take a fresh copy of the Europe rows, then bucket them by country, gender
# and age range.
europe_df_copy = europe_df.copy()
grouped_europe_df = europe_df_copy.groupby(
    by=['Citizenship Country', 'Gender', 'Age Range']
)

# Printing a GroupBy object shows only the object's repr, not the grouped rows.
print(grouped_europe_df)

# Apply an aggregation to get something displayable: count() gives the number
# of non-null values per remaining column in each group. Preview the top rows.
grouped_europe_df.count().head()

#### AMERICAS - data exploration  - region and gender

In [None]:
# Boolean row masks: one per gender, plus one for the Americas region.
is_female = citizen_df_renamed['Gender'].eq('Female')
is_male = citizen_df_renamed['Gender'].eq('Male')
is_americas = citizen_df_renamed['Citizenship Region'].eq('Americas')

# Combine the masks to get one frame per gender for citizenship in the Americas.
female_americas = pd.DataFrame(citizen_df_renamed.loc[is_female & is_americas])
male_americas = pd.DataFrame(citizen_df_renamed.loc[is_male & is_americas])

# Distinct values per column among the female rows.
female_americas.nunique()

In [None]:
# distinct values per column among the male rows with citizenship in the Americas
male_americas.nunique()

In [None]:
# Rows with citizenship in the Americas. The region column is dropped because
# it would hold the single value 'Americas' for every row.
americas_df = citizen_df_renamed.loc[
    citizen_df_renamed['Citizenship Region'] == 'Americas',
    ['Gender', 'Age Range', 'Citizenship Sub-Region', 'Citizenship Country']
]

# Preview the first five rows instead of rendering the whole frame, in line
# with the Asia and Europe previews.
americas_df.head()

#### AFRICA - data exploration  - region and gender

In [None]:
# Boolean row masks: one per gender, plus one for the Africa region.
is_female = citizen_df_renamed['Gender'].eq('Female')
is_male = citizen_df_renamed['Gender'].eq('Male')
is_africa = citizen_df_renamed['Citizenship Region'].eq('Africa')

# Combine the masks to get one frame per gender for African citizenship.
female_africa = pd.DataFrame(citizen_df_renamed.loc[is_female & is_africa])
male_africa = pd.DataFrame(citizen_df_renamed.loc[is_male & is_africa])

# Distinct values per column among the female rows.
female_africa.nunique()

In [None]:
# distinct values per column among the male rows with African citizenship
male_africa.nunique()

In [None]:
# Rows with African citizenship. The region column is dropped because it
# would hold the single value 'Africa' for every row.
africa_df = citizen_df_renamed.loc[
    citizen_df_renamed['Citizenship Region'] == 'Africa',
    ['Gender', 'Age Range', 'Citizenship Sub-Region', 'Citizenship Country']
]

# Preview the first five rows instead of rendering the whole frame, in line
# with the Asia and Europe previews.
africa_df.head()

#### UNKNOWN - data exploration  - region and gender

In [None]:
# Boolean row masks: one per gender, plus one for rows whose region is 'unknown'.
# NOTE(review): this label is lowercase, unlike the capitalised region names
# used elsewhere — confirm it matches the value stored in the data.
is_female = citizen_df_renamed['Gender'].eq('Female')
is_male = citizen_df_renamed['Gender'].eq('Male')
is_unknown = citizen_df_renamed['Citizenship Region'].eq('unknown')

# Combine the masks to get one frame per gender for the unknown region.
female_unknown = pd.DataFrame(citizen_df_renamed.loc[is_female & is_unknown])
male_unknown = pd.DataFrame(citizen_df_renamed.loc[is_male & is_unknown])

# Distinct values per column among the female rows.
female_unknown.nunique()

In [None]:
# distinct values per column among the male rows whose citizenship region is 'unknown'
male_unknown.nunique()

In [None]:
# Rows whose citizenship region is recorded as 'unknown'. The region column is
# dropped because it would hold that single value for every row.
# NOTE(review): this label is lowercase, unlike the capitalised region names
# used elsewhere — confirm it matches the value stored in the data.
unknown_df = citizen_df_renamed.loc[
    citizen_df_renamed['Citizenship Region'] == 'unknown',
    ['Gender', 'Age Range', 'Citizenship Sub-Region', 'Citizenship Country']
]

# Preview the first five rows instead of rendering the whole frame, in line
# with the Asia and Europe previews.
unknown_df.head()

In [None]:
# distinct values per column across all rows whose citizenship region is 'unknown'
unknown_df.nunique()  

In [None]:
# quick check of the pycountry lookup API using the code 'CO'
# NOTE(review): presumably this resolves to Colombia via its alpha-2 code — confirm from the output
pycountry.countries.lookup('CO')