# How many iPhones can a Data Scientist buy around the world?

The following analysis in its entirety uses only responses from Data Scientists.

I've long wondered about the differences in strength between different currencies and their related economies, ever since I came across a concept called PPP (Purchasing Power Parity) — [read about it here](https://en.wikipedia.org/wiki/Purchasing_power_parity).

So, I thought of a fun (pretty comical) way to demonstrate it to people while hopefully making them understand the significance of the discussion. I am extremely interested in the discussion this creates and I'm glad to share it with you!

#### Module imports

In [None]:
# Module imports
import os
from collections import defaultdict
from functools import partial
import json

import numpy as np 
import pandas as pd
import geopandas as gpd
import folium
from folium import Marker, Popup
from folium import Icon
import seaborn as sns
import altair as alt
import matplotlib.pyplot as plt

In [None]:
# Load the three input tables.
# Survey responses; low_memory=False parses the file in a single pass.
answers = pd.read_csv(
    '/kaggle/input/kaggle-survey-2020/kaggle_survey_2020_responses.csv',
    low_memory=False,
)

# This data was collected externally. Please view annex or dataset page for details.
# Both tables are indexed by their 'Country' column.
money_df = pd.read_csv(
    '../input/iphone-prices-and-average-rent-across-the-world/monetary_data.csv',
    index_col='Country',
)
cost_living = pd.read_csv(
    '../input/iphone-prices-and-average-rent-across-the-world/cost_living.csv',
    index_col='Country',
)

# Keep only rows whose Q5 answer is 'Data Scientist', then rename
# 'Republic of Korea' to 'South Korea' wherever that value appears.
answers = (
    answers.loc[answers.Q5 == 'Data Scientist']
    .replace(to_replace='Republic of Korea', value='South Korea')
)

#### Declaring constants and variables

In [None]:
## CONSTANTS, STRINGS AND CONVERSION DICTIONARIES

# Country label used for the United States in survey column Q3.
USA = 'United States of America'

# Highlight colour for the selected bars in the altair charts.
selection_color = 'dodgerblue'

# Age brackets, youngest to oldest.
age_in_order = [
    '18-21', '22-24', '25-29', '30-34', '35-39', '40-44',
    '45-49', '50-54', '55-59', '60-69', '70+',
]

# The three youngest brackets, used for the age-by-degree comparison.
age_in_order_spec = ['18-21', '22-24', '25-29']

# Degree levels in ascending order - 'I prefer not to answer' removed.
degrees_in_order_spec = [
    'Bachelor’s degree',
    'Master’s degree',
    'Doctoral degree',
]

# Salary bracket -> numeric value (USD). Each bracket is represented by the
# middle of its interval; the open-ended top bracket has no middle, so its
# lower boundary is used as the only safe assumption.
# The dict is wrapped in a defaultdict later on so that answers outside these
# brackets do not raise during conversion.
responses_dict = {
    '$0-999': 500,
    '1,000-1,999': 1500,
    '2,000-2,999': 2500,
    '3,000-3,999': 3500,
    '4,000-4,999': 4500,
    '5,000-7,499': 6250,
    '7,500-9,999': 8750,
    '10,000-14,999': 12_500,
    '15,000-19,999': 17_500,
    '20,000-24,999': 22_500,
    '25,000-29,999': 27_500,
    '30,000-39,999': 35_000,
    '40,000-49,999': 45_000,
    '50,000-59,999': 55_000,
    '60,000-69,999': 65_000,
    '70,000-79,999': 75_000,
    '80,000-89,999': 85_000,
    '90,000-99,999': 95_000,
    '100,000-124,999': 112_500,
    '125,000-149,999': 137_500,
    '150,000-199,999': 175_000,
    '200,000-249,999': 225_000,
    '250,000-299,999': 275_000,
    '300,000-500,000': 400_000,
    '> $500,000': 500_000,
}

# Ordering for salary data - taken from kaggle team notebook. The dict above
# is written lowest-to-highest, so its key order is the bracket order.
responses_in_order = list(responses_dict)

# AI expenditure bracket -> numeric value (USD), converted in the same way as
# the salary brackets.
expense_dict = {
    '$0 ($USD)': 0,
    '$1-$99': 50,
    '$100-$999': 500,
    '$1000-$9,999': 5000,
    '$10,000-$99,999': 55000,
    '$100,000 or more ($USD)': 100_000,
}

# Ordering for investment levels, lowest to highest (key order of the dict).
expense_in_order = list(expense_dict)


#### Setup code for maps and graphs

In [None]:
# Build `new_data`: one row per country with mean Data Scientist income,
# mean AI spend, iPhone price, rent and a few derived ratios.

# Bracket -> number lookups. defaultdict(int) makes every answer that is not
# a known bracket (including unanswered questions) convert to 0.
# NOTE(review): those zeros stay in the per-country means below and pull them
# down; confirm that is intended rather than excluding such rows.
money_conv = defaultdict(int, responses_dict)
expense_conv = defaultdict(int, expense_dict)

answers['avg_money'] = answers.Q24.apply(lambda x: money_conv[x])
answers['avg_expense'] = answers.Q25.apply(lambda x: expense_conv[x])

# Per-country means, highest first. `answers` was already narrowed to rows
# with Q5 == 'Data Scientist' when it was loaded, so every remaining row is a
# respondent and none may be skipped (the earlier `.iloc[1:]` dropped one).
by_country = answers.groupby('Q3')
data = by_country.avg_money.mean().sort_values(ascending=False)
data_exp = by_country.avg_expense.mean().sort_values(ascending=False)

# Geocode the survey country names through Nominatim (network call) and
# re-label the result with the country names so it lines up with `data`.
countries = gpd.tools.geocode(data.index.values, provider='nominatim',
                              user_agent='my-kaggle-ds-nb')
countries.index = data.index

data2 = pd.concat([data, data_exp, countries], axis=1)
data2['monthly_income'] = data2.avg_money / 12

# Same for the externally collected price table, which uses its own country
# labels as index.
countries2 = gpd.tools.geocode(money_df.index.values, provider='nominatim',
                               user_agent='my-kaggle-ds-nb')
countries2.index = money_df.index
money_df = pd.concat([money_df, countries2], axis=1)

# Columns that exist only on the survey side, plus the join key.
cols_to_use = data2.columns.difference(money_df.columns).values.tolist()
cols_to_use.append('address')

# Join the two tables on the geocoded address string, which is the only key
# they share (their country labels differ). The result is indexed by the price
# table's 'Country' labels, which the charts and map tooltips rely on.
# `on=` must not be combined with `left_index=True`: current pandas raises
# MergeError for that combination, so the index is restored explicitly.
new_data = (
    data2[cols_to_use]
    .merge(money_df.rename_axis('Country').reset_index(), on='address')
    .set_index('Country')
)

# Derived metrics; ratios are expressed as percentages.
new_data['monthly_phones'] = new_data['monthly_income'] / new_data['iphone_price']
new_data['rent/income'] = 100 * new_data['avg_rent'] / new_data['monthly_income']
new_data['income_less2rent'] = new_data['monthly_income'] - 2 * new_data['avg_rent']
# avg_expense is divided by 5 to get a yearly figure
# (presumably Q25 asks about spend over five years - verify against the survey).
new_data['expense/income'] = (100 * new_data['avg_expense'] / 5) / new_data['avg_money']

new_data = new_data.round(decimals=2)

## Interactive Map - How many iPhones can a Data Scientist buy?

**NOTE**:
<br>Browse the interactive map and find out! You can click on a marker to get some more detailed facts or move on to the charts below the map to do easier comparisons. Unless specified, tooltips are active on all charts to help you understand what you are looking at.
<br><br>The countries shown are based on those for which iPhone price data was readily available.
<br><br>Hover on a marker to see the number of iPhones (iPhone pro max 512 GB) that a data scientist can buy in that country
<br><br>Scroll (pinch) zoom is disabled because I find it horribly annoying.
<br><br>Please see Annex A at the end for choices, justifications and explanations

In [None]:
def iphone_price_marker(row, m):
    """Place one country marker on the folium map ``m``.

    ``row`` is a row of ``new_data``: its name is the country label and it
    carries ``geometry`` (a point), ``monthly_phones``, ``monthly_income``,
    ``iphone_price`` and ``avg_rent``. The hover tooltip shows how many
    iPhones a monthly income buys; the click popup shows the raw figures.
    """
    tooltip_html =f'<b>Country</b>: {row.name}<br><b>You can buy </b>: {row.monthly_phones:.2f} iPhones'
    popup_html =f'<b>Average D.S. salary (USD):</b> {row.monthly_income:.2f}<br><b>iPhone price (USD):</b> {row.iphone_price}<br><b>Average rent (USD):</b> {row.avg_rent}'
    details = folium.Popup(popup_html, max_width=300, min_width=300)
    # folium expects [latitude, longitude], i.e. [y, x] of the point.
    position = [row.geometry.y, row.geometry.x]
    Marker(position, popup=details, tooltip=tooltip_html).add_to(m)


# World view with scroll-wheel zoom switched off.
m = folium.Map(
    location=[20, 20],
    tiles='CartoDB positron',
    zoom_start=2,
    zoom_control=True,
    scrollWheelZoom=False,
)
iphone_adder = partial(iphone_price_marker, m=m)
for _, country_row in new_data.iterrows():
    iphone_adder(country_row)

print('CLICK ON A MARKER TO SEE MORE (Scroll (pinch) zoom is disabled)')

# Fixed-size figure that hosts the map; the last expression renders it.
f = folium.Figure(width=800, height=400)
m.add_to(f)


**Isn't it interesting?**

For Indians this price represents a cost higher than most people's living expenses for a month (for some, even a couple of months)

You can interact with the graphs below to get a sense for the distribution of prices and other related data. A dropdown list lets you highlight countries on the graph for better clarity
and you can hover on a bar for a precise value.

In [None]:
## Four linked bar charts: income, iPhone price, rent/income and rent

height = 300
width = 250

# Alphabetical country list offered by both dropdowns.
dropdown_options = sorted(new_data.index.tolist())
input_dropdown = alt.binding_select(options=dropdown_options)
input_dropdown2 = alt.binding_select(options=dropdown_options)

# Two independent single-country selections, each bound to its own dropdown.
selection = alt.selection_single(fields=['Country'],
                                 bind=input_dropdown,
                                 name='Country 1:',
                                 init={'Country': 'USA'})

selection2 = alt.selection_single(fields=['Country'],
                                  bind=input_dropdown2,
                                  name='Country 2:',
                                  init={'Country': 'India'})


def _highlight_bar_chart(column, title):
    """Return a horizontal bar chart of ``new_data[column]`` per country.

    Bars of the countries picked in either dropdown are drawn in
    ``selection_color``; all other bars are light grey.
    """
    source = new_data.loc[:, [column]].reset_index()
    return alt.Chart(source).mark_bar().encode(
        x=column + ':Q',
        y='Country:O',
        tooltip=['Country', column],
        color=alt.condition(
            (selection | selection2),
            alt.value(selection_color),
            alt.value('lightgrey')),
    ).properties(height=height, width=width, title=title)


chrt = _highlight_bar_chart('monthly_income',
                            'Monthly income (USD) across countries')
# Title previously read "acoss"; corrected to "across".
chrt2 = _highlight_bar_chart('iphone_price',
                             'iPhone price (USD) across countries')
chrt3 = _highlight_bar_chart('rent/income',
                             'Rent to income ratio (%) across countries')
chrt4 = _highlight_bar_chart('avg_rent',
                             'Average rent (USD) across Countries')

print('ratios are displayed as percentages -%\nplease pay attention to axes limits')
# Two columns of two stacked charts; the selections are attached once at the
# top level so both dropdowns drive all four charts.
(alt.hconcat(chrt & chrt2, chrt3 & chrt4)
    .add_selection(selection, selection2)
    .configure_axis(
        labelFontSize=12,
        titleFontSize=12
    ))

### Aside 1: High rent to income ratios

UAE has an interesting value for rent/income.

The value of the ratio is **close to 1**, indicating that:
1. the value of rent is wrong and is biased by higher earners (or)
2. the value of salary has been biased by younger members of society who do not earn money (or)
3. people with lower incomes tend to share their homes/rooms with others (or)
4. salaries have been underreported, or accommodation is taken care of as a part of employment perks

Of course, the spread of reported income also depends on the number of people who have taken part in the survey

In [None]:
# Bar chart of age-bracket counts for respondents from the UAE.
# All rows of `answers` are used: the frame was already narrowed to Data
# Scientist rows at load time, so skipping the first row with `.iloc[1:]`
# would drop a real respondent.
answers.groupby('Q3').Q1.value_counts()['United Arab Emirates'].plot(kind='bar');
plt.grid(True);
plt.title('Age distribution of respondents from UAE');

While the number of people is very low, it also seems like the number of respondents in the 30-34 range is the highest of all,
indicating that either salaries are underreported or the rent value on Numbeo is inflated (or there are employee benefits at play).

## What is the ratio between Data Scientist Salaries and Data Science infrastructure cost?

Let's compare the same countries as before.

This is important to look at because there are no regional discounts for using cloud resources or hardware accelerators in countries with poor economic power/lower standard of living and consequently lower salaries.

In [None]:
def expense_marker(row, m):
    """Place one country marker showing AI spend against salary on map ``m``.

    ``row`` is a row of ``new_data``: its name is the country label and it
    carries ``geometry`` (a point), ``expense/income``, ``avg_money`` and
    ``avg_expense``. The hover tooltip shows the expense-to-salary
    percentage; the click popup shows the yearly salary and ``avg_expense``
    divided by 5.
    """
    tooltip_html =f'<b>Country</b>: {row.name}<br><b>Expense/Salary</b>: {row.loc["expense/income"]:.2f}%'
    popup_html =f'<b>Average D.S. yearly salary (USD):</b> {row.avg_money:.2f}<br><b>Average expense (yearly) (USD):</b> {row.avg_expense/5:.2f}'
    details = folium.Popup(popup_html, max_width=300, min_width=300)
    # folium expects [latitude, longitude], i.e. [y, x] of the point.
    position = [row.geometry.y, row.geometry.x]
    Marker(position, popup=details, tooltip=tooltip_html).add_to(m)


# World view with scroll-wheel zoom switched off.
m = folium.Map(
    location=[20, 20],
    tiles='CartoDB positron',
    zoom_start=2,
    zoom_control=True,
    scrollWheelZoom=False,
)
expense_adder = partial(expense_marker, m=m)
for _, country_row in new_data.iterrows():
    expense_adder(country_row)

print('CLICK ON A MARKER TO SEE MORE (Scroll (pinch) zoom is disabled)')

# Fixed-size figure that hosts the map; the last expression renders it.
f1 = folium.Figure(width=800, height=400)
m.add_to(f1)


While the values appear to be small, it is unclear if this relationship is caused by the amount of pay or the nature (aggressiveness) of investment.
Take a look at the graphs below to understand more.

Let's examine the two extremes in the data: China and Belgium (2nd bar from top - use the dropdown for ease).

Salaries seem to be in the same region for both countries, but the investment in AI is on completely different scales. If the data is assumed correct then it seems like Belgian companies are still new to the space, while China is one of the highest spenders in Asia.

Another issue with this combined question is that it does not highlight the investment an individual might make to learn this because the data has possibly been cross contaminated with expenses from organizations


In [None]:
# Four linked bar charts: AI expense, monthly income, expense/income, rent.
height, width = 300, 250

# Alphabetical country list offered by both dropdowns.
dropdown_options = sorted(new_data.index.tolist())
input_dropdown = alt.binding_select(options=dropdown_options)
input_dropdown2 = alt.binding_select(options=dropdown_options)

# Two independent single-country selections, each bound to its own dropdown.
selection = alt.selection_single(
    fields=['Country'],
    bind=input_dropdown,
    name='Country 1:',
    init={'Country':'USA'},
)
selection2 = alt.selection_single(
    fields=['Country'],
    bind=input_dropdown2,
    name='Country 2:',
    init={'Country':'India'},
)


def _country_bar(frame, column, title):
    """Return a per-country horizontal bar chart of ``frame[column]``.

    ``frame`` is indexed by country. Bars of the countries picked in either
    dropdown use ``selection_color``; the rest are light grey.
    """
    highlight = alt.condition(
        (selection | selection2),
        alt.value(selection_color),
        alt.value('lightgrey'),
    )
    bars = alt.Chart(frame.reset_index()).mark_bar().encode(
        x=column + ':Q',
        y='Country:O',
        tooltip=['Country', column],
        color=highlight,
    )
    return bars.properties(height=height, width=width, title=title)


# avg_expense is divided by 5 before plotting, as in the map popup.
chrt = _country_bar(new_data.loc[:,['avg_expense']]/5, 'avg_expense',
                    'Average AI/ML infrastructure expense (USD) by country')
chrt2 = _country_bar(new_data.loc[:,['expense/income']], 'expense/income',
                     'The infrastructure expense to income ratio(%) by country')
chrt3 = _country_bar(new_data.loc[:,['monthly_income']], 'monthly_income',
                     'Average monthly income by country (in USD)')
chrt4 = _country_bar(new_data.loc[:,['avg_rent']], 'avg_rent',
                     'Average rent (in USD) by country')

print('ratios are displayed in terms of percentage (out of 100%)')

# Left column: expense over income; right column: ratio over rent.
(alt.hconcat(chrt & chrt3, chrt2 & chrt4)
    .add_selection(selection, selection2)
    .configure_axis(labelFontSize=12, titleFontSize=12))

## An India vs USA comparison: 

In [None]:
# Number of respondent rows per country (Q3); counted once, printed twice.
responses_per_country = answers.Q3.value_counts()
print('Number of responses from USA is:', responses_per_country[USA])
print('Number of responses from India is:', responses_per_country['India'])

### A sample comparison of buying power

The graphs below show sample costs for a variety of products. Prices are displayed in terms of their fraction (of 100%) with respect to monthly salary.

In [None]:
# Express each price as a percentage of the country's monthly income.
# `.values` makes the division positional, so this assumes the rows of
# `cost_living` are ordered India, USA - TODO confirm against the CSV.
income_per_percent = new_data.loc[['India', 'USA'], 'monthly_income'].values/100
norm_cost_living = cost_living.divide(income_per_percent, axis=0).round(decimals=4)


def _cost_share_chart(column, title):
    """Return a small per-country bar chart of ``norm_cost_living[column]``."""
    bars = alt.Chart(norm_cost_living.reset_index()).mark_bar().encode(
        x=column + ':Q',
        y='Country:O',
        tooltip=['Country', column],
    )
    return bars.properties(height=80, width=300, title=title)


chrt = _cost_share_chart(
    'Gasoline_per_L',
    'Average cost for petrol(gas) as a fraction(%) of monthly salary')
chrt2 = _cost_share_chart(
    'Diesel_per_L',
    'Average cost for diesel as a fraction(%) of monthly salary')
chrt3 = _cost_share_chart(
    'Electricity_per_KWHr',
    'Average cost for electricity as a fraction(%) of monthly salary')
chrt4 = _cost_share_chart(
    'Food_for_2_out',
    'Average cost for a meal as a fraction(%) of monthly salary')
chrt5 = _cost_share_chart(
    'Jeans',
    'Average cost for jeans as a fraction(%) of monthly salary')
chrt6 = _cost_share_chart(
    'Car (Hatchback)',
    'Average cost for car as a fraction(%) of monthly salary')

print('ratios are displayed in terms of percentage (out of 100%)')

# Two columns of three stacked charts.
left_column = chrt & chrt2 & chrt3
right_column = chrt4 & chrt5 & chrt6
(left_column | right_column).configure_axis(labelFontSize=12, titleFontSize=12)

The graphs above show the tremendous buying power of the American Dollar when compared to the Indian Rupee. These comparisons go beyond simple life and even into grander opportunities.

Think about international flights, gadgets, etc. Chances are prices don't change significantly across geographies unless there is a large fraction of manual labour involved - at which point, cost of labor factors into price. This is especially true of gadgets that are designed by and for the developed countries, and outsourced to a single country for manufacture.


**See Annex D for a true value version of this graph**

### What are the distributions like for degrees earned?

In [None]:
# Respondent counts per degree (rows, Q4) and country (columns, Q3),
# restricted to India and the USA.
india_or_usa = answers.Q3.str.contains('India|United States of America',
                                       regex=True)
temp_df = answers.loc[india_or_usa].groupby('Q4').Q3.value_counts().unstack()

# Same table as a percentage of each country's column total.
temp_df_norm = temp_df.div(temp_df.sum(axis=0)).mul(100).round(decimals=2)


def _degree_chart(frame, column, title):
    """Return a bar chart of ``frame[column]`` for the three degree levels."""
    source = frame.loc[degrees_in_order_spec].reset_index()
    bars = alt.Chart(source).mark_bar().encode(
        x=column + ':Q',
        y='Q4:O',
        tooltip=[column, 'Q4'],
    )
    return bars.properties(height=120, width=250, title=title)


# Left charts hold absolute counts, right charts hold percentages.
chrt = _degree_chart(temp_df, 'India', 'India respondents by qualification')
chrt3 = _degree_chart(temp_df_norm, 'India',
                      'India respondents by qualification (%)')
chrt2 = _degree_chart(temp_df, USA, 'USA respondents by qualification')
chrt4 = _degree_chart(temp_df_norm, USA,
                      'USA respondents by qualification (%)')

print('NOTE:   the axis values are different for graphs on the left')
print('\tratios are displayed in terms of percentage (out of 100%)')

india_row = chrt | chrt3
usa_row = chrt2 | chrt4
(india_row & usa_row).configure_axis(labelFontSize=12, titleFontSize=12)

### Observation:
The most interesting trend here is the fact that the fraction of **Master's Degree** holders in the USA outranks the other two classes. 
Unless the data is unnaturally biased in the survey, this should not be the case since the number of masters students/degree holders must always be less than the number of bachelors students or degree holders.

Why? The Bachelor's degree is an essential for entry into either of the next two levels.

**This is our first hint at the fact that there is global interest in the USA as a hub for study and advanced employment.**

We can confirm this by checking the spread of the survey across age groups.

Furthermore, you may notice that the India and USA graphs are mirrored. India has more undergrads while the USA has more postgrads, further strengthening the case that there is a flow of students from India to the USA (among other destinations).


In [None]:
# Side-by-side age distributions (share of respondents per age bracket) for
# the USA and India, on a shared y-axis.
fig, ax = plt.subplots(1, 2, figsize=(16, 6), sharey=True)

# Counts per (country, age bracket). All rows of `answers` are used: the
# frame was already narrowed to Data Scientist rows at load time, so
# skipping the first row with `.iloc[1:]` would drop a real respondent.
age_counts = answers.groupby('Q3').Q1.value_counts()

# reindex (rather than .loc) puts the brackets in age order and shows a
# bracket with no respondents as 0 instead of raising KeyError.
temp_df = age_counts[USA].reindex(age_in_order, fill_value=0)
temp_df = temp_df / temp_df.sum()

temp_df.plot(kind='bar', ax=ax[0]);
ax[0].set_title('Age Distribution for USA survey takers');
ax[0].grid(True)

temp_df = age_counts['India'].reindex(age_in_order, fill_value=0)
temp_df = temp_df / temp_df.sum()

temp_df.plot(kind='bar', ax=ax[1]);
ax[1].set_title('Age Distribution for India survey takers');
ax[1].grid(True)

print('NOTE:   No tool tip for plot below\n\tY-axis is shared')

plt.suptitle('Age distribution in USA and India', fontsize=16)
plt.show();

### Observation:

India has a distribution that is shifted to the younger side in comparison to the USA. While this might immediately highlight the reason for the mirroring discussed earlier, it does not explain why the USA has more masters students in the survey. After all, not all bachelor's graduates go on to study more. Some are content working with their current education level.

The only problem with the argument is that it can only be trusted with a population-level survey.

In our case, the number of respondents from America (just shy of 400) is too low to make this generalization.

**See annex B for a version of this graph with an absolute count scale y-axis**

### Aside 2:

It is clear that interest in Data Science is quite new in India compared to USA.

In [None]:
# Degree mix within each of the three youngest age brackets, USA vs India.
fig, ax = plt.subplots(1, 2, figsize=(14, 6), sharey=True)

panels = [
    (ax[0], USA, 'Distribution in USA by age and degree'),
    (ax[1], 'India', 'Distribution in India by age and degree'),
]
for axis, country, panel_title in panels:
    # Counts per (age bracket, degree) for this country, limited to the
    # selected brackets (rows) and degree levels (columns).
    degree_counts = (answers.loc[answers.Q3 == country]
                     .groupby('Q1').Q4.value_counts()
                     .unstack())
    temp_df = degree_counts.loc[age_in_order_spec, degrees_in_order_spec]
    # Normalise each age bracket so its bars sum to 1.0.
    temp_df = temp_df.divide(temp_df.sum(axis=1), axis=0)

    temp_df.plot(kind='bar', ax=axis)
    axis.set_title(panel_title, fontsize=12)
    axis.grid(True)

plt.suptitle('Comparison of distributions between India and USA', fontsize=16)
print('Note:   No Tooltip for plot below\n\tFractions sum upto 1.0 for each age group\n\tY-axis is shared')

plt.show();

While no definitive conclusion can be made about the nature of student immigration, it is still interesting to see the difference in spread between distributions in America and India. The bars are all fractions of 1.0 and they add up to 1.0 for each age group. E.g. USA (18-21) has ~65% bachelors holders and ~35% masters holders. 

**See annex C for a version of this graph with an absolute count scale y-axis**

### Conclusion

I hope you enjoyed the journey so far. In the future I hope to do something more extensive in this area. But for now, maybe think twice about buying a new iPhone to "fix" a broken screen or buying one every year? :P

In all seriousness though, the final section of this notebook was meant to explore why Indians choose to immigrate abroad. This was meant to include an analysis of loan repayment options and why Indians have certain advantages over native citizens.

Empirically, Indians have three main advantages. First, on average, they are younger than native citizens when they enter postgraduate programs (Indians finish school at 16-17, at least 1 year earlier than American children). Second, Indians do not have massive undergraduate loans to worry about that amount to tens of thousands of dollars. This is their biggest advantage over Americans because it means that they can pay off their loans faster than Americans who need to reconcile the contrasting perspectives of more dollars in loans and a better salary. Many choose to stay content with just an undergraduate degree because of this (which is what I wanted to explore in the last part of the notebook). Given that they are younger and have more time left in their careers, this effect often balloons. Finally, Indians are used to a lower standard of living than their American peers. Many of the things that seem ordinary to Americans are luxuries that Indians are able to live without while they wrap up their loans.

Again empirically, the flip-side to this is the risk involved for Indians. Coming back to India without paying off your student loans is a bad place to be in. Given that the salary is about a tenth of what you would earn in the USA (this number was taken from this dataset), you are looking at a significantly longer repayment schedule in India, especially once you consider living costs.

## Annex

### Annex A: Notes and Caveats

1. While comparing different countries, I have made no attempt to include tax calculations. This is largely because tax deductions are varied and depend on the special circumstances of each person. It is more straightforward to leave them out.

2. I have included only Data Scientists for this analysis. However, Data Scientists only account for about 15-20% of the survey respondents. The reason I stuck with the choice is in part due to my desire for consistency (with the title and the comical aspect of the data analysis) but also because I did not want to involve more career paths that have smaller and differing levels of interaction with Data Science.

3. I have only accounted for Bachelor, Master, and Doctorate programs. This is because these are the traditional paths available to Indian students who want to work abroad. Including the remaining qualifications would not be a fair comparison because there is no bridge or cross-over between them to compare on. You do not have a dropout from India going and working for a FAANG company in the USA. Likewise, given the intense competition, it is unlikely for plain certificate holders to work in the USA. Remote work is an option, one that even I have taken advantage of, but the opportunity is very niche.

4. I have made use of percentages for buying power comparisons because it is the most straightforward way for someone to understand the extent of strength or weakness in their local currency. PPP (Purchasing Power Parity) is a similar concept, but I wanted to show how it may not be an entirely accurate measure. Sadly, I was not able to get the required data that would allow me to check if raw material prices at the industrial level do not change depending on the country buying.

5. All additional data was gathered manually. I have explained further in the dataset description ([link](https://www.kaggle.com/aashishghosh/iphone-prices-and-average-rent-across-the-world)). Data was collected from three sites: www.themacindex.com, www.numbeo.com and www.globalpetrolprices.com. Global petrol prices had a special CC (CC Attribution Noncommercial No derivs) license and I clarified this use case (kaggle competition) via email with the owners of the data.

6. The low number of responses makes it hard to make definite statements regarding the industry and the world. The best I can personally hope for is something along the lines of: "Look, this is how _this data_ is spread". However, the general intuition regarding PPP is what I wanted to demonstrate the most, and I think I have succeeded there.

7. I wish I could delve deeper into data that would allow me to analyse immigration from India and the motivations to do so. I suppose this will need to wait for more data. This is of personal interest to me because I wonder if understanding how to remove this income disparity holds the key to removing income disparities all over the world, including countries with strong economic power. Some of these countries still have extremely poor people and large disparities in income.

### Annex B:

In [None]:
# Absolute-count version of the USA / India age distribution, shared y-axis.
fig, ax = plt.subplots(1, 2, figsize=(16, 6), sharey=True)

# Counts per (country, age bracket). All rows of `answers` are used: the
# frame was already narrowed to Data Scientist rows at load time, so
# skipping the first row with `.iloc[1:]` would drop a real respondent.
age_counts = answers.groupby('Q3').Q1.value_counts()

# reindex (rather than .loc) puts the brackets in age order and shows a
# bracket with no respondents as 0 instead of raising KeyError.
age_counts[USA].reindex(age_in_order, fill_value=0).plot(kind='bar', ax=ax[0]);
ax[0].set_title('Age Distribution for USA survey takers');
ax[0].grid(True)

age_counts['India'].reindex(age_in_order, fill_value=0).plot(kind='bar', ax=ax[1]);
ax[1].set_title('Age Distribution for India survey takers');
ax[1].grid(True)

print('NOTE:   no tool tip for plot below\n\tY-axis is shared')
plt.suptitle('Age distribution in USA and India', fontsize=16)
plt.show();

### Annex C:

In [None]:
# Absolute-count version of the age-by-degree comparison; each panel keeps
# its own y-axis.
fig, ax = plt.subplots(1, 2, figsize=(14, 6))

panels = [
    (ax[0], USA, 'Distribution in USA by age and degree'),
    (ax[1], 'India', 'Distribution in India by age and degree'),
]
for axis, country, panel_title in panels:
    # Counts per (age bracket, degree) for this country, limited to the
    # selected brackets (rows) and degree levels (columns).
    degree_counts = (answers.loc[answers.Q3 == country]
                     .groupby('Q1').Q4.value_counts()
                     .unstack())
    selected = degree_counts.loc[age_in_order_spec, degrees_in_order_spec]
    selected.plot(kind='bar', ax=axis)

    axis.set_title(panel_title, fontsize=14)
    axis.grid(True)

plt.suptitle('Comparison of distributions between India and USA', fontsize=20)
print('Note:   No Tooltip for plot below\n\tY-axis is not shared')

plt.show();

### Annex D:

In [None]:
def _raw_cost_chart(column, title):
    """Return a small per-country bar chart of ``cost_living[column]``."""
    bars = alt.Chart(cost_living.reset_index()).mark_bar().encode(
        x=column + ':Q',
        y='Country:O',
        tooltip=['Country', column],
    )
    return bars.properties(height=80, width=300, title=title)


# Unnormalised prices from the cost-of-living table, one chart per item.
chrt = _raw_cost_chart('Gasoline_per_L', 'Average cost for petrol(gas) (USD)')
chrt2 = _raw_cost_chart('Diesel_per_L', 'Average cost for diesel (USD)')
chrt3 = _raw_cost_chart('Electricity_per_KWHr',
                        'Average cost for electricity (USD)')
chrt4 = _raw_cost_chart('Food_for_2_out', 'Average cost for a meal (USD)')
chrt5 = _raw_cost_chart('Jeans', 'Average cost for jeans (USD)')
chrt6 = _raw_cost_chart('Car (Hatchback)', 'Average cost for car (USD)')

# Two columns of three stacked charts.
left_column = chrt & chrt2 & chrt3
right_column = chrt4 & chrt5 & chrt6
(left_column | right_column).configure_axis(labelFontSize=12, titleFontSize=12)