# Let's understand how India is adapting to the world of Artificial Intelligence

In [None]:
## Importing Libraries ##

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from IPython.display import Markdown, display, Image
import pycountry
import plotly.offline as py

pd.set_option('display.max_columns', 500)
#sns.set()

%matplotlib inline
warnings.filterwarnings('ignore')

# Render every figure at 1000 dpi.
# NOTE(review): this is a very high resolution for inline figures (a 16x5 inch
# figure becomes 16000x5000 px) -- confirm it is intentional.
plt.rcParams['figure.dpi'] = 1000
# Initial palette; both COLOR_PALETTE and colors are reassigned further down in
# this cell, so these two values are only temporary.
COLOR_PALETTE = 'Blues_d'
colors = sns.color_palette(COLOR_PALETTE).as_hex()
#sns.set_palette(COLOR_PALETTE)
#sns.color_palette(COLOR_PALETTE);

## Setting Figure Parameters
# Light grey figure/axes background with black borders and a white major grid
# drawn underneath the plotted artists (axes.axisbelow).
plt.rcParams["figure.edgecolor"] = "black"
plt.rcParams["figure.facecolor"] = "#f0f0f0"
plt.rcParams["axes.facecolor"] = "#f0f0f0"
plt.rcParams["figure.frameon"] = True
plt.rcParams["grid.color"] = "white"
plt.rcParams["axes.edgecolor"] = "black"
plt.rcParams["axes.linewidth"] = 1
plt.rcParams['legend.edgecolor'] = 'black'
plt.rcParams["legend.frameon"] =True
plt.rcParams["lines.linewidth"] = 1.8
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = 'Helvetica'
plt.rcParams['axes.grid']= True
plt.rcParams['axes.grid.axis']= 'both'
plt.rcParams['axes.grid.which']= 'major'
plt.rcParams['axes.axisbelow'] = True

#plt.rcParams['image.cmap'] = 'tab20b'

# colors =['#4287f5','#0177bb' ,  '#344b5b']
# COLOR_PALETTE = sns.color_palette(colors)
#colors = ['#1d7bff', '#4287f5','#0177bb' ,'#356384']
# colors = [
#     "#30a2da",
#     "#fc4f30",
#     "#e5ae38",
#     "#6d904f",
#     "#8b8b8b",
# ]

# COLOR_PALETTE = sns.color_palette(colors)

# Final palette used by the helper functions below: five blue shades, installed
# as the seaborn default palette.
colors =['#1E90FF','#0040ff', '#367bac', '#356384' ,  '#344b5b']
COLOR_PALETTE = sns.color_palette(colors)
sns.set_palette(COLOR_PALETTE)

# Two-colour palette built from the first and last entries of `colors`.
pointplot_palette = sns.color_palette([colors[0], colors[-1]])

#plt.style.use('fivethirtyeight')

#plt.style.use('fivethirtyeight')

In [None]:


sns.palplot(sns.color_palette(colors))

In [None]:
Image('../input/images/39369.jpg')

# <a> Dataset </a>

I am using a modified version of the dataset which is a combined version of data from 2017 to 2020. I created [this dataset](https://www.kaggle.com/harveenchadha/kaggle-survey-20172020-merged-data) using [this notebook](https://www.kaggle.com/harveenchadha/merging-all-historical-survey-data-2017-2020/comments)


In [None]:
## Reading merged data 

df_all_data = pd.read_csv('../input/kaggle-survey-20172020-merged-data/kaggle_survey_17_20_v2.csv')
mcq = df_all_data.copy()

In [None]:
country = 'India'
country_to_compare= 'United States of America'
current_year = 2020

# <a>Storyline</a>

Looking at the data, I uncovered many secrets about the Data Science landscape in India. I wanted to rush to the head of the country and tell him about the next superpower in the world, but no matter what, some things are just not possible.

This survey affected my subconscious mind so much that, in my dream, I fulfilled my dream. Hard to understand? I am here to explain. In my dream, the Prime Minister of India, Mr. Narendra Modi, calls me to get some insights about the AI community in India. What follows is a true, step-by-step account of what exactly happened in my dream.

So let's start!

<!-- ![](https://images.livemint.com/img/2019/08/30/600x338/20190808295L_1565277936009_1567148847532.jpg) -->

<!-- # <a>Questions Answered in this notebook</a>

1. What are the total re -->

In [None]:
## Helper Functions for printing conversational dialogues

def pm(text):
    """Display `text` as one of the Prime Minister's dialogue lines.

    The text is wrapped in a light-grey bordered <div> with an italic
    sans-serif <span> and rendered through IPython's Markdown display.
    """
    opening = (
        "<div style= 'background-color:rgb(247, 247, 247); border:1px solid rgb(207,207,207); "
        "border-color:rgb(207,207,207); padding: 15px'>"
        "    <span style=\"color: black; font-size: 16px; "
        "font-family:  'Helvetica Neue', Helvetica, Arial, sans-serif; "
        "font-weight: 400;  letter-spacing: 0.004em; line-height: 1.58; font-style: italic;\"> "
    )
    closing = " </span></div>"
    display(Markdown(opening + text + closing))
    
    
def response(text):
    """Display `text` as one of the narrator's ("Me") dialogue lines.

    The text is wrapped in a light-grey bordered <div> with a serif <span>
    and rendered through IPython's Markdown display.
    """
    container_style = (
        "background-color:rgb(247, 247, 247); border:1px solid rgb(207,207,207); "
        "border-color:rgb(107,107,107); padding: 15px"
    )
    # The span's style attribute is delimited with double quotes because the
    # font list contains a single-quoted family name ('Times New Roman').
    # Delimiting it with single quotes ended the attribute at that name and
    # dropped the font-weight / letter-spacing / line-height declarations.
    text_style = (
        "color: black; font-size: 16px; "
        "font-family: medium-content-serif-font, Georgia, Cambria, 'Times New Roman', Times, serif; "
        "font-weight: 400; letter-spacing: -0.004em; line-height: 1.58;"
    )
    my_response = (
        "<div style= '" + container_style + "'> "
        "<span style=\"" + text_style + "\">" + text + "</span></div>"
    )
    display(Markdown(my_response))

In [None]:
## Helper function to get the counts of unique classes per column. This is a configurable function.
def value_counts(df, column, normalize= True, rename= 'Percentage', return_percent= True):
    """Return the distribution of `df[column]` as a two-column DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    column : str
        Column whose unique values are counted.
    normalize : bool
        When True the values are relative frequencies, otherwise raw counts.
    rename : str or None
        Name given to the value column.
    return_percent : bool
        When True (and `normalize` is True) frequencies are scaled to 0-100;
        when False they are left as fractions in 0-1.

    Returns
    -------
    pandas.DataFrame
        One row per unique value, ordered as `Series.value_counts` orders them,
        with the category column named `column` and the value column named by
        `rename`.
    """
    counts = df[column].value_counts(normalize=normalize).rename(rename)
    # Previously the normalize=True path only returned a frame when both
    # `rename` and `return_percent` were set and fell through to None otherwise.
    if normalize and return_percent:
        counts = counts.mul(100)
    mod_df = counts.reset_index()
    # Older pandas names the category column 'index'; newer pandas already
    # names it after `column`, in which case this rename is a no-op.
    mod_df = mod_df.rename(columns={'index': column})
    return mod_df

    
## Helper function to annotated bar plots
def show_bar_plot(x , y,  data, hue = None, axis = 'vertical', title= None, show_percent = True, xlabel = None, ylabel= None, legend= None):
    """Draw a seaborn bar plot on the current axes and annotate each bar.

    x, y, data, hue are forwarded to `sns.barplot`. Without `hue` the bars use
    the first entry of `colors`; with `hue` they use `COLOR_PALETTE`.
    `axis` selects where the '{value}%' labels go: above the bars
    ('vertical') or to the right of them ('horizontal'). Labels are only drawn
    when `show_percent` is true. `legend` is accepted but not used.
    """
    if hue is None:
        ax = sns.barplot(y = y , x = x, data = data, color=colors[0])
    else:
        ax = sns.barplot(y = y , x = x, hue=hue, data = data, palette=(COLOR_PALETTE))
    plt.title(title)

    if show_percent and axis == 'vertical':
        for bar in ax.patches:
            # Bar top edge doubles as the value being displayed.
            top = bar.get_y() + bar.get_height()
            label = '{:.1f}%'.format(top)
            ax.text(bar.get_x() + bar.get_width()/2, top + 0.4, label, ha="center")
    elif show_percent and axis == 'horizontal':
        for bar in ax.patches:
            # Bar right edge doubles as the value being displayed.
            right = bar.get_x() + bar.get_width()
            label = '{:.1f}%'.format(right)
            ax.annotate(label, (right + 0.2, bar.get_y() + bar.get_height()/2))

    if xlabel is not None:
        plt.xlabel(xlabel)
    if ylabel is not None:
        plt.ylabel(ylabel)

    plt.show()

    
def show_worldmap(df, column, title):
    """Plot a plotly choropleth of how often each country appears in `df[column]`.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    column : str
        Column holding the country identifier of each row.
    title : str
        Figure title.
    """
    def get_name(code):
        '''
        Translate an ISO alpha-3 code to the country name; values that cannot
        be resolved are returned unchanged.
        '''
        try:
            return pycountry.countries.get(alpha_3=code).name
        except Exception:
            # Best-effort lookup: anything that is not a known alpha-3 code
            # (e.g. an already spelled-out country name) is kept as-is.
            return code

    counts = df[column].value_counts()
    # Build the frame explicitly instead of relabelling the columns of
    # DataFrame(value_counts()). The old code then called
    # reset_index().drop(columns=['index'], inplace=True) on a temporary, which
    # changed nothing on older pandas and raises KeyError on pandas >= 2.0,
    # where the reset column is named after `column`.
    country_number = pd.DataFrame({
        'number': counts.to_numpy(),
        'country': [get_name(c) for c in counts.index],
    })

    worldmap = [dict(type = 'choropleth', locations = country_number['country'], locationmode = 'country names',
                     z = country_number['number'], colorscale = "Blues", reversescale = True, 
                     marker = dict(line = dict( width = 0.5)), 
                     colorbar = dict(autotick = False, title = 'Number of respondents'))]

    layout = dict(title = title, geo = dict(showframe = False, showcoastlines = True, 
                                                                    projection = dict(type = 'Mercator')))

    fig = dict(data=worldmap, layout=layout)
    # validate=False is kept so the figure dict is passed through unchecked.
    py.iplot(fig, validate=False)
    

## Helper function to break labels if they are lengthy. Labels are broken by spaces.
def break_labels(df, column):
    """Replace every space in the string labels of `df[column]` with a newline.

    The column is modified in place on `df` (callers rely on the returned
    frame being the same object) and `df` is returned. Non-string values such
    as NaN or numbers are left untouched instead of raising on `.split`.
    """
    def _break(label):
        # Same result as "\n".join(label.split(' ')) for strings.
        return label.replace(' ', '\n') if isinstance(label, str) else label

    df[column] = df[column].map(_break)
    return df

    
## Helper function to show point plots
def show_point_plot(x , y,  data= None, hue = None, axis = 'vertical', title= None, show_percent = False, xlabel = None, ylabel= None, legend= None):
    """Draw a seaborn point plot on a new 16x5 figure.

    With `data`, x/y/hue are column names; the x labels are first passed
    through `break_labels` (which edits `data` in place). Without `data`,
    x/y/hue are passed to seaborn directly. When `show_percent` is true every
    plotted point is annotated with its y value: as an integer slightly
    left of and 100 units above the point in the `data` case, and rounded to
    one decimal at the point itself otherwise. `axis` and `legend` are
    accepted but not used.
    """
    fig, ax = plt.subplots(figsize=(16,5))
    fig.linewidth = 10
    fig.edgecolor="#04253a"

    has_frame = data is not None
    if has_frame:
        data = break_labels(data, x)

    plt.title(title)

    if has_frame:
        if hue is None:
            ax = sns.pointplot( x = x, y = y , data = data, color=colors[0])
        else:
            ax = sns.pointplot(x = x, y = y ,  hue=hue, data = data, markers=["o", "x"], color=colors[0])
    else:
        if hue is None:
            ax = sns.pointplot(y = y , x = x, color=colors[0])
        else:
            ax = sns.pointplot(y = y , x = x, hue=hue, color=colors[0])

    if show_percent:
        for collection in ax.collections:
            for point in collection.get_offsets():
                if has_frame:
                    label = str(int(round(point[1],1)))
                    ax.annotate(label, [point[0]-0.1, point[1]+100])
                else:
                    label = str(round(point[1],1))
                    ax.annotate(label, point)

    if xlabel is not None:
        plt.xlabel(xlabel)
    if ylabel is not None:
        plt.ylabel(ylabel)
    plt.show()

        
## Helper function to combine dataframe of selected country with data frames of rest of the world. Certain checks are done to make sure the lists sizes are equal
def combine_row_country(df_country, df_row, country_compare='ROW', country_name=None):
    """Stack two distribution frames and tag each row with its geography.

    Parameters
    ----------
    df_country : pandas.DataFrame
        Distribution for the selected country; its first column holds the
        categories.
    df_row : pandas.DataFrame
        Distribution for the comparison group, with the same layout.
    country_compare : str
        Label written to 'Geography' for the rows of `df_row`.
    country_name : str or None
        Label written to 'Geography' for the rows of `df_country`; defaults to
        the notebook-level `country` variable.

    Returns
    -------
    pandas.DataFrame
        Both inputs concatenated with a 'Geography' column. Every category
        that appears on only one side gets a row with 'Percentage' 0 for the
        other side, so both geographies cover the same categories.

    The inputs are not modified. Missing categories are detected by comparing
    the actual values in both directions rather than the number of unique
    values, so equally sized but different category sets are handled too.
    """
    if country_name is None:
        country_name = country

    # assign() returns new frames, leaving the caller's objects untouched.
    df_country = df_country.assign(Geography=country_name)
    df_row = df_row.assign(Geography=country_compare)
    concat = pd.concat([df_country,  df_row], axis = 0).reset_index(drop=True)

    column = list(df_country.columns)[0]
    country_values = list(df_country[column].unique())
    row_values = list(df_row[column].unique())

    fillers = [[value, 0, country_compare] for value in country_values if value not in row_values]
    fillers += [[value, 0, country_name] for value in row_values if value not in country_values]
    if not fillers:
        return concat

    new_df = pd.DataFrame(fillers, columns=[column, 'Percentage', 'Geography'])
    return pd.concat([concat,  new_df], axis = 0).reset_index(drop=True)


## Helper function to map column using a dictionary
def map_column(df, col, dict_map):
    """Translate `df[col]` through `dict_map` and drop the unmapped rows.

    The mapping is written back to `df[col]` on the passed frame; the return
    value is a re-indexed frame containing only the rows whose value had an
    entry in `dict_map`.
    """
    df[col] = df[col].map(dict_map)
    mapped_rows = df[col].notnull()
    return df[mapped_rows].reset_index(drop=True)

## Helper function to combine historical values from previous 4 years
def combine_historical_data(df_2017, df_2018, df_2019, df_2020, col, map_dict= None):
    """Build the 2017-2020 percentage distribution of `col` in one long frame.

    Each yearly frame is summarised with `value_counts`, tagged with its year,
    and the four summaries are concatenated in chronological order. 'Year' is
    stored as a string. When `map_dict` is given, `col` is translated with
    `map_column`, which also drops values that have no mapping.
    """
    yearly_frames = ((2017, df_2017), (2018, df_2018), (2019, df_2019), (2020, df_2020))

    summaries = []
    for year, frame in yearly_frames:
        summary = value_counts(frame, col)
        summary['Year'] = year
        summaries.append(summary)

    historical_concat = pd.concat(summaries)
    historical_concat.Year = historical_concat.Year.astype('str')

    if map_dict is None:
        return historical_concat
    return map_column(historical_concat, col, map_dict)



## Helper function to apply group by using multiple parameters
def multiple_group_by(df, groupby, map_col1 = None, map_col2 = None, year= None):
    """Compute, for each value of the first group key, the percentage share of each value of the second.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it must contain an 'index' column, which is what gets
        counted.
    groupby : list
        Column names to group by. groupby[0] is the outer key and groupby[1]
        the inner key.
    map_col1, map_col2 : dict or None
        Optional mappings applied with `map_column` to the outer key (before
        the percentages are computed) and to the inner key (after).
    year : optional
        When given, written to a 'Year' column of the result.

    Returns
    -------
    pandas.DataFrame
        The grouped counts (column 'index') with an added 'Percentage' column:
        count divided by the total count of the same outer-key value, times 100.
    """
    
    # Count of non-null 'index' entries per key combination.
    grouped = df.groupby(groupby).agg('count')[['index']].reset_index()
    col1 = groupby[0]
    col2 = groupby[1]
    
    if map_col1 is not None:
        grouped = map_column(grouped, col1, map_col1)
    
    col1_unique = grouped[col1].unique()
    col2_unique = grouped[col2].unique()
    

    # Result accumulator; one slice is appended per (outer, inner) pair.
    _pd_new = pd.DataFrame()

    for col_1 in col1_unique:
        filter_group = grouped[grouped[col1] == col_1]
        # Denominator: all counts that belong to this outer-key value.
        sum_group = filter_group['index'].sum()
        
        for col_2 in col2_unique:
            filter_group_2 = filter_group[(filter_group[col2] == col_2)]
            # More than one row for the same pair: re-aggregate on the group
            # keys (e.g. identical key combinations produced when map_col1
            # maps several categories to one label are summed).
            if(len(filter_group_2) > 1):
                filter_group_2 = filter_group_2.groupby(groupby).agg('sum')[['index']].reset_index()
            # NOTE(review): when the branch above is not taken this assigns to
            # a filtered slice of `grouped` (pandas chained-assignment pattern).
            filter_group_2['Percentage'] = filter_group_2['index'] / sum_group * 100
            _pd_new = pd.concat([_pd_new, filter_group_2])
    
    if map_col2 is not None:
        _pd_new = map_column(_pd_new, col2, map_col2)
    if year is not None:
        _pd_new['Year'] = year
    return _pd_new


## Helper function to combine multiple answers into one column
def multiple_answers(limit, df, col, mod_name):
    """Collapse a multi-select question spread over numbered columns into one table.

    Parameters
    ----------
    limit : int
        Number of answer columns; columns `col + '1'` .. `col + str(limit)`
        are read.
    df : pandas.DataFrame
        Source frame.
    col : str
        Common prefix of the answer columns.
    mod_name : str
        Name of the answer-label column in the result.

    Returns
    -------
    pandas.DataFrame
        Columns [mod_name, 'Count'], one row per answer column, holding that
        column's most frequent value and how often it occurs, sorted by
        'Count' descending. Answer columns with no values at all are skipped
        (they used to raise IndexError).
    """
    dict_local = {}

    for i in range(1, limit+1):
        # Computed once per column instead of twice.
        counts = df[col + str(i)].value_counts()
        if counts.empty:
            continue
        dict_local[counts.index[0]] = counts.iloc[0]

    new_df = pd.DataFrame(list(dict_local.items()), columns=[mod_name, 'Count'])
    new_df = new_df.sort_values(by='Count', ascending=False)
    return new_df

In [None]:
df_20  =pd.read_csv('../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv')


In [None]:
df_20[df_20['Q3']=='India']

In [None]:
pm("PM: Hello Harveen, Welcome to my office. My assistant tells me that you are of the best analysts in India today.")
response("Me: My pleasure sir. I always wanted to meet you in person. It's an honor to be here.")
pm("PM: Today I have called you for a very important reason that is in the interest of the country. As I talk, do let me know if this topic interests you.")
response("Me: Sir, I always wanted to join the Indian Army so that I can serve my country. Unfortunately that did not happen, if today I am getting a chance to serve my country in any way, I would like to take it. If it is of nation's interest then it is my interest.")
pm("PM: That's the spirit. We are delighted to have such enthusiastic individuals as you are.")
response("Me: Not all battles are fought on battlefield sir. Please let me know how can I contribute to my country?")
pm("PM: Harveen, the world has gone through numerous Industrial revolutions. The First Industrial Revolution used water and steam power to mechanize production. The invention of the steam engine created a new type of energy back then. The Second used electric power to create mass production. Usage of steel to build ships, railroads at a less price point increased with introduction of Bicycle and automobiles as well. ")
response("Me: Right Sir!")
pm("PM: The Third used electronics and information technology to automate production. This revolution witnessed the rise of electronics - with the transistor and microprocessor - but also the rise of telecommunications and computers. And Now we have the fourth revolution.")
response("Me: Yes! The Digital Revolution")
pm("PM: Yes, Absolutely, in this Digital Revolution we are seeing the emergence of Internet and other emerging technology breakthroughs in fields such as artificial intelligence, robotics, the Internet of Things, autonomous vehicles etc.")
response("Me: AI is currently the talk of the town sir!")
pm("PM: Right, and we should be too! \"I feel Artificial intelligence, machine learning, Internet of Things, blockchain & big data hold potential to take India to new heights\". \"India was not independent when the first and second industrial revolution happened. When third industrial revolution happened, India was struggling with challenges of just attained independence\". \"While the previous industrial revolutions eluded the country, India's contribution to the 4th Industrial Revolution would be astonishing\"")
response("Me: Absolutely sir, I am delighted to hear that you are thinking in all perspectives.")
pm("PM: I want India to become a superpower in the 4th Industrial Revolution and for this I need inputs from your side.")
response("Me: Sure sir! Let me know what information you require from my end.")
pm("PM: While we are launching a lot of schemes in association with NITI Ayog in India and trying to use Artificial Intelligence in every possible way ever, I need to understand from the ground level on how the Indian Techies and students are adopting this Technology. I need an in-depth analysis on what is the age group, qualification, gender, compensation, job titles, companies of these Techies so that we can roll out better policies which can actually benefit them and which can help me in achieving my dream of becoming superpower in 4th Industrial revolution.")

response("Me: I understood sir. You are in luck, Kaggle which is a platform for Data Science. Actually Kaggle is not a website it is an experience and they currently posted their survey from techies all over the world who are working in this technology. This survey is conducted annually. I happen to do an analysis on the data they have provided.")
pm("PM: That is great! Please show me analysis as much in depth as possible.")
response("Me: Sure sir. First of all, a total of 20,037 people from 56 countries responded to this survey in 2020. The respondent of this survey are the techies/students who use Kaggle to participate in Data Science competitions. Since Artificial Intelligence involves working with Data, so Data Science is a sub-field of AI.")


pm("PM: Great! No matter what science, it has the power to change the world! Look at our scientists in ISRO doing wonders for the country.")
response("Me: Absolutely sir. So a total of 5851 participants were from India. I understand that this is not a large number but this sample represents the whole population.")
pm("PM: Oh! You mean, when we have elections, like the exit polls are conducted. Is it something similar ?")
response("Me: Yes sir. And you will be glad to know that India had the maximum number of respondents which comprises close to 29.2% of the total participants.")

In [None]:
## Multiple dataframes for country, country_to_compare, country in current_year, country_to_compare in current_year, row(rest of world), row in current year

# All-years slices: selected country, comparison country, and everyone outside the selected country.
mcq_country = mcq.loc[mcq['Country'] == country]
mcq_country_compare = mcq.loc[mcq['Country'] == country_to_compare]
mcq_row = mcq.loc[mcq['Country'] != country]

# Same three slices restricted to the current survey year.
mcq_current = mcq.loc[mcq['Year'] == current_year]
mcq_country_current = mcq_current.loc[mcq_current['Country'] == country]
mcq_country_compare_current = mcq_current.loc[mcq_current['Country'] == country_to_compare]
mcq_row_current = mcq_current.loc[mcq_current['Country'] != country]


In [None]:
## Dataframes for current country in 2017,2018 and 2019

# One slice of the selected country's responses per earlier survey year.
mcq_country_2017, mcq_country_2018, mcq_country_2019 = (
    mcq[(mcq['Country'] == country) & (mcq['Year'] == survey_year)]
    for survey_year in (2017, 2018, 2019)
)

In [None]:

# Headline numbers: respondents from the selected country in the current year,
# and their share of all current-year respondents.
print("Total Number of Respondents from "+country+": "+str(len(mcq_country_current)))
print("Total Percentage of "+country+"n Respondents: "+ str(round(len(mcq_country_current) / len(mcq_current) *100, 2)) + "%") 
# Despite its name this is the number of current-year respondents from
# everywhere except the selected country (the 'ROW' wedge below).
total_respondents = len(mcq_current) - len(mcq_country_current)


labels = [country,'ROW']

# Donut chart (pie with wedge width 0.5) with one wedge per label.
fig, ax = plt.subplots(figsize=(6,6), subplot_kw=dict(aspect="equal"))
wedges, texts = ax.pie([ len(mcq_country_current), total_respondents], wedgeprops=dict(width=0.5), startangle=-0)
bbox_props = dict(boxstyle="square,pad=0.3", fc="w", ec="k", lw=0.72)
kw = dict(arrowprops=dict(arrowstyle="-"),
          bbox=bbox_props, zorder=0, va="center")

# Attach a boxed label with a connector line to the middle of each wedge.
for i, p in enumerate(wedges):
    # Mid-angle of the wedge and the matching point on the unit circle.
    ang = (p.theta2 - p.theta1)/2. + p.theta1
    y = np.sin(np.deg2rad(ang))
    x = np.cos(np.deg2rad(ang))
    #print(ang, x, y)
    # Labels on the right half are left-aligned and vice versa; the lookup has
    # no entry for sign 0, i.e. a wedge centred exactly on the vertical axis.
    horizontalalignment = {-1: "right", 1: "left"}[int(np.sign(x))]
    connectionstyle = "angle,angleA=0,angleB={}".format(ang)
    kw["arrowprops"].update({"connectionstyle": connectionstyle})
    ax.annotate(labels[i], xy=(x, y), xytext=(1.35*np.sign(x), 1.4*y),
                horizontalalignment=horizontalalignment, **kw)
    
plt.title("Respondents from "+country+" vs ROW in "+str(current_year))
plt.show()


In [None]:
pm("PM: Great, what about from USA and China?")


In [None]:
# colors =['#4287f5','#0177bb' ,  '#344b5b']

#sns.set_palette(sns.color_palette(colors))

In [None]:
# Share of current-year respondents per country, as returned by value_counts
# (most frequent country first).
top_respondents = value_counts(mcq_current, 'Country')
top_5_respondents = top_respondents.head(5)


# The selected country's row; its position in the frame gives its rank.
# NOTE(review): index[0] assumes the country has at least one respondent.
your_country = top_respondents[top_respondents['Country']==country]
rank =  int( your_country.index[0] + 1 )
# If the selected country is not among the top five, add its row in front so
# it still appears in the chart.
filter_top_5 = top_5_respondents[top_5_respondents['Country'] == country]
if len(filter_top_5) == 0:
    top_5_respondents = pd.concat([ your_country, top_5_respondents])

print("Rank in terms of correspondents of your country "+country+" is: "+str(rank))

fig, ax = plt.subplots(figsize=(10,7))

show_bar_plot(x='Country', y='Percentage', data=top_5_respondents, title='Top 5 Respondents in '+str(current_year))


In [None]:
response("Me: USA stood second with 11.2 % respondents followed by Brazil and Japan. China did not had enough respondents so it is not in top 5 sir. Here, have a look at the world map sir.")


In [None]:
show_worldmap(mcq_current, 'Country', 'Top Respondents in '+str(current_year))

In [None]:
pm("PM: That is so beautiful.")
response("Me: Indeed, since India has the maximum number of respondents, it is the one with the darkest blue color")
pm("PM: Interesting, that reminds me of the Indian cricket team jersey as well. And how have these respondents ratio increased over the years as you have told that this is an annual survey?")
response("Me: Here you go sir")

In [None]:
# Number of responses per survey year for the selected country.
# rename_axis/reset_index(name=...) names the columns explicitly, so the result
# does not depend on how the installed pandas version labels value_counts
# output (the 'index'/'Year' rename only worked before pandas 2.0).
# sort_index() puts the years in chronological order on the x axis instead of
# ordering them by response count.
responses_year = (
    mcq_country['Year']
    .value_counts()
    .sort_index()
    .rename_axis('Year')
    .reset_index(name='Responses')
)
responses_year['Year'] = responses_year['Year'].astype('str')

show_point_plot(x = 'Year', y = 'Responses',  data=responses_year, title='Respondents by Year from '+country, show_percent=True)


In [None]:
# Number of responses per survey year across all countries.
# rename_axis/reset_index(name=...) names the columns explicitly, so the result
# does not depend on how the installed pandas version labels value_counts
# output (the 'index'/'Year' rename only worked before pandas 2.0).
# sort_index() puts the years in chronological order on the x axis instead of
# ordering them by response count.
responses_year = (
    mcq['Year']
    .value_counts()
    .sort_index()
    .rename_axis('Year')
    .reset_index(name='Responses')
)
responses_year['Year'] = responses_year['Year'].astype('str')

show_point_plot(x = 'Year', y = 'Responses',  data=responses_year, title='Respondents by Year over the years', show_percent=True)


In [None]:
# import plotly.express as px

# df = pd.DataFrame(dict(time=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
#                         product_A = [20, 35, 30, 35, 27, 43, 24, 34, 14, 27, 22, 50],
#                         product_B = [25, 32, 34, 20, 25, 34, 54, 23, 43, 33, 27, 29]))


# fig = px.bar(df, x="time", y=["product_A","product_B"], title="Sales of Product A and Product B")
# fig.show()

In [None]:
pm("PM: Great. The figure has kind off doubled from 2017 for India. That's exciting, So let's move on! As you speak I will maintain a list of positives and negatives as well.")

# <a>Age</a> 


In [None]:
# Dialogue introducing the age analysis (a stray "9" before "So" was removed from the first line).
response("Me: So sir, the first thing I want to focus is <b>'Who are they?'</b> i.e. the <b>Age group</b> of the people who have responded to this survey. Since AI is the new and emerging technology we need a mix of young and experienced blood to drive this sector.")
pm("PM: Absolutely. While experience brings in reliability, youth brings in creativity.")
response("Me: [Laughs] So the age group are as follows:")

In [None]:
## AGE


age = value_counts(mcq_country_current, 'Age')
fig, ax = plt.subplots(figsize=(14,8))
show_bar_plot(x='Age' ,y= 'Percentage', data = age, axis = 'vertical', title= 'Age Group of respondents for '+country+' in '+str(current_year), xlabel='Age Group', ylabel='Percentage')



In [None]:
response("Me: 1. More than 75% people are in the age group 18–29. <br> \
2. Close to 5% of the respondents are from a higher age group who may be at senior architect, technologist level. <br> \
3. Surpisingly there are records for people in the category > 60. So this shows there is no age for passion. People of age 60 years are also using Data Science to run their businesses or just for the learning purpose. <br> \
4. But the good part is more than half of the current or prospective Data Scientists are way too young which is a positive sign as the youth is interested in this. <br> \
5. On the other hand, when you reach a higher age, some saturation comes in terms of learning, but that is not the case here. About 15% people are in the age group 30–39. <br>")

pm("PM: Great, and how is this compared to the Rest of the world?")

In [None]:
age_row = value_counts(mcq_row_current, 'Age')
age_concat = combine_row_country(age, age_row)

fig, ax = plt.subplots(figsize=(14,8))
show_bar_plot(x='Age' ,y= 'Percentage', hue='Geography', data = age_concat, axis = 'vertical', title= 'Age Group for '+country+' vs ROW in '+str(current_year), xlabel='Age Group', ylabel='Percentage')

In [None]:
response("Me: Highly positive sir. <br><br> \
We have the highest youth percentage as compared to the rest of the world. But there is also a worry sign here. We have high percentages of youth but as you can see we have started declining from age group 25–29 onwards as compared to the world. As you said youth brings creativity, but this creativity needs direction and that direction can only be given by senior, experienced people. <br> \
<br>Another factor for this could be Brain Drain, but that is just a speculation at the moment.")

pm("PM: Yes, you are right! I have on small request, I also want to see a one on one comparison with USA. I believe USA is our direct competitor and I should be aware of that too!")
response("Me: Here you go sir!")

In [None]:
age_compare = value_counts(mcq_country_compare_current, 'Age')
age_compare = combine_row_country(age, age_compare, 'USA')


show_point_plot(x='Age' ,y= 'Percentage',hue='Geography', xlabel='Age Group', data = age_compare, title='Age group comparison for '+country+' vs '+country_to_compare+' in '+str(current_year))

In [None]:
pm("PM: It looks pretty much the same as we did in comparison to the rest of the world. Youth ratio is 35–5, then 26–10 then they always have more experienced people than us.")
response("Me: Exactly sir, you have a talent for reading graphs sir.")
pm('PM: Actually I was a great analyst before entering politics.')
response('Me: [Laughs] Sir you are a jack of all trades! Have a look at this interesting chart too')


In [None]:
# Respondent count per (age group, survey year) for the selected country.
age_year_count = mcq_country.groupby(['Age','Year'])['index'].count().reset_index()
# sns.boxplot(x="Year", y="df_index", hue="Age",  data=age_year_count)

fig, ax = plt.subplots(figsize=(14,8))
# One bar per age group, stacked by year. DataFrame.pivot takes keyword
# arguments only since pandas 2.0, so index/columns/values are named explicitly.
age_year_count.pivot(index='Age', columns='Year', values='index').plot.bar(stacked=True,width=0.8, ax=ax);
plt.xlabel('Age Group')
plt.title('Growth of Age groups for India over the years');

In [None]:
response("Me: 2020 was the year that saw maximum growth in terms of youth switching to AI.")
pm("PM: That's good to know, at least one good thing happened in 2020")
response("Me: Yes, and not even in the age group 18-21 the AI adoption took place across all the age groups maximum in 2020")
pm("PM: Well, that is great! What is the diversity in this age group? How are the girls of our nation doing?")
response("Me: Even I am very much interested in this because recently I attended an event arranged by Google in Bangalore in 2019 ofcourse because in 2020 due to corona everything has gone digital. The host asked \"How many women are AI Scientists or Data Scientists\". Only 5 hands went up from around 200 Females present at the venue. I was quite surprised by this. Lets see what happens here.")

# <a>Gender</a> 


In [None]:
## gender

gender = value_counts(mcq_country_current, 'Gender')
fig, ax = plt.subplots(figsize=(8,6))
show_bar_plot(x='Gender' ,y= 'Percentage', data = gender.head(3), axis = 'vertical', xlabel='Gender', ylabel='Percentage', title='Gender Diversity of respondents for '+country+' in '+str(current_year))

In [None]:
response("Me: Sir, the numbers are not at all encouraging. Out of all the respondents only 22% are females.")
pm("PM: [Shocked] What! That is very very disappointing.")
response("Me: Exactly, if this is a war then how can we win a war if half of the population does not takes part fully?")

In [None]:
gender_row = value_counts(mcq_row_current, 'Gender')
gender_concat = combine_row_country(gender.head(3), gender_row.head(3))

fig, ax = plt.subplots(figsize=(8,6))
show_bar_plot(x='Gender' ,y= 'Percentage', data = gender_concat, hue='Geography', axis = 'vertical', title= 'Gender Percentages of respondents for '+country+' and ROW in '+str(current_year), xlabel='Gender', ylabel='Percentage')


In [None]:
response("Me: And this number not even in India but even in the rest of the world is quite discouraging.")
pm("PM: So this is true for even the younger generation that is coming in?")

In [None]:
# Respondent count per (gender, age group) for the selected country in the current year.
filter_age_gender= mcq_country_current.groupby(['Gender','Age']).count().reset_index()[['Gender','Age','index']]
list_groups = list(filter_age_gender['Age'].unique())

# age group -> number of male respondents per female respondent
# (0 marks groups where one of the two counts is missing).
local_dict = {}

for group in list_groups:
    in_group = filter_age_gender['Age'] == group
    female = filter_age_gender.loc[in_group & (filter_age_gender['Gender'] == 'Female'), 'index']
    male = filter_age_gender.loc[in_group & (filter_age_gender['Gender'] == 'Male'), 'index']

    if len(female) == 0 or len(male) == 0:
        local_dict[group] = 0
        continue

    # Take the single matching count explicitly; calling int() on a
    # one-element Series is deprecated in recent pandas.
    female = int(female.iloc[0])
    male = int(male.iloc[0])

    if female!=0 and male!=0:
        local_dict[group] = male / female

ratio_df = pd.DataFrame(local_dict, index = [0])
ratio_df = ratio_df.melt()
# Drop the placeholder zeros so only age groups with both genders are plotted.
ratio_df = ratio_df[ratio_df['value']>0]
fig, ax = plt.subplots(figsize=(12,6))

show_bar_plot(x = 'variable', y='value', data= ratio_df, show_percent=False, title='Diversity in terms of gender for India in '+str(current_year), xlabel='Age Group', ylabel='Number of Males per 1 Female')

In [None]:
response("Me: Unfortunately yes sir. The numbers tell the same story as in this chart. <br><br> \
<b>You know what this means sir, if we have a Data Science team of 12, then only 2 will be female and 10 will be male.</b> ")

pm("PM: Negative point noted and circled. ")

response("Me: Yes sir, but there is something that is hidden behind the numbers")

pm("PM: What exactly?")

response("Me: The number of girls are actually increasing year by year!")

In [None]:
# group_by_country_gender = multiple_group_by(mcq, ['Year', 'Country', 'Gender'])
# group_by_country_gender = group_by_country_gender[(group_by_country_gender['Gender'].isin(['Male', 'Female'])) & (group_by_country_gender['Country'].isin(['India','United States of America','Japan']))]
# group_by_country_gender = group_by_country_gender.reset_index(drop=True)

def apply_transformation(row, df=None):
    """Return the female count of `row` as a percentage of the matching male count.

    Intended for `DataFrame.apply(..., axis=1)` on a frame with the columns
    'Gender', 'Year', 'Country' and 'index' (the respondent count).

    Parameters
    ----------
    row : pandas.Series
        One row of the frame.
    df : pandas.DataFrame or None
        Frame in which the matching 'Male' row (same country and year) is
        looked up; defaults to the notebook-level `gender_year`.

    Returns
    -------
    float or int
        `row['index'] / male_count * 100` for 'Female' rows, NaN when there is
        no usable male count for that country and year, and 0 for every other
        gender.
    """
    if df is None:
        df = gender_year

    if row['Gender'] != 'Female':
        return 0

    year = row['Year']
    country = row['Country']
    one_record = df[((df['Gender'] == 'Male') & (df['Country'] == country) & (df['Year'] == year))]
    # No male row (or a zero count) means the ratio is undefined; return NaN
    # instead of failing on int(<empty Series>) or dividing by zero.
    if len(one_record) == 0:
        return float('nan')
    male_val = int(one_record['index'].iloc[0])
    if male_val == 0:
        return float('nan')

    return row['index'] / male_val * 100
    
# group_by_final_df = group_by_country_gender    
# group_by_final_df['Actual'] = group_by_country_gender.apply(apply_transformation, axis=1)

In [None]:
# Respondent counts per (Gender, Year, Country); the 'index' column is what gets counted.
gender_year = mcq.groupby(['Gender','Year','Country'])['index'].count().reset_index()
# Keep Male/Female rows for India. `.copy()` makes the filtered frame independent,
# so adding the 'Actual' column below does not write into a slice of the grouped frame.
gender_year = gender_year[(gender_year.Gender.isin(['Male', 'Female'])) & (gender_year.Country.isin(['India']))].copy()
# apply_transformation reads the global `gender_year`, which at this point still
# contains the male rows it needs as denominators.
gender_year['Actual'] = gender_year.apply(apply_transformation, axis=1)
# Only the female rows carry a non-zero 'Actual' value.
gender_year = gender_year[gender_year['Gender']=='Female']
gender_year

In [None]:
# Bar chart of the yearly 'Actual' values computed for the female rows of `gender_year`.
fig, ax = plt.subplots(figsize=(8, 6))
show_bar_plot(
    x='Year',
    y='Actual',
    data=gender_year,
    xlabel='Year',
    ylabel='Percentage',
    title='Percentage of female respondents over the years',
)


In [None]:
pm("PM: That is great! You know one of schemes is 'Beti padhao Beti bachao' [Save a girl child by educating here] and India needs to be an example to break this gender bias.  ")
response("Me: Absolutely sir, and not only gender bias but degree bias also.")
pm("PM: What do you mean?")
response("Me: Sir, my personal belief is <b>\"Degrees don't matter, your attitude does\"</b>.  A major factor why the young people are not able to get good jobs in this field is that the companies are demanding PHD's and Masters in this field. Now to be very frank, they want qualified people but if you ask someone 5 years back that did they see this wave of AI coming, then nobody would have predicted that. And that is what I want to show you here")

# <a>Degree</a>

In [None]:
# Distribution of the 'Degree' answers among the current-year respondents of `country`.
degrees_propotion = value_counts(mcq_country_current,'Degree') 

# Shorten the two longest labels. The result is assigned back to the column:
# calling `.replace(..., inplace=True)` on `df['Degree']` is a chained in-place
# edit that no longer updates the frame under pandas Copy-on-Write.
degrees_propotion['Degree'] = degrees_propotion['Degree'].replace({
    'Some college/university study without earning a bachelor’s degree': 'Pursuing Bachelors',
    'No formal education past high school': 'No formal education',
})

fig, ax = plt.subplots(figsize=(12,6))

show_bar_plot(y='Degree' ,x= 'Percentage', data = degrees_propotion, axis = 'horizontal', xlabel='Percentage',ylabel='Degree', title='Degree Holders of '+country+' in '+str(current_year))


In [None]:
response("Me: 1. Close to 53% of Data Scientist or ML Engineers are Bachelors and surpirsinly 32% are Masters.<br> \
         2. I am assuming Masters people are the one's who have done Masters in Computers or Electronics and not in some specific field like Data Science or ML, because I can't recall a college providing Masters degree before 2017.<br> \
         3. As most of the Job boards require, there are only 5% people with Doctoral Degrees. This is an indicator that why there is a talent shortage because if the employer is not flexible with the degree requirement then from where will they hire? ")

fig, ax = plt.subplots(figsize=(12,6))

# Same 'Degree' distribution for the rest-of-world respondents.
degrees_propotion_row = value_counts(mcq_row_current,'Degree')
# Assign the relabelled column back instead of a chained `.replace(..., inplace=True)`,
# which does not update the frame under pandas Copy-on-Write.
degrees_propotion_row['Degree'] = degrees_propotion_row['Degree'].replace({
    'Some college/university study without earning a bachelor’s degree': 'Pursuing Bachelors',
    'No formal education past high school': 'No formal education',
})


# Combine both frames so the plot can split the bars by the 'Geography' column.
degree_concat = combine_row_country(degrees_propotion, degrees_propotion_row)
show_bar_plot(y='Degree' ,x= 'Percentage', hue='Geography', data = degree_concat, axis = 'horizontal', xlabel='Percentage',ylabel='Degree', title='Degree Holders of '+country+' vs ROW in '+str(current_year))


In [None]:
response("Me: On the other hand if you see India vs ROW you will observe that most of the people from other parts of the world have done their masters or PHD degrees. <b>The comparison of 5% to 15% PHD's is the one figure where we are losing.</b>")

pm("PM: That is another serious concern, Harveen. Over the past years we have included a lot of reforms for our PHD students, including a proposal to increase their stipend by 100% but I am shocked to see these numbers.")
response("Me: I understand that sir, but those reforms are not working. If you see our comparison with US, then you will feel this as well. Nearly 44% of their respondents are masters and they have 3 times the PHD's that India has. Also the number of PHD's is decreasing over the years.")

In [None]:

# 'Degree' distribution for the comparison country.
degree_compare = value_counts(mcq_country_compare_current, 'Degree')
# Assign the relabelled column back instead of a chained `.replace(..., inplace=True)`,
# which does not update the frame under pandas Copy-on-Write.
degree_compare['Degree'] = degree_compare['Degree'].replace({
    'Some college/university study without earning a bachelor’s degree': 'Pursuing Bachelors',
    'No formal education past high school': 'No formal education',
})


# Combine with the `country` distribution; the comparison rows are labelled 'USA'.
degree_compare = combine_row_country(degrees_propotion, degree_compare, 'USA')

show_point_plot(x='Degree' ,y= 'Percentage',hue='Geography', data = degree_compare, title='Degree Holder Comparison for '+country+' vs '+country_to_compare+' in '+str(current_year))

In [None]:

# Both apostrophe spellings of each degree collapse onto one short label.
map_dict = {
    'Bachelor’s degree': 'Bachelors',
    "Bachelor's degree": 'Bachelors',
    'Master’s degree': 'Masters',
    "Master's degree": 'Masters',
    'Doctoral degree': 'PHDs',
}

degree_historical = combine_historical_data(
    mcq_country_2017,
    mcq_country_2018,
    mcq_country_2019,
    mcq_country_current,
    'Degree',
    map_dict,
)

# One bar panel per degree, years on the x axis.
ax = sns.catplot(
    x="Year",
    y="Percentage",
    aspect=0.8,
    col="Degree",
    data=degree_historical,
    col_wrap=3,
    sharex=False,
    sharey=True,
    kind='bar',
    palette=COLOR_PALETTE,
    margin_titles=True,
);
# Leave head-room above the panels for the figure-level title.
plt.subplots_adjust(top=0.9)
ax.fig.suptitle("Historical Degree Holders Comparison for "+country);


In [None]:
pm("PM: That's disappointing. ")
response("Me: Not everything is disappointing sir.")
pm("PM: What do you mean?")
response("Me: Statistics can be sometimes deceptive sir. If you see the historical analysis, <b>you will be happy to know that the ratio of girls pursuing PHD's has increased by 1.5 times from 2017.</b>")

In [None]:
# Identity mapping passed to multiple_group_by for the 'Gender' column.
dict_gender = {"Female":'Female', 'Male':'Male'}

# Degree x Gender breakdown per survey year, newest first.
hist_frames = [
    multiple_group_by(yearly.copy(), ['Degree', 'Gender'], map_dict, dict_gender, survey_year)
    for yearly, survey_year in (
        (mcq_country_current, 2020),
        (mcq_country_2019, 2019),
        (mcq_country_2018, 2018),
        (mcq_country_2017, 2017),
    )
]
hist0, hist1, hist2, hist3 = hist_frames
hist_concat = pd.concat([hist0, hist1, hist2, hist3])


# One bar panel per degree, bars split by gender.
ax = sns.catplot(
    x="Year",
    y="Percentage",
    hue='Gender',
    aspect=0.8,
    col="Degree",
    data=hist_concat,
    kind="bar",
    col_wrap=3,
    sharex=False,
    sharey=True,
    palette=COLOR_PALETTE,
);
plt.subplots_adjust(top=0.9)
ax.fig.suptitle("Historical Degree Holders Gender wise Comparison for "+country);


In [None]:
pm("PM: That is a great great sign. I need to understand at what point in terms of age does an individual takes a decision to go for higher studies?")
    
response("Me: Here you go!")

In [None]:
# Count rows per (Degree, Age); after the 'count' aggregation the 'Gender'
# column holds the number of non-null Gender answers in each group.
degree_age = mcq_country_current.groupby(['Degree','Age']).agg('count').reset_index()[['Degree', 'Age', 'Gender']]
degree_age = map_column(degree_age, 'Degree', map_dict)

fig, ax = plt.subplots(figsize=(14,8))

# Age groups as rows, one column per degree. pivot() takes keyword arguments;
# the positional form is rejected by pandas 2.0 and later.
degree_age.pivot(index='Age', columns='Degree', values='Gender').plot.bar(stacked=True,width=0.8, ax=ax)
ax.set_xlabel('Age group')
ax.set_ylabel('Count')
ax.set_title('Degrees held according to Age Groups for India in '+str(current_year));

In [None]:
response("Me: The number of masters is increasing from age group 22-24 onwards. Maybe respondents feel that at this age is the right time to pursue masters and advance career in Data Science.")
pm("PM: This is great to see that by 35-39 most of the techies attain their masters but I agree with you, there should not be any degree bias. A prime example is sitting in front of you. I used to see tea and with just a simple degree I am now the PM of India.")

response("Me: [Laughs] Truly well said sir. Let's move on to Job Titles")
pm("PM: When I was back in Silicon valley previous year, I saw they have very different job titles as we have in India like SDE and all.")
response("Me: Wow! I am amazed that you have knowledge about the job titles as well. When Data Science was not there, then engineers who write code where called software engineer but now since they have moved to Data Science field they are proudly called 'Data Scientists' and <b>'Data Scientist is the sexiest job of 21st century'</b>")

# <a>Job Title</a>

In [None]:
pm("PM: Which takes me to my next question. How many % of the respondents are Data Scientists currently. Because I am assuming that a lot of people are still in transition.")
response("Me: Absolutely, have a look at this.")

In [None]:
## Job title

# Distribution of the 'Job Title' answers among the current-year respondents of `country`.
job_title = value_counts(mcq_country_current, 'Job Title')
fig, ax = plt.subplots(figsize=(14, 8))

# Job titles are long, so the tick labels are turned vertical.
plt.xticks(rotation=90)
show_bar_plot(
    x='Job Title',
    y='Percentage',
    data=job_title,
    axis='vertical',
    xlabel='Designation',
    ylabel='Percentage',
    title='Job Titles for '+country+' in '+ str(current_year),
)


In [None]:
pm("PM: What! Only 10% of the respondents are data scientists.")

response("Me: 1. Yes, and about 10% are Software Engineers. <br> \
2. 40% of the respondents are students maybe in college or passed out looking for job change. <br> \
3. This means that in the coming years we will get more students turning into professionals who are interested in Data Science because these students started way early in their college. So the job creators need to create a lot of new jobs like Machine Learning Intern, Data Science intern to support the interest of wider public. Also Trainee Data Scientist, Jr. Data Scientist positions will pop up in future. <br><br> \
 But there is a negative sign in the graph. ")

pm("PM: What ?")
response("Me: India has just 2.5% research scientists.")
pm("PM: What do research scientists primarily do?")
response("Me: They advance the state of the art in any technology.")
pm("PM: So we do not have enough researchers currently. How do we do as compared to the rest of the world.?")

In [None]:
# Rest-of-world job titles, combined with the `country` figures for a split-bar chart.
job_title_row = value_counts(mcq_row_current, 'Job Title')
job_title_concat = combine_row_country(job_title, job_title_row)

fig, ax = plt.subplots(figsize=(14, 8))
plt.xticks(rotation=90)
show_bar_plot(
    x='Job Title',
    y='Percentage',
    hue='Geography',
    data=job_title_concat,
    axis='vertical',
    xlabel='Designation',
    ylabel='Percentage',
    title='Job Titles for '+country+' vs ROW',
)


In [None]:
response("Me: Not good.")
pm("PM: So, the rest of the world has 3 times more researchers than India has. This is again a very serious issue.")
response("Me: Yes sir. If you compare with USA also, it is the same statistic.")

In [None]:
# Job titles of the comparison country, combined with `country`; comparison rows are labelled 'USA'.
job_title_compare = value_counts(mcq_country_compare_current, 'Job Title')
job_title_compare = combine_row_country(job_title, job_title_compare, 'USA')

show_point_plot(
    x='Job Title',
    y='Percentage',
    hue='Geography',
    data=job_title_compare,
    title='Job Titles for '+country+' vs '+country_to_compare + ' in '+str(current_year),
);


In [None]:
pm("PM: Yes, and how is this affected over the years?")

response("Me: Here is the historical analysis sir")

In [None]:

# Maps the differing job-title spellings onto one shared label per role.
dict_job_title = {'Data Scientist' : 'Data Scientist', 'Software Developer/Software Engineer': 'Software Engineer', 'Software Engineer': 'Software Engineer',
                 'Scientist/Researcher':'Research Scientist', 'Research Scientist':'Research Scientist','Student':'Student'}



# NOTE: this reuses the name `job_title_concat`, which an earlier cell assigned
# to the country-vs-ROW frame; from here on it holds the historical data.
job_title_concat = combine_historical_data(mcq_country_2017, mcq_country_2018,mcq_country_2019, mcq_country_current , 'Job Title', dict_job_title)

# The 2017 'Student' figure is built by hand from the raw 2017 survey file.
# NOTE(review): no country filter is applied to this CSV here, so the share
# appears to cover all 2017 respondents rather than `country` only -- confirm.
_country_2017 = pd.read_csv('../input/kaggle-survey-2017/multipleChoiceResponses.csv', encoding='latin-1')
# value_counts() sorts by frequency, so `.iloc[1, 1]` is the proportion of the
# second most frequent StudentStatus value (NaN is counted because dropna=False).
# TODO confirm that this row really is the "student" answer.
# 'Year' is stored as the string '2017' in this record.
one_record = {'Job Title' : 'Student', 'Percentage' : _country_2017['StudentStatus'].value_counts(normalize= True, dropna= False).reset_index().iloc[1,1]*100, 'Year': '2017'}

# Prepend the hand-built record to the historical frame.
df_one_record = pd.DataFrame(one_record, index=[0])
job_title_concat = pd.concat([df_one_record, job_title_concat]).reset_index(drop=True)

# One bar panel per job title, years on the x axis.
ax= sns.catplot(x="Year", y="Percentage", aspect =0.9,  col="Job Title", data=job_title_concat, col_wrap =4, sharex = False, sharey=True, kind='bar', palette=COLOR_PALETTE);
plt.subplots_adjust(top=0.9)
ax.fig.suptitle("Historical Job Title Comparison for "+country);


In [None]:
response("Me: Over the years, students have increased, the Software Engineers have decreased and Data Scientists too have decreased while research scientists have increased but not so rapidly.")
pm("PM: So the decrease in Software Engineer roles is that people are loosing their jobs?")
response("Me: Not exactly sir. Since there is so much hype for Data Science and ML in the market, companies are luring people into Data Scientist roles but the popularity of Software Engineer means some companies might want Data Science in addition to some work from Software Engineering life cycle. \
<br><br>Also Data Science is built on top of Statistics and you can see that Statisticians have fell out of favour with the advent of Machine Learning roles. ")
pm('But then why are not Data Scientists increasing?')
response('Can you guess that from an earlier chart I showed you?')
pm("PM: Hmm. 10% people are not employed. So I want to ask you the hard question. Has the unemployment rate increased or decreased in India?")

In [None]:
# Respondent count per (Job Title, Year).
unemployment = mcq_country.groupby(['Job Title', 'Year'])['index'].count().reset_index()

# Rows whose job title is one of the two "not employed" spellings.
not_employed_labels = ['Currently not employed', 'Not employed']
unemployment_status = unemployment[unemployment['Job Title'].isin(not_employed_labels)]

# Yearly totals: all respondents (index_x after the merge) and not-employed ones (index_y).
unemployment_total = unemployment.groupby('Year')['index'].sum().reset_index()
unemployment_selected = unemployment_status.groupby('Year')['index'].sum().reset_index()

# Right join keeps exactly the years that have not-employed respondents.
unemployment_merged = unemployment_total.merge(unemployment_selected, on='Year', how='right')
unemployment_merged['Percentage'] = (
    unemployment_merged['index_y'] / unemployment_merged['index_x'] * 100
)

In [None]:

fig, ax = plt.subplots(figsize=(8, 6))
# Point plot drawn first; the bar plot call follows on the current figure.
ax = sns.pointplot(x='Year', y='Percentage', data=unemployment_merged)

show_bar_plot(
    x='Year',
    y='Percentage',
    data=unemployment_merged,
    title="Unemployment Rate in "+country+" over the years",
    xlabel='Year',
    ylabel='Unemployment Rate',
)


In [None]:
response("Me: Here are the numbers, sir")
pm("PM: [Disappointed] Noooo, if people with such high skills are not getting jobs then what will happen to those who have not even acquired these skills as of now. I will definitely note this down.")

# <a>Company Size</a>

In [None]:
response("Me: Absolutely Sir, another factor that is very crucial is that where are these engineers working. I mean in big companies, startups and all.")
pm("PM: Yes, I am interested in this as we have rolled out a lot of policies in Startup India and given so much tax rebates to big IT companies. So, if there is a requirement for change of any policies in these companies then we can advise them to do so.")
response("Me: [Surprised] Interesting!")

In [None]:
## Company Size

# 'Company Size' distribution for `country`, for the rest of the world, and both combined.
company_size = value_counts(mcq_country_current, 'Company Size')
company_size_row = value_counts(mcq_row_current, 'Company Size')
company_size_concat = combine_row_country(company_size, company_size_row)

fig, ax = plt.subplots(figsize=(12, 6))
show_bar_plot(
    y='Company Size',
    x='Percentage',
    data=company_size,
    axis='horizontal',
    xlabel='Percentage',
    ylabel='Strength',
    title='Strength of Companies for India in '+str(current_year),
)

In [None]:

response("Me: 31% of the people work in large organizations. Obviously they have a lot of money to burn in employee upskilling, hardware investment and wait for the right client who can give hefty compensation for that investment.")
pm("PM: Hmm, most of the employers in Indian IT are service based companies like Infosys, TCS, Accenture, Wipro, HCL.")
response("Me: This can be a positive as well as a negative point. While service based companies are keen on investing and have money to invest, they are also very keen to shut down the investment if no returns are made in a particular time frame.")

response("Me: See this India vs ROW chart.")

In [None]:
fig, ax= plt.subplots(figsize=(12,6))
# The bars are split by 'Geography', so the title names both sides of the comparison
# (it previously repeated the title of the single-country chart).
show_bar_plot(y='Company Size' ,x= 'Percentage', hue='Geography', data = company_size_concat, axis = 'horizontal', xlabel='Percentage', ylabel='Strength', title='Strength of Companies for '+country+' vs ROW in '+str(current_year))


In [None]:
response("Me: As you can see 18% people from the rest of the world are working in companies with strength from 1000–9999. It is my assumption that this strength corresponds to a product company. <br>\
<br>And interestingly 37% of the techies from the rest of the world working in Data Science are engaged in early stage startups. Even India is not behind and stands at a comparable 34%. This shows that we need to support the early stage startups even more as <b>they are creating employment in this field and are helping the government cope up with unemployment.</b>")

In [None]:
# Company sizes of the comparison country, combined with `country`; comparison rows are labelled 'USA'.
company_size_compare = value_counts(mcq_country_compare_current, 'Company Size')
company_size_compare = combine_row_country(company_size, company_size_compare, 'USA')

show_point_plot(
    x='Company Size',
    y='Percentage',
    hue='Geography',
    data=company_size_compare,
    title='Company Size comparison for '+country +' vs '+country_to_compare+' in '+str(current_year),
)

In [None]:
response("Me: Another factor that supports this is the fact we are ahead of USA in terms on employment in startups. ")

In [None]:
# Company-size labels handed to map_column; every label maps to itself.
map_company_size = {
    '10,000 or more employees': '10,000 or more employees',
    '1000-9,999 employees': '1000-9,999 employees',
    '250-999 employees': '250-999 employees',
    '0-49 employees': '0-49 employees',
    '50-249 employees': '50-249 employees',
}

# 'Company Size' distribution for the 2017 survey ...
company_size_2017 = value_counts(
    map_column(mcq_country_2017.copy(), 'Company Size', map_company_size),
    'Company Size',
)
company_size_2017['Year'] = 2017

# ... and for the current (2020) survey.
company_size_2020 = value_counts(
    map_column(mcq_country_current.copy(), 'Company Size', map_company_size),
    'Company Size',
)
company_size_2020['Year'] = 2020

company_size_historical = pd.concat([company_size_2017, company_size_2020])

# One bar panel per company size, comparing the two years.
ax = sns.catplot(
    x='Year',
    y='Percentage',
    aspect=0.9,
    col='Company Size',
    data=company_size_historical,
    sharex=False,
    col_wrap=3,
    kind='bar',
    palette=COLOR_PALETTE,
)
plt.subplots_adjust(top=0.9)
ax.fig.suptitle("Historical Company Size Comparison for "+country);


In [None]:
response("Me: Even if you see the historical you will see a huge rise in people working in early stage startups. \
Some of the promising startups are recognized by Niti Ayog. ")

pm("PM: Absolutely, we support startups through Startup Indian Initiative, and this way people are getting jobs, investors are getting money, the government is getting support of startups to solve real life problems which is plus for everyone.")

response("Me: Absolutely! Look at the next chart, it is exciting as well.")

In [None]:
# Row-level frames for 2017 and 2020 after map_column is applied to 'Company Size'.
# NOTE: these names are reused from the previous cell, where they held
# value-count summaries rather than row-level data.
company_size_2017 = map_column(mcq_country_2017.copy(), 'Company Size', map_company_size)
company_size_2017['Year'] = 2017

company_size_2020 = map_column(mcq_country_current.copy(), 'Company Size', map_company_size)
company_size_2020['Year'] = 2020

cols= ['Job Title', 'Year', 'index', 'Company Size']
company_size_historical = pd.concat([ company_size_2017[cols], company_size_2020[cols] ])

# Job-title spellings collapsed onto one label per role ('Student' is not listed here).
dict_job_title_company = {'Data Scientist' : 'Data Scientist', 'Software Developer/Software Engineer': 'Software Engineer', 'Software Engineer': 'Software Engineer',
                 'Scientist/Researcher':'Research Scientist', 'Research Scientist':'Research Scientist'}

# Company Size x Job Title breakdown for each of the two years.
company_jt_2017 = multiple_group_by(company_size_2017, ['Company Size', 'Job Title'],  map_col2=dict_job_title_company, year= 2017)
company_jt_2020 = multiple_group_by(company_size_2020, ['Company Size', 'Job Title'],  map_col2=dict_job_title_company, year = 2020)

company_jt_concat = pd.concat([company_jt_2017, company_jt_2020])

ax = sns.catplot(x='Year', y='Percentage', hue='Job Title', aspect=0.9,col='Company Size', data = company_jt_concat, sharex= False, col_wrap= 3, kind='bar', palette=COLOR_PALETTE)
plt.subplots_adjust(top=0.9)
# The panels show job titles per company size, so the title says so
# (it previously read "Degree Holders").
ax.fig.suptitle("Historical Company wise Job Title Comparison for "+country);


In [None]:
response("Me: And another interesting fact is that most of the Data Scientists and research scientists are working in Startups only. This shows the fact the startups are AI Driven. While the Software Engineering roles are increasing in IT and Product based companies, these companies are seeing a declining trend in terms of Data Scientists being employed.")

In [None]:

# pm("PM: I am convinced that startups can solve huge problems for government like unemployment. Once the youth starts getting employed they will create more jobs, like I am assuming you would have kept a cook at home to prepare food for you so indirectly your employment is giving you the priviledge to create more jobs. So Harveen, what does it takes in terms of infrastructure to start a AI/ML startup? ")

# response("Me: I 100% agree with you sir")

pm("PM: What do you mean by AI driven?")

response("Me: Like you have a manifesto for election where you mention your priorities and focus, similarly startups also are focused on certain technologies and companies who are targeting AI as a sole technology are AI Driven.")

# <a>Current Company using ML</a>

In [None]:
## Current Company ML?

# 'ML Status in Company' distribution for `country`, for the rest of the world, and both combined.
current_ML = value_counts(mcq_country_current, column ='ML Status in Company') 
current_ML_row = value_counts(mcq_row_current, column ='ML Status in Company') 
current_ML_concat = combine_row_country(current_ML, current_ML_row)

# Shorter, line-broken axis labels for the survey answers. The mapping has its
# own name so that the builtin `dict` is not shadowed for the rest of the notebook.
ml_status_labels = {'We are exploring ML methods (and may one day put a model into production)' : 'Exploring ML methods \n(May put a model into production one day)',
        'We recently started using ML methods (i.e., models in production for less than 2 years)': 'Recently started using ML methods \n(models in production < 2 years)',
       'We have well established ML methods (i.e., models in production for more than 2 years)':'Well established ML methods \n(models in production > 2 years)',
       'No (we do not use ML methods)':'No, we do not use ML methods',
       'I do not know':'No idea',
       'We use ML methods for generating insights (but do not put working models into production)':'ML models for Generating insights \n(but do not put working models into production)'}

# Answers that are not keys of the mapping become NaN after .map().
current_ML_concat['ML Status in Company'] = current_ML_concat['ML Status in Company'].map(ml_status_labels)


fig, ax= plt.subplots(figsize=(10,7))

show_bar_plot(y='ML Status in Company' ,x= 'Percentage', hue='Geography', data = current_ML_concat, axis = 'horizontal', xlabel='Percentage', ylabel='',title='State of AI in production for '+country+' in '+str(current_year))


In [None]:
response("Me: See this graphic sir. This answers your question.<br> \
<br>1. 17 % of the people don\'t even know if ML or DL is being used in their company. This is bad, if they don\'t even know about their company how will they advance into higher roles.<br>\
<br>2. 16%, I repeat 16% people have bluntly said that they do not use ML or DL in their company.  <br>\
<br>3. Also 22% are just exploring the capabilities of Machine Learning and have not put a model into production till Date. This means they do not have an end product ready which uses AI as of now. <br>\
<br>4. But all is not lost. Still 18% of the companies are using ML from past 2 years and have deployed models while the other 16% have just started and catching up. ")

In [None]:
# ML adoption status of the comparison country, combined with `country`; comparison rows are labelled 'USA'.
current_ML_compare = value_counts(mcq_country_compare_current, 'ML Status in Company')
current_ML_compare = combine_row_country(current_ML, current_ML_compare, 'USA')

show_point_plot(
    x='ML Status in Company',
    y='Percentage',
    hue='Geography',
    data=current_ML_compare,
    title='State of AI in '+country+' vs '+country_to_compare+' for '+ str(current_year),
)

In [None]:
response("Me: A worrysome statistic if you compare with our competitor USA is that they have about 9% more companies who are using AI in their products or services. This shows that how early and steadily they have adopted the technology and we should also do the same. ")

# pm("PM: So which are the companies that are incubating this technology more? Is it the startups or service based?")

# response("Me: Here you go sir")

In [None]:
# response("Me: In Large IT companies the technology has been adapted quite quickly. But this adaptation is not uniform, the inexperienced professionals have also increased. \
#          On the other hand, in startups or product companies, experienced techies have increased which is a very good sign for us.")
# pm("PM: Great! How are these individuals compensated? I know as a PM, I should not have such interests, but I want to understand if people with certain skills, degrees, gender are paid differently.")


pm("PM: How were they able to scale up so fast?")
response("Me: Due to their people who have done so much hard work.")
pm("PM: So Indians are not hard working?")
response("Me: Sir, some of the most hard working Indians have migrated to US. I hope you met a lot of them in your previous US trip")
pm("PM: That is correct. So why is it happening? Are they not earning enough in India?")
response("Me: Compensation is a million dollar question sir. And this is one reason why India is loosing so much of its talent")

pm("PM: Show me with data")

response("Me: Absolutely!")

# <a>Compensation</a>


In [None]:
## Compensation

# 'Compensation Status' distribution for `country`, for the rest of the world, and both combined.
compensation = value_counts(mcq_country_current, 'Compensation Status')
compensation_row = value_counts(mcq_row_current, 'Compensation Status')
compensation_concat = combine_row_country(compensation, compensation_row)

fig, ax= plt.subplots(figsize=(12,16))
# The former `plt.legend(loc=20)` call was removed: 20 lies outside matplotlib's
# documented 0-10 location codes, and it ran before anything was drawn on the axes.

show_bar_plot(y='Compensation Status' ,x= 'Percentage', hue='Geography',  data = compensation_concat, axis = 'horizontal', xlabel='Percentage', ylabel='Salary Range in USD', title='Salary Range of Data Scientists for '+country+" in "+str(current_year))


In [None]:
response("Me: <br>1. A lot of folks are just doing it as an hobby as they are not employed or they are just interning given the fact we had a large number of student ratio<br>\
<br>2. Apart from that the salary ranges mostly from 0-20000 USD which is not high enough even from Indian Standards. Maybe these ranges indicate the opening salary of fresh hires in digital units of some big MNC's<br>\
<br>3. Another popular bracket is 10000- 20000 USD which can account for people working in the Industry from past 3-5 years and might have recently adopted data science")

In [None]:
# Salary-range distribution of the comparison country, combined with `country`;
# comparison rows are labelled 'USA'.
compensation_compare = value_counts(mcq_country_compare_current, 'Compensation Status')
compensation_compare = combine_row_country(compensation, compensation_compare, 'USA')
compensation_compare = compensation_compare.sort_values(by='Compensation Status')

# Re-order the rows by label length. The alphabetical sort above only survives
# as the tie-breaker if this sort is stable, so mergesort is requested
# explicitly (the default quicksort gives no stability guarantee).
s = compensation_compare['Compensation Status'].str.len().sort_values(kind='mergesort').index
compensation_compare = compensation_compare.reindex(s)


fig, ax = plt.subplots(figsize=(16,5))
sns.pointplot(x='Compensation Status' ,y= 'Percentage',hue='Geography', data = compensation_compare, color=colors[0] )
plt.xticks(rotation=90);
# Title names both countries once (it previously read "... vs ... for India in ...").
plt.title('Salary Ranges of '+country+' vs '+country_to_compare+' in '+str(current_year));

In [None]:
response("Me: This is a chart that is another game changer. \
    You can clearly see that Indian's mostly earn in the range 10,000 to 20,000 USD which is very low if you compare it to US. In US, a data Scientist earns somewhat between 100k - 150k which is almost 10 times as that of Indian salary.")

pm("PM: Yes but the standard of living is also high in US, isn't it?")
    
response("Me: Yes sir, but still more the money more people can save also")

pm("PM: Can you please convert these compensation ranges to Indian salary?")

response("Me: Sure sir. L means Lakh (1,00,000 Indian Rupees) and C means Crore (1,00,00,000 Indian Rupees)")

In [None]:

# Salary bands labelled in Indian Rupees (L = Lakh, C = Crore), each listing the
# USD 'Compensation Status' ranges it covers.
dict_salary = {'Band 0: 0 - 7.5L': ['0-10,000'] , 
'Band 1: 7.5L - 14.2L': ['10,001-20,000'],
'Band 2: 14.3L - 21.3L': ['20,001-30,000'], 
'Band 3: 21.4L - 35.6L': ['30,000-39,999', '40,000-49,999'],
'Band 4: 35.7L - 49.8L': ['50,000-59,999', '60,000-69,999'],
'Band 5: 49.9L - 64.1L': ['70,000-79,999', '80,000-89,999'],
'Band 6: 64.2L - 89L': ['90,000-99,999','100,000-124,999'],
'Band 7: 89.1L - 1.42C': ['125,000-149,999','150,000-199,999'],
'Band 8: 1.42C - 2.1C': ['200,000-249,999','250,000-299,999'],
'Band 9: 2.2C - 3.5C': ['300,000-500,000'],
'Band 10: > 3.5C': ['> $500,000']}

# Invert the mapping: USD range -> band label.
reverse_dict = {
    usd_range: band
    for band, usd_ranges in dict_salary.items()
    for usd_range in usd_ranges
}

# Built once, after the mapping is complete. This statement used to be indented
# into the inner loop, so the copy and mapping were repeated for every salary range.
# The variable name says 2019, but the data is the current survey and is tagged 2020.
salary_2019 = map_column(mcq_country_current.copy(), 'Company Size', map_company_size)
salary_2019['Year'] = 2020

# Compensation band x Company Size breakdown, ordered by band label.
company_ml_2019 = multiple_group_by(salary_2019, ['Compensation Status', 'Company Size'], map_col1=reverse_dict)
company_ml_2019 =  company_ml_2019.sort_values('Compensation Status')

company_ml_2019 = break_labels(company_ml_2019, 'Company Size')
ax = sns.catplot(x='Company Size', y='Percentage', col='Compensation Status', aspect=0.95,  data= company_ml_2019, col_wrap=3, kind='bar', sharex= False, color=colors[0] )
plt.subplots_adjust(top=0.95)
ax.fig.suptitle("Compensation variation according to company for "+country);


In [None]:
response("Me: So if you see a company size wise breakup for the compensation structured in bands, you will be able to observe startups provide the most internships. While it is highly unclear due to lack of data the entire salary group for different companies, but IT companies are not the best paymasters.")

In [None]:
# Current-survey rows after map_column is applied to 'Job Title', tagged with the year 2020.
salary_2019 = map_column(mcq_country_current.copy(), 'Job Title', dict_job_title_company)
salary_2019['Year'] = 2020

# Compensation band x Job Title breakdown, ordered by percentage.
company_ml_2019 = multiple_group_by(
    salary_2019,
    ['Compensation Status', 'Job Title'],
    map_col1=reverse_dict,
)
company_ml_2019 = company_ml_2019.sort_values('Percentage')

# One horizontal bar panel per job title.
ax = sns.catplot(
    col='Job Title',
    x='Percentage',
    y='Compensation Status',
    aspect=0.5,
    height=10,
    data=company_ml_2019,
    col_wrap=3,
    kind='bar',
    sharex=False,
    color=colors[0],
)
plt.subplots_adjust(top=0.9)
ax.fig.suptitle("Salary comparison according to Job Title for "+country+' in '+str(current_year));


In [None]:
response("Me: The salary for software engineer varies a lot. But out of Data Scientist and Research Scientists, Research Scientists have a well defined salary range.")

In [None]:
# Current-survey rows after map_column is applied to 'Degree', tagged with the year 2020.
salary_2019 = map_column(mcq_country_current.copy(), 'Degree', map_dict)
salary_2019['Year'] = 2020

# Compensation band x Degree breakdown.
company_ml_2019 = multiple_group_by(
    salary_2019,
    ['Compensation Status', 'Degree'],
    map_col1=reverse_dict,
)

# One horizontal bar panel per degree.
ax = sns.catplot(
    col='Degree',
    x='Percentage',
    y='Compensation Status',
    aspect=0.5,
    height=10,
    data=company_ml_2019,
    col_wrap=3,
    kind='bar',
    sharex=False,
    color=colors[0],
)

plt.subplots_adjust(top=0.9)
ax.fig.suptitle("Salary Comparison according to degree for "+country+' in '+str(current_year));

In [None]:
response("Me: If you observe from qualification perspective in terms of degree, you will observe that Bachelors are paid less on average than masters. \
    Masters are paid somewhat in the range 1CR-3CR. And on an average PHD's are also paid more than Masters.")

pm("PM: I think this is very much correct and will be present all around the globe, but still I can see some outliers. Some people with bachelors are earning more than 3.5 Cr.")

response("Me: Exceptions are everywhere sir.")

pm("PM: Exactly, nothing can stop talent. What about the gender pay gap in India?")

In [None]:
# Current-survey rows after map_column is applied to 'Gender' with a Male/Female mapping.
salary_2019 = map_column(mcq_country_current.copy(), 'Gender', {"Male":"Male", "Female":"Female"})
salary_2019['Year'] = 2020


# Compensation band x Gender breakdown.
company_ml_2019 = multiple_group_by(salary_2019, ['Compensation Status', 'Gender'], map_col1=reverse_dict)
# Respondent totals per gender, looked up by label instead of by row position,
# and summed over the 'index' count column only (not over every column).
gender_totals = company_ml_2019.groupby('Gender')['index'].sum()
total_female = gender_totals.get('Female', 0)
total_male = gender_totals.get('Male', 0)


def normalized_row(row):
    """Add 'Degree_P': the row's 'index' value as a percentage of its gender's total.

    Reads the module-level ``total_female`` / ``total_male`` values. Rows whose
    'Gender' is neither 'Female' nor 'Male' are returned without a 'Degree_P' entry.

    Args:
        row: mapping-like record with 'Gender' and 'index' entries (a DataFrame row
            when used with ``DataFrame.apply(..., axis=1)``).

    Returns:
        The same ``row`` object, updated in place where applicable.
    """
    gender = row['Gender']
    if gender == 'Female':
        gender_total = total_female
    elif gender == 'Male':
        gender_total = total_male
    else:
        # Any other gender value: leave the row untouched.
        return row

    row['Degree_P'] = (row['index'] / gender_total) * 100
    return row

# Add the per-gender percentage column ('Degree_P') one row at a time.
company_ml_2019 = company_ml_2019.apply(normalized_row, axis=1)

# Salary bracket on the x axis (labels rotated), one bar per gender.
# show_bar_plot is a notebook helper defined in an earlier cell.
fig, ax = plt.subplots(figsize=(14, 8))
plt.xticks(rotation=90)
show_bar_plot(
    'Compensation Status',
    'Degree_P',
    hue='Gender',
    data=company_ml_2019,
    title='Salary Brackets of respondents with respect to gender in 2020',
    xlabel='Salary Range',
    ylabel='Percentage',
    show_percent=False,
);

In [None]:
# Narrative cell: conclusion on the gender pay gap, then the transition to company spending.
# response() / pm() are notebook helpers defined in earlier cells; the strings carry HTML tags
# (<b>), so they are presumably rendered as HTML.
response("Me: Seeing the chart, it is very hard but still it reveals. I don't want to say this but yes, <b>There is a gender pay gap.</b>")

pm("PM: At an entry level package there are a lot of females which indicates they are paid less and as the package increases the number of females reduce. Indeed another negative point. \
I think we have discussed enough on the personal Information. Let's talk about the investment companies are making.")

# NOTE(review): displayed text below contains typos - "priviledge" (privilege),
# "what does it takes" (take), "a AI/ML" (an AI/ML).
pm("PM: I am convinced that startups can solve huge problems for government like unemployment. Once the youth starts getting employed they will create more jobs, like I am assuming you would have kept a cook at home to prepare food for you so indirectly your employment is giving you the priviledge to create more jobs. So Harveen, what does it takes in terms of infrastructure to start a AI/ML startup? ")

# NOTE(review): this reply does not answer the question asked just above - confirm the intended order.
response("Me: I 100% agree with you sir")


pm("PM: Interesting. I want to understand how hard is it to start a company in this field and what is the amount of investment needed? I am interested because I strongly believe in the founders of startups and want to be in their shoes for a while.")

response("Me: That's great, this is a very hardware intensive field which means that a lot of money is spent on Hardware rather than software. Let's see how companies are spending money")


# <a>Money Spent on Cloud Computing/ ML Products</a>

In [None]:
## Money Spent

# 'Money Spent' distribution for the focus country and for the rest of the world,
# stacked into one frame for a side-by-side bar chart.
# value_counts / combine_row_country / show_bar_plot are notebook helpers defined in earlier cells.
money = value_counts(mcq_country_current, 'Money Spent')
money_row = value_counts(mcq_row_current, 'Money Spent')
money_concat = combine_row_country(money, money_row)

fig, ax = plt.subplots(figsize=(8, 6))
plt.xticks(rotation=90)
show_bar_plot(
    x='Money Spent',
    y='Percentage',
    hue='Geography',
    data=money_concat,
    axis='vertical',
    xlabel='Money Spent',
    ylabel='Percentage',
    title='Money Spent on CC/ML products in '+str(current_year),
)

# Same distribution against the comparison country (labelled 'USA'); plotted in the next cell.
money_compare = combine_row_country(
    money,
    value_counts(mcq_country_compare_current, 'Money Spent'),
    'USA',
)


#plt.xticks(rotation=90)


In [None]:
# Narrative first, then the point plot it refers to (built from money_compare).
# response() / show_point_plot are notebook helpers defined in earlier cells.
response("Me: As it is clear from the graph, there is not much investment from Indian point of view. Companies are investing but not that much. <br>\
<br> On the other hand you can see Americans companies are making significant investment in Hardware and Cloud products. There is a gap of about 10% which needs to be filled.")

show_point_plot(
    x='Money Spent',
    y='Percentage',
    hue='Geography',
    data=money_compare,
    title='Money Spent on ML products for India vs United States of America in 2020',
)

In [None]:

# Narrative cell: closes the spending discussion and introduces the media sources / platforms section.
# response() / pm() are notebook helpers defined in earlier cells.
pm("PM: Absolutely, when I talk about making smart cities in India, I often stress on the fact that our infrastructure needs to improve if we need to develop smarter roads. Same is the case here, we can't help companies in any way but through Startup India we can just consult the companies on this.")

response("Me: Absolutely Sir, which brings me to the last point in today's discussion")

pm("PM: Go ahead! I think I have now the full knowledge on what to do, just let me know how to spread the message")

response("Me: This is exactly what my last point is. Let's look at the Media Sources and Platforms Indian students are using for AI")


# <a>Media Sources & Platforms</a>


In [None]:
# Raw 2020 survey export, kept exactly as read.
df_2020 = pd.read_csv('../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv')

# Row 0 of this export holds the question text, not a respondent. Its Q3 value is not
# 'India', so a plain "not India" filter would count it as a rest-of-world respondent.
# Exclude it before splitting.
responses_2020 = df_2020.iloc[1:]
is_india = responses_2020['Q3'] == 'India'

df_2020_india = responses_2020[is_india]
df_2020_row = responses_2020[~is_india]
df_2020_india.head()

In [None]:
## Media Sources
limit = 10

# Q39 multi-select answers for India and for the rest of the world; each Count is then
# expressed as a share of that group's total Count.
# multiple_answers / combine_row_country / show_bar_plot are notebook helpers defined in earlier cells.
media = multiple_answers(limit, df_2020_india, 'Q39_Part_', 'Media Sources')
media['Percentage'] = media['Count'] / media['Count'].sum() * 100

media_row = multiple_answers(limit, df_2020_row, 'Q39_Part_', 'Media Sources')
media_row['Percentage'] = media_row['Count'] / media_row['Count'].sum() * 100

media_concat = combine_row_country(media, media_row)

#media_sources
fig, ax = plt.subplots(figsize=(12, 10))
show_bar_plot(
    x='Percentage',
    y='Media Sources',
    hue='Geography',
    data=media_concat,
    axis='horizontal',
    xlabel='Percentage',
    ylabel='Source',
    title='Data Science Media Sources',
)
#media_concat

# media_compare = multiple_answers(limit, mcq_country_compare, 'Q39_Part_', 'Media Sources')
# media_compare['Percentage']= (media_compare.Count /  media_compare.Count.sum()) * 100
# media_compare = combine_row_country(media, media_compare, 'USA')


# fig, ax= plt.subplots(figsize=(15,5))
# show_point_plot(x='Media Sources' ,y= 'Percentage',hue='Geography', data = media_compare, title='Media Sources for '+country+' vs '+country_to_compare)


In [None]:
# Narrative cell: takeaways from the media-sources chart.
# The text is a numbered list separated with <br>; every item needs its own break,
# otherwise items 2 and 3 run together on one rendered line.
response("Me: 1. So sir, Kaggle is the most preferred platform for Data Science in India.<br> \
2. Other Digital Media like Towards Data Science, Analytics Vidhya are preferred because of blog posts and all.<br> \
3. Apart from that youtubers also spread the work.\
")

pm("PM: Great! Why don't you create your youtube channel. I feel you will get a lot of subscribers")

response("Me: Sure sir, it is work in progress as of now.")

In [None]:
## Platforms
limit = 10

# Q37 multi-select answers for India and for the rest of the world; each Count is then
# expressed as a share of that group's total Count.
# multiple_answers / combine_row_country / show_bar_plot are notebook helpers defined in earlier cells.
media = multiple_answers(limit, df_2020_india, 'Q37_Part_', 'Platforms')
media['Percentage'] = media['Count'] / media['Count'].sum() * 100

media_row = multiple_answers(limit, df_2020_row, 'Q37_Part_', 'Platforms')
media_row['Percentage'] = media_row['Count'] / media_row['Count'].sum() * 100

media_concat = combine_row_country(media, media_row)

#media_sources
fig, ax = plt.subplots(figsize=(12, 10))
show_bar_plot(
    x='Percentage',
    y='Platforms',
    hue='Geography',
    data=media_concat,
    axis='horizontal',
    xlabel='Percentage',
    ylabel='Source',
    title='Data Science Platform Sources',
)
#media_concat

# media_compare = multiple_answers(limit, mcq_country_compare, 'Q37_Part_', 'Platforms')
# media_compare['Percentage']= (media_compare.Count /  media_compare.Count.sum()) * 100
# media_compare = combine_row_country(media, media_compare, 'USA')


# fig, ax= plt.subplots(figsize=(15,5))
# show_point_plot(x='Platforms' ,y= 'Percentage',hue='Geography', data = media_compare, title='Data Science Platform sources for '+country+ 'vs '+country_to_compare)


In [None]:
# Narrative cell: takeaways about learning platforms, then the PM's question about Indian platforms.
# response() / pm() are notebook helpers defined in earlier cells.
# NOTE(review): unlike the other response() calls, this text has no "Me: " prefix, and
# "Plaforms" is misspelled in the displayed text.
# NOTE(review): the text describes Coursera as "from China"; Coursera is a US company -
# confirm whether this is intentional for the storyline.
response("Regarding Plaforms for Data Science where students study.<br> \
         1. Coursera (founded by Andrew Ng) from China leads the race followed by Udemy and Kaggle <br> \
         2. An interesting fact in this chart is that close to 13% Americans attain knowledge from Universities as compared to 9% in India<br> \
         ")

pm("PM: You know that our government is on a blocking spree. So I want to ask, are there any Indian platforms that are going ahead?")

In [None]:
# Word cloud of the free-text "other" answers to Q13 in the 2019 survey.
# NOTE(review): no country filter is applied, so the cloud reflects every answer in the
# file, not only those from `country`.
# NOTE(review): if this export also carries a question-text row at the top, its words end up
# in the cloud too - confirm and drop it if so.
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

df_2019 = pd.read_csv('../input/kaggle-survey-2019/other_text_responses.csv')

# Build both columns in one step so 'count' is 1 on every row. Assigning the scalar to an
# empty frame first creates a zero-length column that turns into NaN once 'ML_algo' is added.
# Answers are lower-cased so differently-cased spellings of the same name collapse together.
text_response = pd.DataFrame({
    'count': 1,
    'ML_algo': df_2019['Q13_OTHER_TEXT'].str.lower(),
})

# Occurrences of each distinct answer, most frequent first. Assigned to a name because
# only the last expression of a cell is displayed; a bare expression here would be discarded.
answer_counts = (
    text_response.groupby('ML_algo')[['count']]
    .sum()
    .sort_values('count', ascending=False)
)

# Create wordcloud
plt.figure(figsize=[14,8])

# Create and generate a word cloud image from all non-missing answers joined into one string.
# random_state is fixed so the layout is reproducible between runs.
ide_words = ' '.join(text_response['ML_algo'].dropna().values)
wordcloud = WordCloud(colormap="tab10",
                      width=1200,
                      height=480,
                      normalize_plurals=False,
                      background_color="white",
                      random_state=5).generate(ide_words)

# Display the generated image:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# <a>Conclusion</a>

In [None]:
# Narrative cell: discussion of Indian learning platforms (NPTEL and others) and the PM's wrap-up.
# response() / pm() are notebook helpers defined in earlier cells.
response("Me: Absolutely sir, you will be so happy to know that nptel is one of the leading platforms in India")

# NOTE(review): "It an initiative" in the displayed text is missing a verb ("It is an initiative").
pm("PM: Oh yes, how could I forget National Programme on Technology Enhanced Learning.<br><br>\
It an initiative by seven Indian Institutes of Technology (IIT Bombay, Delhi, Guwahati, Kanpur, Kharagpur, Madras and Roorkee) and Indian Institute of Science (IISc) for creating courses.<br><br>\
This is the biggest plus point in our discussion today. Thanks for bringing this up.")

response("Me: No problem sir. But not only nptel, upgrad, applied AI course, analytics vidhya, great learning are also the preferred choices for some students")

pm("PM: But why are these not popular as Coursera?")

response("Me: Maybe they need to change the structure of the way they are making students learn and include a lot of inputs from industry professionals.")

pm("PM: I see! Yes making people learn is also an art.")

pm("PM: Well, That's it then.<br><br>\
I can't thank you enough for all this detailed analysis. I have got a lot of working points for me.")

# Narrative cell: the PM's summary of the positives and negatives raised in the conversation,
# passed as one HTML string with <br> separators.
pm("PM: Positives: <br>\
1.	Youth Percentage Increasing <br><br>\
2.	Ratio of girls pursuing PHD’s increased by 1.5 times from 2017 <br><br>\
3.	Strong Startup Presence. <br><br>\
4.	Startups creating employment for young generation<br><br>\
5.	NPTEL creating valuable courses <br><br>\
6.	Great startup ecosystem in Edtech space. <br><br>\
\
Negatives:<br>\
1.	Lack of Experienced professionals <br><br>\
2.	Only 22% females <br><br>\
3.	Only 5% PHD’s <br><br>\
4.	India has 3 times less researchers than ROW. <br><br>\
5.	10% unemployment rate <br><br>\
6.	Low salary as compared to other nations <br><br>\
7.	Gender Pay gap. <br><br>\
")
   


In [None]:
# Narrative cell: closing exchange of the conversation.
# response() / pm() are notebook helpers defined in earlier cells.
# NOTE(review): "yognas" in the displayed text is presumably meant to be "yojanas" - confirm.
response("Me: These are just a few points sir. I hope you will bring huge reforms in AI through your yognas.")
pm("PM: Absolutely. You know India has partnerships with other nations to strengthen the country’s artificial intelligence industry. These countries are Germany, Singapore, Canada, Russia, UAE, China")

response("Me: That's great sir. I wish you all the best for the new policies and reforms.")
pm("PM: Anything you would like to suggest in particular?")
response("Me: Sir, I am waiting for a day when India will have their own minister of Artificial Intelligence.")
pm("PM: Minister of AI as appointed by Dubai. Sure, one of the candidates has to be you for that post" )
response("Me: Why do you think I suggested that?")
pm("PM: Haha. Thank you Harveen for all your hard work.")


I was just about to shake hands with the PM when the alarm went off. Even though Mr. Modi was keen to make me the Minister of AI, I was more interested in becoming the PM of India.

# <a>Closing Comments</a>

Thank you for taking the time to read this. AI in India is in a promising state, but brain drain has hurt India a lot and will continue to do so if no action is taken in time.

I just want to thank myself for continuously devoting time to this kernel, as I promised myself at the start of this competition, and thank you for reading it up to here. If you enjoyed this conversation, please don't forget to upvote. I want to write a book someday, so if you have any feedback, please let me know!