# Loading Data and Libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')

# import libraries
import pandas as pd
import numpy as np

# for visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Five-colour palette reused by every chart in the notebook.
colors = ["#fcba03", "#fc6703", "#870077", "#ff0073", "#0056a6"]
sns.set_style('whitegrid')
sns.set_palette(colors)

# Matplotlib defaults, one entry per rc group. Each entry is forwarded
# unchanged to plt.rc(group, **settings) in the loop below.
rc_groups = {
    "figure": dict(figsize=(13, 7)),
    "axes": dict(
        linewidth=2,
        labelsize=14.5,
        labelweight='regular',
        labelcolor='#000000',
        labelpad=13,
        titlesize=18,
        titleweight=550,
        titlecolor="#000000",
        edgecolor='#000000',
        titlepad=50,
    ),
    "text": dict(color='#555'),
    "xtick": dict(labelsize=14, labelcolor='#000000'),
    "ytick": dict(labelsize=14, labelcolor='#000000'),
    "grid": dict(ls="--", linewidth=1, c="#aaa"),
    "legend": dict(
        shadow=False,
        fancybox=True,
        edgecolor='#000000',
        handlelength=0.8,
        fontsize=12,
        fc="#ffffff",
    ),
}
for rc_group, rc_settings in rc_groups.items():
    plt.rc(rc_group, **rc_settings)

# Font family is kept as its own named dict and applied last.
font = {'family': 'fantasy', 'fantasy': 'Chicago'}
plt.rc("font", **font)

# Lift pandas' display truncation for both rows and columns.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)


# Input files (Kaggle dataset mount points).
survey_csv = "../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv"
analyst_csv = "../input/kaggledataset/KaggleDataAnalyst2021.csv"

kaggle_data = pd.read_csv(survey_csv)
# The first CSV column is used as the row index.
data = pd.read_csv(analyst_csv, index_col=0)

# Sequential 0..n-1 id column; later cells tally it with groupby(...).count()['id'].
data['id'] = range(len(data))
data.head(3)

In [None]:
# Dimensions of `data` as a (rows, columns) tuple.
data.shape

# Percentage of Women in Data Analytics

In [None]:
# Share of respondents per gender, in percent (one decimal), sorted ascending.
gender_counts = data.groupby("Gender")['id'].count()
gender = (gender_counts * 100 / gender_counts.sum()).round(1).sort_values(ascending=True)

# Horizontal bars; ascending order puts the smallest share at the bottom.
gender.plot(kind='barh')

# Boxed percentage label placed 4 units to the right of each bar's end.
label_box = dict(fc='white', ec='black', boxstyle='round, pad=0.5')
for row, pct in enumerate(gender):
    pct = round(pct, 2)
    plt.text(pct + 4, row, s=f"{pct}%", ha='center', va='center',
             fontsize=12, fontweight=900, bbox=label_box, color='#000')

plt.xlabel("Percentage")
plt.ylabel("Gender")
plt.title("Percentage of Women in Data Analytics", loc='left', fontstyle='italic')

# Subtitle drawn in data coordinates above the top bar.
subtitle = "Gender gap observed in Data Anlytics aswell, as Women represent roughly 22% respondent, \nwhere as Men represent 76% respondent as Data Analyst"
plt.text(x=0, y=4.6, s=subtitle, fontsize=14.5)

# Extra room on the right so the labels are not clipped.
plt.xlim(0, 90)

plt.show()

# Age ranges of Data Analyst

In [None]:
# Keep only respondents whose Gender is 'Man' or 'Woman'.
is_man_or_woman = data.Gender.isin(['Man', 'Woman'])
a = data[is_man_or_woman]

# Respondent count per (Age, Gender) pair, as a one-column frame named 'id'.
age = a.groupby(['Age', 'Gender'])['id'].count().to_frame()

# One bar per gender within each age group (Gender is unstacked into columns).
age_by_gender = age.unstack()
age_by_gender.plot(kind='bar', color=colors[2:], width=0.7)

plt.xlabel("Age group")
plt.ylabel("Count")
plt.title("Age Count of Data Analyst", loc='left', pad=40, fontstyle='italic')
plt.text(x=-0.6, y=475,
         s="50% of kaggle respondent women Data Analyst are in twenties",
         fontsize=14.5)

# Replace the default (column-tuple) legend entries with plain labels.
plt.legend(['Man', 'Woman'])
plt.show()

# Country

In [None]:
# Data preparation: rows where Gender is 'Woman' (reused by the following cell).
woman_data = data[data.Gender == 'Woman']

# Ten countries with the most such respondents, largest first.
country = (
    woman_data.groupby('Country')['id']
    .count()
    .sort_values(ascending=False)
    .to_frame()
    .head(10)
)

# Bar chart of the counts.
country.plot(kind='bar')

plt.ylabel('Count')
plt.title("Most Common Countries of Kaggle Woman Data Analyst",
          loc='left', fontstyle='italic', pad=40)
plt.text(-0.5, 150,
         "Common countries among the kaggle women data analyst respondents are India and USA",
         fontsize=14.5)

plt.legend(['Country'])
plt.show()

# Women in Data Analytics in India

In [None]:
# Rows of `woman_data` whose Country is 'India' (reused by the following cells).
df = woman_data[woman_data.Country == 'India']

# Percentage of those respondents per highest education level.
# The shares are computed over ALL education levels first, then sorted, and only
# then cut to the five largest. Slicing before sorting (as this cell did
# previously) keeps the first five categories in groupby key order instead of
# the five biggest ones.
education_counts = df.groupby('HighestEducation')['id'].count()
education_share = education_counts * 100 / education_counts.sum()
grouped_data = education_share.sort_values(ascending=False).head(5).to_frame()

# plot
grouped_data.plot(kind='bar', color=colors[-1], width=0.3)

# Annotation: boxed percentage label 3 units above each bar.
label_box = dict(fc='white', pad=0.5, ec='black', boxstyle='round')
for i, val in enumerate(grouped_data['id']):
    val = round(val, 2)
    plt.text(i, val + 3, s=str(val) + "%", ha='center', va='center',
             fontsize=12, color='#000000', fontweight=600, bbox=label_box)

plt.xlabel('Highest Education')
plt.ylabel('Percentage')
plt.title("Education levels of Women in Data Analytics", loc='left', fontstyle='italic')

text = """More than 80% of Women Data Analyst's in India either having Master's or Bachelor's Degree, \nand 2% of Women Data Analyst have a Doctorate Degree"""
# Subtitle sits just above the y-limit set below.
plt.text(-0.4, 56.5, text, fontsize=14.5, wrap=True)

plt.ylim(0, 55)

plt.legend(['Highest Education'])
plt.show()

# Programming Languages

In [None]:
# Programming languages used on a daily basis, taken from the India subset `df`.

# Data preparation: non-null answers per column in the 'Python'..'MATLAB' range.
lang = df.loc[:, 'Python':'MATLAB'].reset_index(drop=True)
lang_df = (
    lang.count()
    .to_frame("Language")
    .sort_values(by='Language', ascending=False)
    .head(10)
)

# Bar chart of the ten most-used languages.
lang_df.plot(kind='bar', color=colors[2])

# Boxed count label 5 units above each bar.
label_box = dict(fc='white', pad=0.4, ec='black', boxstyle='round')
for pos, n_users in enumerate(lang_df['Language']):
    n_users = round(n_users, 2)
    plt.text(pos, n_users + 5, s=f" {n_users} ", ha='center', va='center',
             fontsize=12, fontweight=600, color='#000000', bbox=label_box)

plt.xlabel('Programming Language')
plt.ylabel('Count')
plt.title("Programming Language used by Data Analyst", loc='left', fontstyle='italic', pad=40)

text = "Python, SQL, and R are the top three programming languages used by Data Analyst on Daily Basis."
plt.text(-0.5, 135, text, fontsize=14.5, color='#555')

# Horizontal tick labels; y-limit leaves headroom for the count labels.
plt.xticks(rotation=0)
plt.ylim(0, 130)
plt.show()

# Data Visualization Libraries

In [None]:
# Visualization libraries used on a daily basis, taken from the India subset `df`.

# Data preparation: non-null answers per column in the 'Matplotlib'..'Leaflet' range.
lang = df.loc[:, 'Matplotlib':'Leaflet'].reset_index(drop=True)
lang_df = (
    lang.count()
    .to_frame("count")
    .sort_values(by='count', ascending=False)
    .head(10)
)

# Bar chart of the ten most-used libraries.
lang_df.plot(kind='bar', color=colors[1], alpha=0.9)

# Boxed count label 4 units above each bar.
label_box = dict(fc='white', pad=0.4, ec='black', boxstyle='round')
for pos, n_users in enumerate(lang_df['count']):
    n_users = round(n_users, 2)
    plt.text(pos, n_users + 4, s=f" {n_users} ", ha='center', va='center',
             fontsize=12, color='#000000', fontweight=600, bbox=label_box)

plt.xlabel('Data Visulization Libraries')
plt.ylabel('Count')
plt.title("Data Visualization Libraries used by Data Analyst", loc='left', fontstyle='italic')

text = "Matplotlib and Seaborn are the most common data visualization libraries used by Woman \nData Analyst in India on Daily Basis"
plt.text(-0.5, 112, text, fontsize=14.5, color='#555', wrap=True)

plt.xticks(rotation=0)
plt.ylim(0, 108)
plt.legend(['DataViz Libraries Count'])
plt.show()

# Relational Databases

In [None]:
# Data preparation: non-null answers per database column, ten largest.
# The start label 'MySQL ' carries a trailing space and is kept exactly as written.
db_columns = df.loc[:, 'MySQL ':'Spanner']
db = db_columns.count().sort_values(ascending=False).head(10)

# Drop databases with a zero count before plotting.
db_df = db[db > 0].to_frame('DBcount')

# Bar chart of the remaining databases.
db_df.plot(kind='bar', color=colors[3])

# Boxed count label 1.5 units above each bar.
label_box = dict(fc='white', pad=0.4, ec='black', boxstyle='round')
for pos, n_users in enumerate(db_df['DBcount']):
    n_users = round(n_users, 2)
    plt.text(pos, n_users + 1.5, s=f" {n_users} ", ha='center', va='center',
             fontsize=12, color='#000000', fontweight=600, bbox=label_box)

plt.xlabel("Relational Databases")
plt.ylabel("Count")
plt.title("Relation Databases used by Data Analyst on Daily Basis", loc='left', fontstyle='italic')

text = "SQL is one of most important language and used daily by Data Analyst, popular relational database \namong Data Analyst in India is MySQL"
plt.text(-0.5, 35, text, fontsize=14.5)

plt.ylim(0, 34)
plt.legend(['Databases'])

plt.show()

# Cloud Computing Platform used by Data Analyst

In [None]:
# Cloud computing platforms, taken from the India subset `df`.
# Data preparation: non-null answers per column in 'AWS'..'AlibabaCloud', top five.
cloud_counts = df.loc[:, 'AWS':'AlibabaCloud'].count()
cloud_df = cloud_counts.sort_values(ascending=False).head(5).to_frame('cloud')

# Donut chart: a pie whose wedges have width 0.35, shaded dark to light blue.
c = ['#193c73', '#134aa1', '#1c64d9', '#4886e8', '#7aacfa']
ring_style = dict(width=0.35, linewidth=3)
label_style = dict(size=18, color='#000')
plt.pie(
    cloud_df.cloud,
    labels=cloud_df.index.to_list(),
    startangle=-265,
    counterclock=False,
    wedgeprops=ring_style,
    textprops=label_style,
    colors=c,
)

# Title
plt.title("Top 5 most popular Cloud computing platform by Data Analyst",
          loc='center', pad=30, fontstyle='italic')

# Subtitle above the ring, and a caption at the ring's centre (0, 0).
plt.text(0, 1.24,
         'Amazon Web Services and Microsoft Azure are the most popular cloud platforms \namong Data Analyst in India',
         size=15, va='center', ha='center')
plt.text(0, 0, 'Cloud\nComputing\nPlatform',
         size=28, weight='demi', va='center', ha='center', color='#01234580')

plt.show()

In [None]:
# BI tools, taken from the India subset `df`.
# Data preparation: non-null answers per column in 'Quicksight'..'Thoughtspot', top five.
bi_tools = df.loc[:, 'Quicksight':'Thoughtspot'].count().sort_values(ascending=False)[:5]

# Donut chart: a pie whose wedges have width 0.45, coloured from the shared
# palette's first two entries plus three further orange tones.
c = [colors[0], colors[1], '#ff7b00', '#ff8717', '#f79436']
plt.pie(
    bi_tools,
    labels=bi_tools.index.to_list(),
    startangle=-75,
    counterclock=False,
    wedgeprops=dict(width=0.45, linewidth=3),
    textprops=dict(size=18, color='#000'),
    colors=c,
)

# Title
plt.title("Top 5 most popular BI tools by Data Analyst", loc='center', pad=20, fontstyle='italic')

# Subtitle above the ring, and a caption at the ring's centre (0, 0).
plt.text(0, 1.24, 'Tableau and PowerBI are the most popular BI tools in Data Analyst in India',
         size=15, va='center', ha='center')
plt.text(0, 0, 'BI Tools', size=30, weight='demi', va='center', ha='center')

# A single show() renders the figure; the duplicated second call was redundant.
plt.show()

# Data Analytics Industries

In [None]:
# Observation of the industries of women Data Analysts in India.
# Prepare data: survey rows with Q5 == 'Data Analyst', Q2 == 'Woman', Q3 == 'India'.
india_women_analysts = (
    (kaggle_data.Q5 == 'Data Analyst')
    & (kaggle_data.Q2 == 'Woman')
    & (kaggle_data.Q3 == 'India')
)
a = kaggle_data.loc[india_women_analysts, ['Q5', 'Q2', 'Q3', 'Q20']]

# Respondent count per Q20 value, largest first.
industry_counts = a.groupby('Q20')['Q2'].count().sort_values(ascending=False)
b = industry_counts[:10]

# Percentages are taken over ALL respondents who answered Q20, not just the
# ten plotted groups. Dividing by the top-10 subtotal overstated every bar and
# disagreed with the education chart, which normalises before truncating.
c = (b * 100) / industry_counts.sum()

# plot
c.plot(kind='bar', color=colors[2])

# Annotation: boxed percentage label 1.5 units above each bar.
label_box = dict(fc='white', pad=0.5, ec='black', boxstyle='round')
for i, val in enumerate(c.values):
    val = round(val, 1)
    plt.text(i, val + 1.5, s=str(val) + "%", ha='center', va='center',
             fontsize=12, color='#000000', fontweight=600, bbox=label_box)

plt.xlabel("Industry")
plt.ylabel("Percentage")
plt.title("Different Industries of Women Data Analyst", loc='left', fontstyle='italic')
text = "Percentage of woman work in academia as Data analyst is nearly same as the percentage of \nwoman work in Computer/Technology industry."
plt.text(-0.5, 36, text, fontsize=14.5)
plt.ylim(0, 35)
plt.show()

# Salary Comparison between top 2 industries

In [None]:
# Salary of women Data Analysts in India, split by industry.

# Prepare data: survey rows with Q5 == 'Data Analyst', Q2 == 'Woman', Q3 == 'India'.
india_women_analysts = (
    (kaggle_data.Q5 == 'Data Analyst')
    & (kaggle_data.Q2 == 'Woman')
    & (kaggle_data.Q3 == 'India')
)
a = kaggle_data.loc[india_women_analysts, ['Q20', 'Q22', 'Q25', 'Q5']]

# Keep only the computer/technology and academics/education industries, then
# give the question columns readable names.
top_industries = ['Computers/Technology', 'Academics/Education']
a = (
    a.loc[a.Q20.isin(top_industries)]
    .reset_index()
    .rename(columns={'Q20': 'Industry', 'Q22': 'CodingExp', 'Q25': 'Salary', 'Q5': 'DataAnalyst'})
)

# Shorten the salary bucket labels. The result is assigned back explicitly:
# `a.Salary.replace(..., inplace=True)` is a chained in-place call on a column,
# which does not update `a` under pandas Copy-on-Write, and the accompanying
# warning is hidden by the notebook-wide warnings filter.
salary_labels = {
    '10,000-14,999': '10k-15k', '$0-999': '0-1k', '200,000-249,999': '200k-250k',
    '5,000-7,499': '5k-7.5k', '1,000-1,999': '1k-2k', '7,500-9,999': '7.5k-10k',
    '4,000-4,999': '4k-5k', '25,000-29,999': '25k-30k', '15,000-19,999': '15k-20k',
    '300,000-499,999': '300k-500k', '2,000-2,999': '2k-3k', '3,000-3,999': '3k-4k',
    '40,000-49,999': '40k-50k', '30,000-39,999': '30k-40k',
}
a['Salary'] = a['Salary'].replace(salary_labels)

# Pivot table: non-null CodingExp answers per (Salary, Industry), one bar per industry.
salary_table = pd.pivot_table(a, values='CodingExp', index='Salary', columns='Industry', aggfunc='count')
salary_table.plot(kind='bar')

plt.xlabel('Salary in dollars')
plt.ylabel('Count')
plt.title("Woman Data Analyst Salary by Industries and Total Coding Experience", loc='left', fontstyle='italic', pad=60)

text = "Salary difference observed in academics and computer/technology industry, Highest Salary for \nData analyst in academia is between 30k-40k where as for computer it is between 300k-500k \nin India"
plt.text(-0.5, 16.1, text, fontsize=14.6)

plt.xticks(rotation=45)
# Legend labels are hard-coded in the pivot's (alphabetical) column order.
plt.legend(['Academics/Education', 'Computers/Technology'])
plt.show()