In [None]:
# Register the CSS classes (`section_title`, `subsection_title`) that the
# markdown heading <div>s in the rest of the notebook rely on.
# The names are imported from the public `IPython.display` module; the
# `IPython.core.display` path is deprecated for these helpers.
from IPython.display import display, HTML, Javascript
html_contents ="""
<!DOCTYPE html>
<html lang="en">
    <head>
    <style>
    
    .section_title{
        background-color: #C7AEBF;
        color: black;
        font-family: Verdana;
        font-weight: 500;
        font-size: 20px;
        padding: 6px 12px;
        margin-bottom: 5px;
    }
    .subsection_title{
        background: #D3CFD2;
        font-weight: 400;
        color: black;
        font-family: Verdana;
        font-size: 15px;
        padding: 6px 12px;
        margin-bottom: 0px;
    }
    </style>
    </head>
    
</html>
"""

# Last expression of the cell, so the notebook renders the HTML (and applies the styles).
HTML(html_contents)

# <div class="section_title">1. Notebook setup</div>

# <div class="subsection_title">1.0 Pyspark installation</div>
To use the Apache Spark Python library in a Kaggle notebook, we first need to install the PySpark library with pip.

In [None]:
#Install Pyspark
!pip install pyspark
!pip install findspark

# <div class="subsection_title">1.1 Import Python libraries for data analysis</div>

In [None]:
import findspark
# init() has to be called: it locates the Spark installation and makes
# pyspark importable. A bare `findspark.init` only references the function.
findspark.init()

# Apache Spark
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.sql.types import StructType,StructField,StringType,IntegerType
import os
from pyspark.sql.functions import count,when,col
import pyspark.sql.functions as func

#Plotting Libraries
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import functools
from pyspark.sql import DataFrame

# Twelve-step blue-to-grey palette passed to the seaborn bar plots and
# matplotlib pie charts in this notebook.
cust_color = [
    '#7ec2f7', '#73b5e8', '#92b1c9', '#9eb0bd', '#a5afb6', '#abaeb0',
    '#a9a9a9', '#b3b3b3', '#bdbdbd', '#c6c6c6', '#d0d0d0', '#dadada',
]

In [None]:
#reused from - https://github.com/myrthings/catscatter/blob/master/catscatter.py
# More info about this function is available at
# - https://towardsdatascience.com/visualize-categorical-relationships-with-catscatter-e60cdb164395
# - https://github.com/myrthings/catscatter/blob/master/README.md
def catscatter(df,colx,coly,cols,annotation,color=['grey','black'],ratio=10,font='Helvetica',save=False,save_name='Default'):
    '''
    Goal: Draw a scatter plot for two categorical variables. Each (colx, coly) pair found in df
    becomes one marker whose size is driven by `cols` and which is labelled with its '%count' value.
    Input:
        - df: required. pandas DataFrame with the two categorical columns, the value column `cols`
            and a '%count' column holding the number printed next to each marker.
            The DataFrame is not modified; the encoding is done on a copy.
        - colx: required. The name of the column to display horizontally
        - coly: required. The name of the column to display vertically
        - cols: required. The name of the column with the value between the two variables (marker size)
        - annotation: required by the signature, kept for backward compatibility. It is not read:
            the marker labels always come from the '%count' column.
        - color: optional. Colors to display in the visualization, the length can be two or three.
            The two first are the colors for the lines in the matrix, the last one the font color and markers color.
            default ['grey','black']
        - ratio: optional. A ratio for controlling the relative size of the markers.
            default 10
        - font: optional. The font for the ticks on the matrix.
            default 'Helvetica'
        - save: optional. True for saving the figure as an image in the current working directory.
            default False
        - save_name: optional. The name used for saving the image ('.png' is appended)
            default: "Default"
    Output:
        No output. The Matplotlib figure is not shown, so the caller can still add changes before plt.show().
    Side effects:
        Changes global plt.rcParams (tick placement, tick colors, sans-serif font).
    '''
    # Encode on a private copy so the caller's DataFrame keeps its category labels.
    data = df.copy()

    # Map every category to an integer position: x ascending, y descending
    # (so the first category in sort order gets the highest y position).
    colx_codes = {label: code for code, label in enumerate(data[colx].sort_values().unique())}
    coly_codes = {label: code for code, label in enumerate(data[coly].sort_values(ascending=False).unique())}

    # Apply the encoding
    data[colx] = data[colx].apply(lambda x: colx_codes[x])
    data[coly] = data[coly].apply(lambda y: coly_codes[y])

    # Prepare the aspect of the plot
    plt.rcParams['xtick.bottom'] = plt.rcParams['xtick.labelbottom'] = False
    plt.rcParams['xtick.top'] = plt.rcParams['xtick.labeltop'] = True
    plt.rcParams['font.sans-serif']=font
    plt.rcParams['xtick.color']=color[-1]
    plt.rcParams['ytick.color']=color[-1]
    plt.box(False)

    # Plot all the lines for the background, alternating between the first two colors
    for num in range(len(coly_codes)):
        plt.hlines(num,-1,len(colx_codes)+1,linestyle='dashed',linewidth=1,color=color[num%2],alpha=0.5)
    for num in range(len(colx_codes)):
        plt.vlines(num,-1,len(coly_codes)+1,linestyle='dashed',linewidth=1,color=color[num%2],alpha=0.5)

    # Plot the scatter plot with the numbers
    plt.scatter(data[colx],
                data[coly],
                s=data[cols]*ratio,
                zorder=2,
                color=color[-1])

    # Annotate every plotted pair exactly once. Iterating over the rows (instead of the
    # cross product of both columns) avoids failing on pairs that do not exist in the data.
    # For duplicated pairs the first row wins.
    labelled = data.drop_duplicates(subset=[colx, coly])
    for x, y, percentage in zip(labelled[colx], labelled[coly], labelled['%count']):
        plt.annotate(str(percentage)+'%', (x+.1, y+.1), size=30)

    # Change the ticks numbers to categories and limit them
    plt.xticks(ticks=list(colx_codes.values()),labels=list(colx_codes.keys()),rotation=90)
    plt.yticks(ticks=list(coly_codes.values()),labels=list(coly_codes.keys()))
    plt.xlim(xmin=-1,xmax=len(colx_codes))
    plt.ylim(ymin=-1,ymax=len(coly_codes))

    # Honour the documented save/save_name options.
    if save:
        plt.savefig(save_name+'.png')
    
   
    

In [None]:
class survey_clss:
    """Aggregation helpers for multi-select survey questions stored in a Spark DataFrame.

    Multi-select questions are spread over several columns (one per answer option).
    The methods below count the non-null answers of each such column and union the
    per-column results into a single Spark DataFrame.
    """

    # Column holding the respondent's job title, used as the grouping key in role_surveyattr.
    _ROLE_COLUMN = 'Select the title most similar to your current role (or most recent title if retired): - Selected Choice'

    def __init__(self,df):
        """Store the survey DataFrame and pre-select the programming-language columns.

        Args:
            df: Spark DataFrame with one column per survey question / answer option.
        """
        self.df=df
        # Columns belonging to the "programming languages" multi-select question.
        self.z=[name for name in df.columns if 'What programming languages do you use on a regular basis? (Select all that apply)' in name]
        # The list attributes below are kept so existing code reading them keeps working;
        # the methods of this class use local accumulators instead.
        self.output_dfs=[]
        self.output_role_dfs=[]
        self.ide_out_df=[]
        self.lst=[]

    def survey_colStats_func(self,col_lst,aliasName):
        """Count the non-null answers of every column in col_lst.

        Args:
            col_lst: names of the columns that make up one multi-select question.
            aliasName: name given to the answer column in the result.

        Returns:
            Spark DataFrame with columns (aliasName, 'recCOUNT', '%CNT'), where '%CNT' is
            recCOUNT as a percentage of all rows in self.df, rounded to 2 decimals.

        Raises:
            TypeError: if col_lst is empty (functools.reduce has nothing to reduce).
        """
        # One Spark action for the denominator instead of one per column.
        total_rows = self.df.count()

        # Local accumulator: every call starts from an empty list, so results of
        # earlier calls can never leak into this one.
        per_column = []
        for c in col_lst:
            counts = self.df.filter(col(c).isNotNull())\
                            .groupBy(col(c).alias(aliasName))\
                            .agg(count('*').alias('recCOUNT'))
            counts = counts.withColumn('%CNT',f.round((counts['recCOUNT']/total_rows)*100,2))
            per_column.append(counts)
        return functools.reduce(DataFrame.union, per_column)

    def role_surveyattr(self,col_lst,aliasName):
        """Count the non-null answers of every column in col_lst, broken down by job title.

        Args:
            col_lst: names of the columns that make up one multi-select question.
            aliasName: name given to the answer column in the result.

        Returns:
            Spark DataFrame with columns ('currentRole', aliasName, 'recCOUNT', '%cnt'),
            ordered by currentRole descending. '%cnt' is recCOUNT as a percentage of the
            number of rows with that role, rounded to 2 decimals.

        Raises:
            TypeError: if col_lst is empty (functools.reduce has nothing to reduce).
        """
        # Number of respondents per role: the denominator for '%cnt'.
        role_df=self.df.groupBy(col(self._ROLE_COLUMN).alias('currentRole'))\
               .agg(count('*').alias('totalRoleCOUNT'))

        # Local accumulator, so a failure part-way through cannot leave stale
        # entries behind for the next call. No per-column sort is needed because
        # the final result is ordered after the union and join.
        per_column = []
        for c in col_lst:
            counts = self.df.filter(col(c).isNotNull())\
                            .groupBy(col(self._ROLE_COLUMN).alias('currentRole'),col(c).alias(aliasName))\
                            .agg(count('*').alias('recCOUNT'))
            per_column.append(counts)

        df_output = functools.reduce(DataFrame.union, per_column)
        df_output=df_output.join(role_df,role_df.currentRole==df_output.currentRole)\
          .select(df_output.currentRole,df_output[aliasName],df_output.recCOUNT,role_df.totalRoleCOUNT)\
          .withColumn("%cnt",f.round((col('recCOUNT')/col('totalRoleCOUNT'))*100,2))\
          .select(col('currentRole'),col(aliasName),col('recCOUNT'),col('%cnt'))\
          .orderBy('currentRole',ascending=False)
        return df_output

# <div class="subsection_title">1.2 Start Spark session</div>
Before executing any Spark command, a SparkSession needs to be created; it is used to execute all Spark code in this notebook. SparkSession is the entry point to Spark SQL and is one of the very first objects a user creates when developing a Spark SQL application. Once created, a SparkSession allows creating DataFrames and accessing the Spark SQL services.

In [None]:
# Get (or create) a session whose master is "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# <div class="subsection_title">1.3 Load Data in Spark dataframe</div>

In [None]:
# The first two rows of the CSV both contain column names. The second row has
# the more readable headers, so the first row is dropped and the second row is
# used as the header when building the Spark DataFrame `survey`.

# Read the file as an RDD of text lines and remember the first line.
rdd = spark.sparkContext.textFile("../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv")
rdd_header = rdd.first()

# Keep every line that differs from that first line.
rdd_minus_1strow = rdd.filter(lambda line: line != rdd_header)

# Parse the remaining lines as CSV; the first remaining line becomes the header.
survey = (
    spark.read
    .option("multiline", True)
    .option("escape", "\"")
    .option("inferschema", True)
    .csv(rdd_minus_1strow, header=True)
)


In [None]:
# Replace '.' with '_' in every column name; names containing '.' otherwise
# make Spark fail with invalid-column-name errors.
renamed_columns = [name.replace('.', '_') for name in survey.columns]
survey = survey.toDF(*renamed_columns)


# <div class="section_title">2. Metadata about data (Data Overview)</div>

# <div class="subsection_title">2.1 View Data</div>

In [None]:
spark.conf.set(
    "spark.sql.debug.maxToStringFields",
    100,
)

# Take the first 10 rows and convert them to a pandas DataFrame for display.
survey.limit(10).toPandas()

# <div class="subsection_title">2.2 Record count/stats</div>

In [None]:
# Schema of the one-row summary: file name, record count, number of columns.
metaschema = StructType([
    StructField("fileName", StringType(), True),
    StructField("recordCount", IntegerType(), True),
    StructField("noofColumns", IntegerType(), True),
])

# Single summary row built from the loaded survey DataFrame.
metadata = [
    ("kaggle_survey_2021_responses.csv", survey.count(), len(survey.columns)),
]
df = spark.createDataFrame(data=metadata, schema=metaschema)
df.show(truncate=False)

# <div class="subsection_title">2.3 Check for Null values</div>

In [None]:
# One aggregate per column: the number of rows whose value is NaN or NULL.
null_count_exprs = [
    count(when(f.isnan(c) | col(c).isNull(), True)).alias(c)
    for c in survey.columns
]

# The result is a single row, so it is cheap to bring to pandas for display.
df = survey.select(null_count_exprs).toPandas()
df

# <div class="section_title">3. Demography Data analysis</div>

# <div class="subsection_title">3.1 Country distribution</div>

In [None]:
from wordcloud import WordCloud, STOPWORDS
stopwords = set(STOPWORDS)

# Collect the country answers, split on commas, and join them into the single
# comma-separated string the word cloud is generated from.
d=survey.select(col("In which country do you currently reside?")).rdd.flatMap(lambda x:x[0].split(',')).collect()
text=",".join(d)

word_cloud = WordCloud(stopwords = stopwords,max_words=100, collocations = False, repeat = False, background_color = '#FEFBF3', width = 5000, height = 3000)
word_cloud.generate(text)

# Two stacked panels: word cloud on top, bar chart of the top countries below.
fig,ax=plt.subplots(2,1,figsize=(15, 10))
ax[0].imshow(word_cloud, interpolation='bilinear')
ax[0].axis("off")

# Users per country. Persisted because later cells join against this DataFrame.
userperCntry_df=survey.groupby(col("In which country do you currently reside?")).agg(count("*").alias('NoOfUsers')).persist()

# Share of all respondents per country (in %), keeping the 12 largest.
userperCntry=userperCntry_df.withColumn('per%',func.round((userperCntry_df.NoOfUsers/survey.count())*100,2))\
.orderBy(col('per%'),ascending=False).limit(12).toPandas()

sns.barplot(x=userperCntry["In which country do you currently reside?"],y=userperCntry['per%'],ax=ax[1],palette=cust_color)
ax[1].set_facecolor('#FEFBF3')
# Hide the right and top spines
ax[1].spines['right'].set_visible(False)
ax[1].spines['top'].set_visible(False)

ax[1].grid(which='major', axis='y',  color='#EEEEEE', lw=0.4)
ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation=90, ha='right')

#plot title
plt.rcParams['figure.titlesize']=14
plt.rcParams['figure.titleweight']='bold'
fig.suptitle('Kaggle developers by Country', fontdict={'fontsize': plt.rcParams['figure.titlesize'],
                                          'fontweight' : plt.rcParams['figure.titleweight'],
                                          'verticalalignment': 'baseline',
                                          'horizontalalignment': 'left'})

#Labels
ax[1].set_xlabel('Countries', labelpad=7,weight='semibold')
# The bars show the 'per%' column, so the axis is labelled as a percentage.
ax[1].set_ylabel('Users (%)', labelpad=7,weight='semibold')

# Data label: print each bar's percentage slightly above the bar.
for i in ax[1].patches:
    userCnt=i.get_height()
    val=f'{userCnt:.1f}%'
    x = i.get_x()+ i.get_width() / 2 
    y = i.get_y()+i.get_height()+0.5
    ax[1].text(x, y, val, ha='center', va='center', fontsize=7, weight='semibold',color='red') 

plt.show()

**Few points :**
* Kaggle's popularity among data professionals spreads across geographical boundaries. The word cloud above indicates that Kagglers are located in many countries and use Kaggle to practice different data skill sets.

* India followed by USA has largest population of Kagglers.
    
 
# Let's plot the geographical distribution on a world map.





In [None]:
# Users per country with short column names, largest country first.
df_pd_cntry = (
    survey.groupBy("In which country do you currently reside?")
    .count()
    .withColumnRenamed('count', 'NoOfUsers')
    .withColumnRenamed("In which country do you currently reside?", "cntry_name")
    .orderBy(col('NoOfUsers').desc())
    .toPandas()
)

# World map coloured by the number of users; countries are matched by name.
fig = go.Figure(
    data=go.Choropleth(
        locations=df_pd_cntry['cntry_name'],
        z=df_pd_cntry['NoOfUsers'],
        text=df_pd_cntry['cntry_name'],
        autocolorscale=True,
        locationmode='country names',
        reversescale=True,
        marker_line_color='darkgray',
        marker_line_width=0.5,
        colorbar_title='# of Kagglers',
    )
)

# update_layout returns the figure; as the last expression of the cell it is
# what makes the map render.
fig.update_layout(
    title_text='Kagglers around the world',
    geo={
        'showframe': False,
        'showcoastlines': False,
        'projection_type': 'equirectangular',
    },
    annotations=[
        {
            'x': 0.55,
            'y': 0.1,
            'xref': 'paper',
            'yref': 'paper',
            'showarrow': False,
        }
    ],
)


* Overall, India  has most number of Kagglers.
* USA is leading in the North America region. In South America, Brazil has the biggest community, followed by Colombia.
* The Kaggler community in Europe is uniformly distributed.

# <div class="subsection_title">3.2 Age</div>

In [None]:
# Users per age group (largest group first) and each group's share in percent.
age_grp = (
    survey.groupBy('What is your age (# years)?')
    .agg(count("*").alias('NoOfUsers'))
    .orderBy('NoOfUsers', ascending=False)
    .toPandas()
)
age_grp['per%'] = (age_grp.NoOfUsers / age_grp.NoOfUsers.sum()).mul(100).round(2)

# Left panel: bar chart of counts. Right panel: pie chart of shares.
fig, ax = plt.subplots(1, 2, figsize=(14, 8), facecolor='#FEFBF3', constrained_layout=True)

sns.barplot(
    x=age_grp['What is your age (# years)?'],
    y=age_grp['NoOfUsers'],
    ax=ax[0],
    palette=cust_color,
)
ax[0].set_facecolor('#FEFBF3')
# Hide the right and top spines
for side in ('right', 'top'):
    ax[0].spines[side].set_visible(False)

ax[0].grid(which='major', axis='y', color='#EEEEEE', lw=0.4)

# Figure title
plt.rcParams['figure.titlesize'] = 14
plt.rcParams['figure.titleweight'] = 'bold'
fig.suptitle('Kaggle developers by Age', fontdict={'fontsize': plt.rcParams['figure.titlesize'],
                                          'fontweight' : plt.rcParams['figure.titleweight'],
                                          'verticalalignment': 'baseline',
                                          'horizontalalignment': 'left'})

# Axis labels
ax[0].set_xlabel('What is your age (# years)?', labelpad=7, weight='semibold')
ax[0].set_ylabel('NoOfUsers', labelpad=7, weight='semibold')

# Eleven wedges: the seven largest are pulled out slightly, the four smallest more.
pie_explode = (0.03,) * 7 + (0.1,) + (0.2,) * 3
ax[1].pie(
    age_grp['per%'],
    labels=age_grp['What is your age (# years)?'],
    colors=cust_color,
    autopct='%1.1f%%',
    explode=pie_explode,
)

plt.show()

1. Most Kagglers are less than 35 years old, and approx. 37% are below 24. This indicates that most Kagglers are either students or have just started their professional careers.

2. As age increases, people seem to become less active on Kaggle and in some cases leave the platform.

3. On the other side of the chart, interestingly, almost 3% of Kagglers are above the age of 60. My guess is that people in this age group are either researchers or became familiar with data science in the later part of their careers.

# <div class="subsection_title">3.3 Gender</div>

In [None]:
plt.rcParams['axes.titlesize'] = 15
plt.rcParams['axes.titleweight'] = 'bold'

# Font settings shared by both panel titles (read from the rcParams set above).
title_font = {'fontsize': plt.rcParams['axes.titlesize'],
              'fontweight' : plt.rcParams['axes.titleweight'],
              'verticalalignment': 'baseline',
              'horizontalalignment': 'left'}

# ---- Panel 1: gender pie chart ----
# Users per gender, largest group first.
gender_df = (
    survey.select(col('What is your gender? - Selected Choice'))
    .groupBy("What is your gender? - Selected Choice")
    .agg(count("*").alias('NoOfUsers'))
    .orderBy(col('NoOfUsers'), ascending=False)
    .toPandas()
)

fig = plt.figure(figsize=(14, 18), facecolor='#FEFBF3', constrained_layout=True)
gs = fig.add_gridspec(2, 1)
ax = fig.add_subplot(gs[0, :])
ax.set_facecolor('#FEFBF3')

ax.pie(
    gender_df['NoOfUsers'],
    labels=gender_df['What is your gender? - Selected Choice'],
    colors=cust_color,
    autopct='%1.1f%%',
    explode=(0.03, 0.03, 0.03, 0.2, 0.4),
)
ax.set_title('Gender distribution of Kagglers', loc='center', fontdict=title_font)

# ---- Panel 2: age groups split by gender (Man / Woman only) ----
genlist = ['Man', 'Woman']
age_df = (
    survey.filter(col('What is your gender? - Selected Choice').isin(genlist))
    .groupBy("What is your gender? - Selected Choice", "What is your age (# years)?")
    .agg(count("*").alias('NoOfUsers'))
    .orderBy('What is your gender? - Selected Choice', 'What is your age (# years)?')
)

# Total number of Man/Woman respondents: the denominator for the percentages.
sumt = age_df.agg({'NoOfUsers': 'sum'}).collect()[0]
t = sumt['sum(NoOfUsers)']
age_df = age_df.withColumn('%per', f.round(((col('NoOfUsers') / t) * 100), 2))

age_pd_df = age_df.toPandas()

ax0 = fig.add_subplot(gs[1, :])
ax0.set_facecolor('#FEFBF3')
# Hide the right and top spines
for side in ('right', 'top'):
    ax0.spines[side].set_visible(False)
ax0 = sns.barplot(
    ax=ax0,
    data=age_pd_df,
    x='What is your age (# years)?',
    y='NoOfUsers',
    hue='What is your gender? - Selected Choice',
    linewidth=0,
    saturation=1,
)

# Shade the region from x=-0.5 to x=2.5 (the first three bar groups) up to y=4000.
ax0.fill_between(np.array([-0.5, 2.5]), 4000, color='#90AACB', alpha=0.3, linewidth=1, linestyle='--')
ax0.grid(which='major', axis='y', color='#EEEEEE', lw=0.4)

ax0.set_title('Age by Gender', loc='center', fontdict=title_font)

ax0.legend(loc="upper right", prop={'size': 10}, frameon=False, ncol=5, title_fontsize=10, title='Gender')

# Data labels: each bar's share of all Man/Woman respondents, placed above the bar.
for i in ax0.patches:
    userCnt = i.get_height()
    val = f'{round(userCnt / t * 100, 2):.1f}%'
    x = i.get_x() + i.get_width() / 2
    y = i.get_y() + userCnt + 75
    ax0.text(x, y, val, ha='center', va='center', fontsize=7, weight='semibold', color='red')

ax0.tick_params(axis='both', which='major', labelsize=8, width=0.3, length=1)

# Axis labels
ax0.set_xlabel('What is your age (# years)?', labelpad=10, weight='semibold')
ax0.set_ylabel('NoOfUsers', labelpad=8, weight='semibold')

plt.show()

# <div class="subsection_title">3.4 Gender distribution per country</div>

In [None]:
# Top countries from the country-distribution cell, without the 'Other' bucket.
z = [
    name
    for name in list(userperCntry['In which country do you currently reside?'])
    if name != 'Other'
]

cust_color1 = ['#7ec2f7', '#a5afb6']

# Man/Woman counts per selected country.
cntryGender = (
    survey.filter(col("In which country do you currently reside?").isin(z))
    .filter(col('What is your gender? - Selected Choice').isin(genlist))
    .groupBy("In which country do you currently reside?", "What is your gender? - Selected Choice")
    .agg(count("*").alias('userCnt'))
    .orderBy("In which country do you currently reside?", "What is your gender? - Selected Choice")
)

# Total Man+Woman count per country: the denominator for the percentages.
cntryUserCNT = (
    cntryGender.groupBy("In which country do you currently reside?")
    .agg(func.sum('userCnt').alias('sumcnt'))
)

same_country = (
    cntryGender["In which country do you currently reside?"]
    == cntryUserCNT["In which country do you currently reside?"]
)
c = (
    cntryGender.join(cntryUserCNT, same_country, "inner")
    .select(cntryGender["*"], cntryUserCNT['sumcnt'])
    .orderBy(col('sumcnt'), col("In which country do you currently reside?"), ascending=False)
)

# Gender share within each country (in %), then short column names for plotting.
c = c.withColumn('%', func.round((c.userCnt / c.sumcnt) * 100, 2)).toPandas()
c.rename(
    columns={
        'In which country do you currently reside?': 'cntryName',
        'What is your gender? - Selected Choice': 'gender',
    },
    inplace=True,
)

# One small bar chart per country.
g = sns.FacetGrid(c, col="cntryName", col_wrap=4, height=3.5, aspect=0.8)
g.figure.subplots_adjust(wspace=.8)
g.map(sns.barplot, "gender", "userCnt", order=['Man', 'Woman'], palette=cust_color1)
g.set_titles(col_template='{col_name}')
sns.set_style("darkgrid", {"axes.facecolor": ".9"})

for ax in g.axes.flat:
    # Label each bar with its percentage. The row is found via the facet title
    # (country) and the bar height (user count), so this must run before the
    # title is shortened below.
    for i in ax.patches:
        is_this_bar = (c.cntryName == ax.get_title()) & (c.userCnt == int(i.get_height()))
        val = c[is_this_bar]['%'].reset_index(drop=True)[0]
        x = i.get_x() + i.get_width() / 2
        y = i.get_y() + i.get_height() + 95
        ax.text(x, y, val, ha='center', va='center', fontsize=9, weight='semibold', color='red')

    if ax.get_title() == 'United Kingdom of Great Britain and Northern Ireland':
        ax.set_title("United Kingdom")
    ax.set_ylabel("UserCnt", fontsize=9, weight='semibold')
    ax.set_xlabel("Gender", fontsize=9, weight='semibold')
    ax.spines['left'].set_linewidth(0.5)
    ax.spines['left'].set_color('black')
    ax.spines['bottom'].set_linewidth(0.5)
    ax.spines['bottom'].set_color('black')

plt.show()

1. Women Kagglers in countries like Japan and China are less active compared to countries like India and USA.

2. India, USA, UK and Egypt are countries where women's contribution, compared to the overall active Kaggler population from that country, is higher than 20%.

# <div class="subsection_title">3.5 Education</div>

In [None]:
# Users per education level with their share of all respondents (in %), largest first.
df_onlyeducation = (
    survey.select(
        col('What is the highest level of formal education that you have attained or plan to attain within the next 2 years?').alias('Education_level')
    )
    .groupBy(col('Education_level'))
    .agg(count("*").alias('userCnt'))
    .withColumn('%userCNT', f.round((col('userCnt') / survey.count()) * 100, 2))
    .orderBy(col('userCnt'), ascending=False)
    .toPandas()
)

headerColor = 'grey'
rowEvenColor = 'lightgrey'
rowOddColor = 'white'

# Seven-row odd/even stripe pattern, repeated five times, applied as the cell fill.
row_stripes = ([rowOddColor, rowEvenColor] * 3 + [rowOddColor]) * 5

table_header = dict(
    values=list(df_onlyeducation.columns),
    line_color='darkslategray',
    fill_color=headerColor,
    align=['left', 'center'],
    font=dict(color='white', size=12),
)
table_cells = dict(
    # go.Table expects one list per column, hence the transpose.
    values=df_onlyeducation.T.values.tolist(),
    line_color='darkslategray',
    fill_color=[row_stripes],
    align=['left', 'center'],
    font=dict(color='darkslategray', size=11),
)

fig = go.Figure(data=[go.Table(header=table_header, cells=table_cells)])

fig.show()

# <div class="subsection_title">3.6 Education level per Country</div>

In [None]:
# Users per (country, education level).
df_education = (
    survey.select(
        col("In which country do you currently reside?").alias('countryName'),
        col('What is the highest level of formal education that you have attained or plan to attain within the next 2 years?').alias('Education_level'),
    )
    .groupBy(col('countryName'), col('Education_level'))
    .agg(count("*").alias('userCnt'))
    .orderBy(col('userCnt'), ascending=False)
)

# Join with the per-country totals and turn the counts into a percentage of
# each country's respondents.
df_education = (
    df_education.join(
        userperCntry_df,
        df_education['countryName'] == userperCntry_df['In which country do you currently reside?'],
    )
    .select(col('countryName'), col('Education_level'), col('userCnt'), col('NoOfUsers').alias('totalUserCntPerCntry'))
    .withColumn('educationLvl%', f.round((col('userCnt') / col('totalUserCntPerCntry')) * 100, 2))
    .select(col('countryName'), col('Education_level'), 'educationLvl%')
    .orderBy(col('countryName'), col('educationLvl%'), ascending=False)
)

# Pivot to one row per country and one column per education level; missing
# combinations become 0.
df_education_pd = (
    df_education.groupBy('countryName')
    .pivot('Education_level')
    .sum('educationLvl%')
    .toPandas()
)
df_education_pd = df_education_pd.set_index('countryName').fillna(0)

# Shorten the two longest column labels; all other labels stay as they are.
df_education_pd = df_education_pd.rename(columns={
    'No formal education past high school': 'Only high school',
    'Some college/university study without earning a bachelor’s degree': 'College-No Degree',
})

fig, ax = plt.subplots(figsize=(14, 30))
sns.heatmap(
    df_education_pd,
    xticklabels=True,
    yticklabels=True,
    cmap='YlOrBr',
    annot=True,
    linewidths=0.005,
    linecolor='black',
    annot_kws={"fontsize": 12},
    fmt='.2f',
    cbar=False,
)
plt.yticks(fontsize=9)
ax.xaxis.set_ticks_position('top')
plt.show()

# <div class="subsection_title">3.7 Education level vs Age group</div>

In [None]:
# Users per (age group, education level).
edu_age_df = (
    survey.groupBy(
        col('What is your age (# years)?'),
        col('What is the highest level of formal education that you have attained or plan to attain within the next 2 years?').alias("Education_LVL"),
    )
    .agg(count("*").alias('educationLVLbyAge'))
)

# Users per age group: the denominator for the percentages.
age_grp_df = (
    survey.groupBy('What is your age (# years)?')
    .agg(count("*").alias('NoOfUsers'))
    .orderBy('NoOfUsers', ascending=False)
)

# Share of each education level within its age group (in %).
edu_age_df = (
    edu_age_df.join(
        age_grp_df,
        edu_age_df['What is your age (# years)?'] == age_grp_df['What is your age (# years)?'],
    )
    .select(edu_age_df['What is your age (# years)?'], col("Education_LVL"), col('educationLVLbyAge'), col('NoOfUsers'))
    .withColumn("%userCnt", f.round((col('educationLVLbyAge') / col('NoOfUsers')) * 100, 2))
    .select(col('What is your age (# years)?'), col("Education_LVL"), col("%userCnt"))
    .orderBy(col('What is your age (# years)?'))
)

# Pivot to one row per age group and one column per education level; missing
# combinations become 0.
edu_age_df_pd = (
    edu_age_df.groupBy('What is your age (# years)?')
    .pivot("Education_LVL")
    .sum("%userCnt")
    .toPandas()
    .set_index('What is your age (# years)?')
    .fillna(0)
)

# Shorten the two longest column labels; all other labels stay as they are.
edu_age_df_pd = edu_age_df_pd.rename(columns={
    'No formal education past high school': 'Only high school',
    'Some college/university study without earning a bachelor’s degree': 'College-No Degree',
})

fig, ax = plt.subplots(figsize=(14, 16))
sns.heatmap(
    edu_age_df_pd,
    xticklabels=True,
    yticklabels=True,
    cmap='YlOrBr',
    annot=True,
    linewidths=0.005,
    linecolor='black',
    annot_kws={"fontsize": 10},
    fmt='.2f',
    cbar=False,
)

plt.yticks(fontsize=9)
ax.xaxis.set_ticks_position('top')
plt.show()

# <div class="subsection_title">3.8 Where Kagglers are learning ML</div>

In [None]:
# All columns that belong to the "data science courses" multi-select question.
datascience_cols = [
    name
    for name in survey.columns
    if 'On which platforms have you begun or completed data science courses?' in name
]

# Count the answers per platform and sort by count, largest first.
survey_obj = survey_clss(survey)
datascience_df = survey_obj.survey_colStats_func(datascience_cols, 'datascienceCourses').toPandas()
datascience_df = datascience_df.sort_values(by=['recCOUNT'], ascending=False)

# Horizontal bars: one per platform, labelled with the '%CNT' value.
trace2 = go.Bar(
    y=datascience_df['datascienceCourses'],
    x=datascience_df['recCOUNT'],
    orientation="h",
    name="Professionals",
    text=datascience_df['%CNT'],
)

large_title_format = "<span style='font-size:32px; font-family:Times New Roman'> Where Kagglers are learning ML?</span>"
small_title_format = "<span style='font-size:16px; font-family:Tahoma'>   Only 19% Kagglers have taken university degree </span>"

layout = {
    'title': large_title_format + "<br>" + small_title_format,
    'margin': {'t': 150},
    'width': 1000,
    'height': 600,
    'legend': {
        'orientation': "h",
        'yanchor': 'top',
        'xanchor': 'center',
        'y': 1.06,
        'x': 0.5,
        'font': {'size': 16},
    },
    'plot_bgcolor': "#f4f0ea",
}

data = [trace2]
main_annot_format = "<span style='font-size:12px; font-family:Tahoma;'><b> %s </b><br> %s</span>"
fig = go.Figure(data=data, layout=layout)

# Free-floating note positioned in paper coordinates.
annotation_text = main_annot_format % (
    "ML Courses",
    """Coursera and Kaggle are two mediums <br>where kagglers are learning ML. """,
)
fig.add_annotation(dict(
    x=0.68,
    y=0.68,
    xref="paper",
    yref="paper",
    text=annotation_text,
))

fig.show()

1. Coursera, Kaggle Learn and Udemy are the three most used platforms to learn ML.
2. Only 18.8% of Kagglers have taken university courses.

# <div class="subsection_title">3.9 Job Title</div>

In [None]:
# Share of respondents per job title (in %), largest first.
role_df_pd = (
    survey.groupBy(
        col('Select the title most similar to your current role (or most recent title if retired): - Selected Choice').alias('currentRole')
    )
    .agg(count("*").alias("noOfUsers"))
    .withColumn("%userCount", f.round((col('noOfUsers') / survey.count()) * 100, 2))
    .select(col('currentRole'), col('%userCount'))
    .orderBy(col('%userCount'), ascending=False)
    .toPandas()
)

fig, ax = plt.subplots(figsize=(14, 6), facecolor='#FEFBF3', constrained_layout=True)
sns.scatterplot(
    data=role_df_pd,
    x=role_df_pd["currentRole"],
    y=role_df_pd["%userCount"],
    ax=ax,
    s=550,
    zorder=3,
    linewidth=0.3,
    alpha=0.7,
)

# Write each role's percentage inside its marker.
for role, pct in zip(role_df_pd['currentRole'], role_df_pd['%userCount']):
    ax.annotate(f'{pct:.1f}%', (role, pct),
                ha='center', va='center', fontsize=6, color='white', weight='semibold')

# Rotate the role names so they do not overlap.
for label in ax.get_xticklabels():
    label.set_rotation(90)

# Current y-axis tick positions
ticks = ax.get_yticks()
# Format the tick values as strings ending with a percent sign
new_labels = [f'{val:.1f}%' for val in ticks]
# Pin the tick positions, then set the new labels
ax.set_yticks(ticks.tolist())
ax.set_yticklabels(new_labels)

# Axis labels
ax.set_xlabel('Current Role', labelpad=10, weight='semibold')
ax.set_ylabel('User count percentage', labelpad=8, weight='semibold')

# Title: the survey question, placed just above the top-left corner of the axes.
x0, x1 = ax.get_xlim()
y0, y1 = ax.get_ylim()
ax.text(x0, y1+2.5, 'Select the title most similar to your current role (or most recent title if retired)', fontsize=8, ha='left', va='top')

plt.show()

1. As expected, student and data scientist are the two most common roles among Kagglers.

2. ML engineer, research scientist and statistician are also roles that require ML concepts in daily work. Other roles, such as data analyst, probably don't require ML concepts day to day; my guess is that these folks are on the Kaggle platform because of their own interest in the ML field.

# <div class="subsection_title">3.10 Job Title vs Gender</div>

In [None]:
# Respondent counts per (gender, job title), limited to the genders listed in genlist.
genlist = ['Man', 'Woman']

gender_role_pd_df = (
    survey
    .filter(col('What is your gender? - Selected Choice').isin(genlist))
    .groupBy(
        col('What is your gender? - Selected Choice'),
        col('Select the title most similar to your current role (or most recent title if retired): - Selected Choice').alias('currentRole'),
    )
    .agg(count("*").alias("userCount"))
    # Both sort keys are descending.
    .orderBy(col('currentRole'), col('userCount'), ascending=False)
    .toPandas()
)

# One bar per role, coloured by gender.
fig = px.bar(
    gender_role_pd_df,
    x="currentRole",
    y="userCount",
    color="What is your gender? - Selected Choice",
    title="Gender vs Job profile",
    width=1000,
)
fig.show()

1. No Females are working as DBAs. I am not sure why?

# <div class="subsection_title">3.11 Job Title vs Education</div>

In [None]:
# Respondents per job title: the denominator for the per-role education shares.
df_jobtitle_d=survey.select(col('Select the title most similar to your current role (or most recent title if retired): - Selected Choice').alias('currentTitle'))\
.groupBy(col('currentTitle'))\
.agg(count("*").alias('totaluserCnt_title'))

# Respondents per (job title, education level) pair.
df_role_edu=survey.select(col('Select the title most similar to your current role (or most recent title if retired): - Selected Choice').alias('currentTitle'),
          col('What is the highest level of formal education that you have attained or plan to attain within the next 2 years?').alias('educationLvl'))\
.groupBy(col('currentTitle'),col('educationLvl'))\
.agg(count('*').alias('recCount'))

# Attach the per-title totals and turn each pair count into a share of its title.
# Despite the '%' in its name, %recCountPerRole is a fraction (recCount / total for the title);
# the percent formatting is applied by the x-axis tick format below.
# The columns are selected through df_role_edu.<name> because both join inputs carry a currentTitle column.
df_role_edu=df_role_edu.join(df_jobtitle_d,df_jobtitle_d.currentTitle==df_role_edu.currentTitle,'inner')\
.withColumn('%recCountPerRole',(col('recCount')/col('totaluserCnt_title')))\
.select(df_role_edu.currentTitle,df_role_edu.educationLvl,col('%recCountPerRole'))\
.orderBy(col('currentTitle'),col('educationLvl'),col('%recCountPerRole'),ascending=False)\
.toPandas()

# Horizontal stacked bars: one bar per job title, one segment per education level.
fig = px.bar(df_role_edu, y="currentTitle", x="%recCountPerRole", color="educationLvl", barmode = "stack"
             ,width = 900,height = 800,orientation='h')

# Horizontal legend placed above the plot area; axis titles are blanked out.
fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.2,
    xanchor="left",
    x=0.01,
    title=''
),
xaxis_title="",
yaxis_title="",
xaxis_tickformat = '0.0%'
                 )

fig.show()

Master's and Bachelor's degrees are the most common education levels for the majority of job profiles. Research Scientists, however, are an exception: for them a <b>doctoral degree</b> is a common education level.

# <div class="section_title">4. ML/Coding profile data</div>

# <div class="subsection_title">4.1 Coding experience</div>

In [None]:
# Respondent counts per coding-experience bucket, largest bucket first.
coding_df = (
    survey
    .groupBy(col('For how many years have you been writing code and/or programming?'))
    .agg(count("*").alias("yearCount"))
    .orderBy(col("yearCount"), ascending=False)
)
coding_df_pd = coding_df.toPandas()

# Total number of survey rows: the denominator for the percentage shown in each bar label.
totalrecCNT = survey.count()

# The grid has two rows but only the first one receives an axes.
fig = plt.figure(figsize=(12, 10), constrained_layout=True)
spec = fig.add_gridspec(ncols=1, nrows=2)
ax0 = fig.add_subplot(spec[0, 0])
ax0 = sns.barplot(
    data=coding_df_pd,
    x='For how many years have you been writing code and/or programming?',
    y='yearCount',
    palette=cust_color,
    ax=ax0,
)

# Label every bar with "<count> | <percent of all respondents>", written vertically
# inside a rounded box 1000 y-units above the bar top.
for bar in ax0.patches:
    bar_count = int(bar.get_height())
    label = f"{bar_count} | {round((bar_count/totalrecCNT)*100,2)}"
    ax0.text(
        bar.get_x() + bar.get_width() / 2,
        bar.get_y() + bar.get_height()+1000,
        label,
        ha='center',
        va='center',
        fontsize=9,
        weight='semibold',
        color='red',
        bbox=dict(facecolor='none', edgecolor='black', boxstyle='round'),
        rotation='vertical',
    )

# Axis labels
ax0.set_xlabel('For how many years have you been writing code and/or programming?', labelpad=10, weight='semibold')
ax0.set_ylabel('NoOfUsers', labelpad=8, weight='semibold')

plt.show()

1. 52% of Kagglers have less than 3 years of coding experience, and 22% have less than 1 year. This is expected, since we saw above that students make up the largest share of Kagglers.

# <div class="subsection_title">4.2 Gender vs Coding experience</div>

In [None]:
# Respondent counts per (coding experience, gender) pair.
coding_gender_df=survey.groupBy(col('For how many years have you been writing code and/or programming?'),
                        col('What is your gender? - Selected Choice'))\
      .agg(count("*").alias("yearCount"))\
      .orderBy(col("yearCount"),ascending=False)


# Respondent counts per gender: the denominator for the per-gender percentages.
gender_df=survey.groupBy(col('What is your gender? - Selected Choice'))\
      .agg(count("*").alias("userCount"))\
      .orderBy(col("userCount"),ascending=False)


# Join the two aggregates on gender, keep only 'Man' and 'Woman', and express each
# experience bucket as a percentage of that gender's respondents (%count).
coding_gender_df_pd=coding_gender_df.join(gender_df,gender_df['What is your gender? - Selected Choice']==
                     coding_gender_df['What is your gender? - Selected Choice'])\
                .select(col('For how many years have you been writing code and/or programming?').alias('Years of programing'),
                       coding_gender_df['What is your gender? - Selected Choice'],col("yearCount"),col("userCount"))\
                .filter(col('What is your gender? - Selected Choice').isin (['Man','Woman']))\
                .withColumn("%count",f.round((col('yearCount')/col('userCount'))*100,2))\
                .orderBy(col('What is your gender? - Selected Choice'),col('yearCount'),ascending=False)\
                .toPandas()

# One large marker per (experience bucket, gender); hue separates the genders.
fig, ax = plt.subplots(figsize=(14, 6),facecolor='#FEFBF3',constrained_layout=True)
sns.scatterplot(data=coding_gender_df_pd, x=coding_gender_df_pd["Years of programing"], y=coding_gender_df_pd["%count"],
                hue=coding_gender_df_pd['What is your gender? - Selected Choice'],ax=ax,s=550,
                zorder=3, linewidth=0.3, alpha=0.7)

# Write each percentage (one decimal) centred on its marker.
for i,val in enumerate(coding_gender_df_pd['%count']):
    val=f'{val:.1f}%'
    ax.annotate(val, (coding_gender_df_pd['Years of programing'][i], coding_gender_df_pd['%count'][i]),
               ha='center', va='center', fontsize=6,color='white',weight='semibold')
    
# getting the array of values of y-axis
ticks = ax.get_yticks()
# format the tick values as strings with one decimal and a trailing percent sign
new_labels = [f'{val:.1f}%' for val in ticks]
# Pin the current tick positions first, then set the new labels on them
ax.set_yticks(ax.get_yticks().tolist())
ax.set_yticklabels(new_labels)

#Labels
ax.set_xlabel('Years of programing', labelpad=10,weight='semibold')
ax.set_ylabel('%count', labelpad=8,weight='semibold')

#title
# Drawn as plain text in data coordinates, 2.5 y-units above the current upper y limit.
x0, x1 = ax.get_xlim()
y0, y1 = ax.get_ylim()
ax.text(x0, y1+2.5, 'Gender vs years of programing', fontsize=8, ha='left', va='top')


plt.show()

1. The plot confirms the general trend that most Kagglers are students, and coding experience follows a similar distribution irrespective of gender.

2. 22% of female Kagglers have less than 1 year of experience; my guess is that most of them are students.

# <div class="subsection_title">4.3 Age vs Coding experience</div>

In [None]:
# Respondent counts for every (coding experience, age group) pair, largest first.
df_pd=survey.groupBy(col('For how many years have you been writing code and/or programming?').alias('Year of coding'),
              col('What is your age (# years)?').alias('Age'))\
      .agg(count('*').alias('reccount'))\
      .orderBy(col('reccount'),ascending=False)\
      .toPandas()

# Attach the total per coding-experience bucket (totalcnt) to every row of that bucket.
df_pd=df_pd.merge(df_pd.groupby(['Year of coding'])['reccount'].agg(['sum']).reset_index().rename(columns={'sum':'totalcnt'})
,on='Year of coding')

# Share of each age group within its coding-experience bucket, in percent with one decimal.
df_pd['%count']=round((df_pd['reccount']/df_pd['totalcnt'])*100,1)
colors=['blue', 'grey', 'green']
# create the plot
# catscatter is defined earlier in the notebook; a copy of the frame is passed in.
# NOTE(review): presumably it draws on the current pyplot figure and may modify its input - confirm.
plt.figure(figsize=(50,20))
catscatter(df_pd.copy() , 'Age', 'Year of coding', 'reccount',annotation='%count', font='Helvetica', color=colors, ratio=3)

plt.xticks(fontsize=40)
plt.yticks(fontsize=40)
#title
# Drawn as plain text in data coordinates, 2 y-units above the current upper y limit.
x0, x1 = plt.xlim()
y0, y1 = plt.ylim()
plt.text(x0, y1+2, 'Kaggle 2021 Survey Respondents by Age and Programming Experience', fontsize=30,
         ha='left', va='top',weight='bold')
plt.show()

# Same data as bars: one bar per age group, coloured by coding experience.
fig = px.bar(
    df_pd, 
    x='Age', 
    y='reccount', 
    color='Year of coding', 
    height=600)
fig.show()

As we can see, the charts above support our intuition about the positive correlation between age and programming experience:

* the majority of young kagglers are relatively inexperienced (having up to 3-5 years of programming experience at maximum)
* more senior people tend to demonstrate more years of programming experience
* In every age group (even in the senior ones), there is always a tiny fraction of people who indicated no programming experience at all

# <div class="subsection_title">4.4 Job title vs Coding Language preference</div>

In [None]:
# All survey columns that belong to the "programming languages" multi-select question.
codelanguage_lst = [
    column_name
    for column_name in survey.columns
    if 'What programming languages do you use on a regular basis? (Select all that apply)' in column_name
]

# survey_clss is defined earlier in the notebook.
# NOTE(review): role_surveyattr presumably aggregates the given columns per job role - confirm against its definition.
programLan_df = survey_clss(survey)
language_df = programLan_df.role_surveyattr(codelanguage_lst, 'Language').toPandas()

# Render the result as a plotly table: header from the frame's column names,
# one cell column each for currentRole, Language, recCOUNT and %cnt.
table_header = dict(
    values=list(language_df.columns),
    fill_color='paleturquoise',
    align='left',
)
table_cells = dict(
    values=[
        language_df['currentRole'],
        language_df['Language'],
        language_df['recCOUNT'],
        language_df['%cnt'],
    ],
    fill_color='lavender',
    align='left',
)
fig = go.Figure(data=[go.Table(header=table_header, cells=table_cells)])

fig.show()

1. Python is clearly the preferred language for EDA/ML work across all professional roles. Statisticians have a slightly higher preference for R than for Python, but for all other roles Python is far ahead of R.

2. SQL is the next most-used language across all job titles.

3. JavaScript is also used by a significant share of Kagglers.

# <div class="subsection_title">4.5 Coding Language preference</div>

In [None]:
# Overall usage per programming language, sorted ascending by recCOUNT.
# Relies on programLan_df and codelanguage_lst created in the previous cell.
survey_lang_df = programLan_df.survey_colStats_func(codelanguage_lst, 'Language').toPandas()
survey_lang_df.sort_values(by=['recCOUNT'], ascending=True, inplace=True)

# Horizontal bars: one per language, with the %CNT column as bar text.
trace2 = go.Bar(
    x=survey_lang_df['recCOUNT'],
    y=survey_lang_df['Language'],
    text=survey_lang_df['%CNT'],
    name="Professionals",
    orientation="h",
)

# Two-line HTML title: a large headline and a smaller subtitle.
large_title_format = "<span style='font-size:32px; font-family:Times New Roman'> Which languages should I learn?</span>"
small_title_format = "<span style='font-size:16px; font-family:Tahoma'>    Python is the most popular language followed by SQL and C++.</span>"

layout = {
    'title': large_title_format + "<br>" + small_title_format,
    'margin': {'t': 150},
    'legend': {
        'orientation': "h",
        'yanchor': 'top',
        'xanchor': 'center',
        'y': 1.06,
        'x': 0.5,
        'font': {'size': 16},
    },
    'plot_bgcolor': "#f4f0ea",
}

data = [trace2]
# Template for the in-plot annotation: bold heading, line break, body text.
main_annot_format = "<span style='font-size:12px; font-family:Tahoma;'><b> %s </b><br> %s</span>"
fig = go.Figure(data=data, layout=layout)

# Annotation positioned in paper coordinates (fractions of the plotting area).
annotation_text = main_annot_format % (
    "Python",
    """Python is favourite with more than 84% of <br>kagglers  using it.  However,in statistician community, <i>\"R\"</i> seems to <br> be more popular language.""",
)
fig.add_annotation(dict(
    x=0.68,
    y=0.68,
    xref="paper",
    yref="paper",
    text=annotation_text,
))

fig.show()

# <div class="subsection_title">4.6 IDEs </div>

In [None]:
# All survey columns that belong to the IDE multi-select question.
ide_lst=[col for col in survey.columns if 'Which of the following integrated development environments' in col]

# Usage per IDE, most used first.
# NOTE(review): survey_colStats_func is defined elsewhere in the notebook; its result is used
# here through the 'IDE', 'recCOUNT' and '%CNT' columns - confirm their exact meaning there.
surveyobj=survey_clss(survey)
ide_df_pd=surveyobj.survey_colStats_func(ide_lst,'IDE').toPandas()
ide_df_pd.sort_values(by=['recCOUNT'], inplace=True, ascending=False)

# Vertical bars: one per IDE, with the %CNT column as bar text.
trace2 = go.Bar(
    x = ide_df_pd['IDE'],
    y = ide_df_pd['recCOUNT'],
    name = "Professionals",
    text=ide_df_pd['%CNT']
)


# Two-line HTML title: a large headline and a smaller subtitle.
large_title_format = "<span style='font-size:32px; font-family:Times New Roman'> What IDEs are used?</span>"
small_title_format = "<span style='font-size:16px; font-family:Tahoma'>    Jupyter notebook is the most popular IDE followed by VScode and Pycharm.</span>"

layout = dict(
   title = large_title_format + "<br>" + small_title_format,
    margin = dict(t=150),
 
    plot_bgcolor = "#f4f0ea"
)


data = [trace2]
fig = go.Figure(data = data,layout=layout)
fig.show()

# <div class="subsection_title">4.7 Visualization Libraries </div>

In [None]:
# Reset matplotlib settings changed by earlier cells.
plt.rcParams.update(plt.rcParamsDefault)

# Usage per visualization library, most used first.
# visualization_col and survey_obj are reused by the next cell (4.8).
visualization_col=[col  for col in survey.columns if 'What data visualization libraries or tools do you use on a regular basis?  (Select all that apply)' in col ]
survey_obj=survey_clss(survey)
visualizationlib_df=survey_obj.survey_colStats_func(visualization_col,'VisulizationTools').toPandas()
visualizationlib_df.sort_values(by=['recCOUNT'], inplace=True, ascending=False)

# Total number of survey rows. Computed once here: survey.count() is a Spark action,
# so calling it inside the label loop would launch one job per bar.
totalrecCNT=survey.count()

fig,ax=plt.subplots(1,1,figsize=(15, 10))
ax=sns.barplot(x=visualizationlib_df["VisulizationTools"],y=visualizationlib_df['recCOUNT'],ax=ax,palette=cust_color)
ax.set_facecolor('#FEFBF3')
# Hide the right and top spines
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

ax.grid(which='major', axis='y',  color='#EEEEEE', lw=0.4)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)

#plot title
plt.rcParams['figure.titlesize']=14
plt.rcParams['figure.titleweight']='bold'
fig.suptitle('Visualization libraries used by Kagglers', fontdict={'fontsize': plt.rcParams['figure.titlesize'],
                                          'fontweight' : plt.rcParams['figure.titleweight'],
                                          'verticalalignment': 'baseline',
                                          'horizontalalignment': 'left'})

#Labels
ax.set_xlabel('Visualization Library', labelpad=7,weight='semibold')
ax.set_ylabel('NoOfUsers', labelpad=7,weight='semibold')

# data label: share of all respondents, written 200 y-units above each bar
for i in ax.patches:
    userCnt=round((i.get_height()/totalrecCNT)*100,2)
    val=f'{userCnt:.1f}%'
    x = i.get_x()+ i.get_width() / 2 
    y = i.get_y()+i.get_height()+200
    ax.text(x, y, val, ha='center', va='center', fontsize=7, weight='semibold',color='red') 

plt.show()
    

1. As Python is the most widely used language, it is not surprising to see Matplotlib and
 Seaborn with the highest usage, followed by ggplot, which is an R library for visualization.

# <div class="subsection_title">4.8 Job Title vs Visualization Libraries </div>

In [None]:

# Role-by-library matrix of the %cnt values: rows are job roles, columns are visualization tools.
# Relies on survey_obj and visualization_col created in the previous cell.
df_pd = (
    survey_obj
    .role_surveyattr(visualization_col, 'VisulizationTools')
    .toPandas()
    .pivot(index='currentRole', columns='VisulizationTools', values='%cnt')
)

# Annotated heatmap of the matrix; every row and column label is shown and the colour bar is hidden.
fig, ax = plt.subplots(figsize=(14, 16))
sns.heatmap(
    df_pd,
    xticklabels=True,
    yticklabels=True,
    cmap="YlGnBu",
    annot=True,
    linewidths=0.005,
    linecolor='black',
    annot_kws={"fontsize": 10},
    fmt='.2f',
    cbar=False,
)
plt.yticks(fontsize=9)
plt.title('Percentage of Kagglers using certain visualization library', fontname='monospace', weight='bold')
plt.xlabel('Visualization Library', fontname='monospace', weight='semibold')
plt.show()

# <div class="subsection_title">4.9 ML Libraries </div>

In [None]:
# Usage per machine learning framework, most used first.
# ml_cols and survey_obj are reused by the next cell (4.10).
ml_cols = [col  for col in survey.columns if 'Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply)' in col ]
survey_obj=survey_clss(survey)
mllib_df=survey_obj.survey_colStats_func(ml_cols,'MLlib').toPandas()
mllib_df.sort_values(by=['recCOUNT'], inplace=True, ascending=False)

# Total number of survey rows. Computed once here: survey.count() is a Spark action,
# so calling it inside the label loop would launch one job per bar.
totalrecCNT=survey.count()

fig,ax=plt.subplots(1,1,figsize=(15, 10))
sns.barplot(x=mllib_df["MLlib"],y=mllib_df['recCOUNT'],ax=ax,palette=cust_color)
ax.set_facecolor('#FEFBF3')
# Hide the right and top spines
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

ax.grid(which='major', axis='y',  color='#EEEEEE', lw=0.4)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha='right')

#plot title
plt.rcParams['figure.titlesize']=14
plt.rcParams['figure.titleweight']='bold'
fig.suptitle('ML library used by Kagglers', fontdict={'fontsize': plt.rcParams['figure.titlesize'],
                                          'fontweight' : plt.rcParams['figure.titleweight'],
                                          'verticalalignment': 'baseline',
                                          'horizontalalignment': 'left'})

#Labels
ax.set_xlabel('ML Library', labelpad=7,weight='semibold')
ax.set_ylabel('NoOfUsers', labelpad=7,weight='semibold')

# data label: share of all respondents, written 200 y-units above each bar
for i in ax.patches:
    userCnt=round((i.get_height()/totalrecCNT)*100,2)
    val=f'{userCnt:.1f}%'
    x = i.get_x()+ i.get_width() / 2 
    y = i.get_y()+i.get_height()+200
    ax.text(x, y, val, ha='center', va='center', fontsize=7, weight='semibold',color='red') 

plt.show()

SkLearn continues to be the top ML framework used by many Kagglers followed by Tensor-flow and Keras. It is also worth noticing that Pytorch, XGBoost, LightGBM and CatBoost also have decent popularity among Kagglers

# <div class="subsection_title">4.10 Job Title vs ML Libraries </div>

In [None]:
# Role-by-framework matrix of the %cnt values: rows are job roles, columns are ML libraries.
# Relies on survey_obj and ml_cols created in the previous cell.
df_pd = (
    survey_obj
    .role_surveyattr(ml_cols, 'MLlib')
    .toPandas()
    .pivot(index='currentRole', columns='MLlib', values='%cnt')
)

# Annotated heatmap of the matrix; every row and column label is shown and the colour bar is hidden.
fig, ax = plt.subplots(figsize=(14, 16))
sns.heatmap(
    df_pd,
    xticklabels=True,
    yticklabels=True,
    cmap="YlGnBu",
    annot=True,
    linewidths=0.005,
    linecolor='black',
    annot_kws={"fontsize": 10},
    fmt='.2f',
    cbar=False,
)
plt.yticks(fontsize=9)
plt.title('Percentage of Kagglers using certain ML library', fontname='monospace', weight='bold')
plt.xlabel('ML Library', fontname='monospace', weight='semibold')
plt.ylabel('Current Professional Role', fontname='monospace', weight='semibold')
plt.show()

# <div class="subsection_title">4.11 Hosted Notebook </div>

In [None]:
# Usage per hosted notebook product, most used first.
cloudNotebook_cols = [col  for col in survey.columns if 'Which of the following hosted notebook products do you use on a regular basis?' in col ]
survey_obj=survey_clss(survey)
cloudNotebook_df=survey_obj.survey_colStats_func(cloudNotebook_cols,'hostedNotebook').toPandas()
cloudNotebook_df.sort_values(by=['recCOUNT'], inplace=True, ascending=False)
# Renumber the rows 0..n-1 after sorting; the annotation loop below looks rows up by that index.
cloudNotebook_df.reset_index(drop=True, inplace=True)
fig, ax = plt.subplots(figsize=(14,10),facecolor='#FEFBF3',constrained_layout=True)
ax.set_facecolor('#FEFBF3')
for s in ["right", "top"]:
    ax.spines[s].set_visible(False)

# Thin horizontal grid lines plus very wide vertical ones that act as a band behind each marker.
ax.grid(which='major', axis='y', zorder=0, color='#CDD0CB', linewidth=0.3, alpha=0.5)
ax.grid(which='major', axis='x', zorder=0, color='#E8EAE6', linewidth=18, alpha=0.7)


# One large marker per product, drawn explicitly on ax.
sns.scatterplot(data=cloudNotebook_df, x=cloudNotebook_df["hostedNotebook"], y=cloudNotebook_df["%CNT"],s=550,
               zorder=3, linewidth=0.3, alpha=0.7, ax=ax)


# Write each percentage (one decimal) centred on its marker.
for i,val in enumerate(cloudNotebook_df['%CNT']):
    val=f'{val:.1f}%'
    ax.annotate(val, (cloudNotebook_df['hostedNotebook'][i], cloudNotebook_df['%CNT'][i]),
              ha='center', va='center', fontsize=6,color='white',weight='semibold')


for tick in ax.get_xticklabels():
    tick.set_rotation(90)  
    
# getting the array of values of y-axis
ticks = ax.get_yticks()
# format the tick values as strings with one decimal and a trailing percent sign
new_labels = [f'{val:.1f}%' for val in ticks]
# Pin the current tick positions first, then set the new labels on them
ax.set_yticks(ax.get_yticks().tolist())
ax.set_yticklabels(new_labels)

#Labels
ax.set_xlabel('Hosted Notebooks', labelpad=10,weight='semibold')
ax.set_ylabel('User count percentage', labelpad=8,weight='semibold')


#title
# Drawn as plain text in data coordinates, 5 y-units above the current upper y limit.
x0, x1 = ax.get_xlim()
y0, y1 = ax.get_ylim()
ax.text(x0, y1+5, 'Which of the following hosted notebook products do you use on a regular basis?', fontsize=8, ha='left', va='top')

plt.show()


# <div class="subsection_title">4.12 Big Data tools </div>

In [None]:
# Usage per big data product, most used first.
bigdata_cols = [col  for col in survey.columns if 'Which of the following big data products (relational databases, data warehouses, data lakes, or similar) do you use on a regular basis? (Select all that apply) - Selected Choice' in col ]
survey_obj=survey_clss(survey)
bigdata_df=survey_obj.survey_colStats_func(bigdata_cols,'BigDataTools').toPandas()
bigdata_df.sort_values(by=['recCOUNT'], inplace=True, ascending=False)

# Total number of survey rows. Computed once here: survey.count() is a Spark action,
# so calling it inside the label loop would launch one job per bar.
totalrecCNT=survey.count()

fig,ax=plt.subplots(1,1,figsize=(15, 10))
sns.barplot(x=bigdata_df["BigDataTools"],y=bigdata_df['recCOUNT'],ax=ax,palette=cust_color)
ax.set_facecolor('#FEFBF3')
# Hide the right and top spines
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

ax.grid(which='major', axis='y',  color='#EEEEEE', lw=0.4)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha='right')

#plot title
plt.rcParams['figure.titlesize']=14
plt.rcParams['figure.titleweight']='bold'
fig.suptitle('Big Data tools used by Kagglers', fontdict={'fontsize': plt.rcParams['figure.titlesize'],
                                          'fontweight' : plt.rcParams['figure.titleweight'],
                                          'verticalalignment': 'baseline',
                                          'horizontalalignment': 'left'})

#Labels
ax.set_xlabel('Big Data Tools', labelpad=7,weight='semibold')
ax.set_ylabel('NoOfUsers', labelpad=7,weight='semibold')

# data label: share of all respondents, written 50 y-units above each bar
for i in ax.patches:
    userCnt=round((i.get_height()/totalrecCNT)*100,2)
    val=f'{userCnt:.1f}%'
    x = i.get_x()+ i.get_width() / 2 
    y = i.get_y()+i.get_height()+50
    ax.text(x, y, val, ha='center', va='center', fontsize=7, weight='semibold',color='red') 

plt.show()

# <div class="subsection_title">4.13 Business Intelligence tools </div>

In [None]:
# Usage per business intelligence tool, most used first.
BI_cols = [col  for col in survey.columns if 'Which of the following business intelligence tools do you use on a regular basis? (Select all that apply) - Selected Choice' in col ]
survey_obj=survey_clss(survey)
BItools_df=survey_obj.survey_colStats_func(BI_cols,'BItools').toPandas()
BItools_df.sort_values(by=['recCOUNT'], inplace=True, ascending=False)
# Renumber the rows 0..n-1 after sorting; the annotation loop below looks rows up by that index.
BItools_df.reset_index(drop=True, inplace=True)
fig, ax = plt.subplots(figsize=(14,10),facecolor='#FEFBF3',constrained_layout=True)

for s in ["right", "top"]:
    ax.spines[s].set_visible(False)

# Thin horizontal grid lines plus very wide vertical ones that act as a band behind each marker.
ax.grid(which='major', axis='y', zorder=0, color='#CDD0CB', linewidth=0.3, alpha=0.5)
ax.grid(which='major', axis='x', zorder=0, color='#E8EAE6', linewidth=18, alpha=0.7)


# One large marker per tool, drawn explicitly on ax.
sns.scatterplot(data=BItools_df, x=BItools_df["BItools"], y=BItools_df["%CNT"],s=550,
               zorder=3, linewidth=0.3, alpha=0.7, ax=ax)


# Write each percentage (one decimal) centred on its marker.
for i,val in enumerate(BItools_df['%CNT']):
    val=f'{val:.1f}%'
    ax.annotate(val, (BItools_df['BItools'][i], BItools_df['%CNT'][i]),
              ha='center', va='center', fontsize=6,color='white',weight='semibold')


for tick in ax.get_xticklabels():
    tick.set_rotation(90)  
    
# getting the array of values of y-axis
ticks = ax.get_yticks()
# format the tick values as strings with one decimal and a trailing percent sign
new_labels = [f'{val:.1f}%' for val in ticks]
# Pin the current tick positions first, then set the new labels on them
ax.set_yticks(ax.get_yticks().tolist())
ax.set_yticklabels(new_labels)

#Labels
ax.set_xlabel('Business Intelligence Tools', labelpad=10,weight='semibold')
ax.set_ylabel('User count percentage', labelpad=8,weight='semibold')


#title
# Drawn as plain text in data coordinates, 5 y-units above the current upper y limit.
x0, x1 = ax.get_xlim()
y0, y1 = ax.get_ylim()
ax.text(x0, y1+5, 'Which of the following business intelligence tools do you use on a regular basis.', fontsize=8, ha='left', va='top')

plt.show()

# <div class="subsection_title">4.14 NLPs </div>

In [None]:
# Usage per NLP method, most used first.
NLP_cols = [col  for col in survey.columns if 'Which of the following natural language processing (NLP) methods do you use on a regular basis? ' in col ]
survey_obj=survey_clss(survey)
NLP_df=survey_obj.survey_colStats_func(NLP_cols,'NLP_lib').toPandas()
NLP_df.sort_values(by=['recCOUNT'], inplace=True, ascending=False)

# Total number of survey rows. Computed once here: survey.count() is a Spark action,
# so calling it inside the label loop would launch one job per bar.
totalrecCNT=survey.count()

fig,ax=plt.subplots(1,1,figsize=(15, 10))
sns.barplot(x=NLP_df["NLP_lib"],y=NLP_df['recCOUNT'],ax=ax,palette=cust_color)
ax.set_facecolor('#FEFBF3')
# Hide the right and top spines
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

ax.grid(which='major', axis='y',  color='#EEEEEE', lw=0.4)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha='right')

#plot title
plt.rcParams['figure.titlesize']=14
plt.rcParams['figure.titleweight']='bold'
fig.suptitle('NLP libraries used by Kagglers', fontdict={'fontsize': plt.rcParams['figure.titlesize'],
                                          'fontweight' : plt.rcParams['figure.titleweight'],
                                          'verticalalignment': 'baseline',
                                          'horizontalalignment': 'left'})

#Labels
ax.set_xlabel('NLP Libraries', labelpad=7,weight='semibold')
ax.set_ylabel('NoOfUsers', labelpad=7,weight='semibold')

# data label: share of all respondents, written 50 y-units above each bar
for i in ax.patches:
    userCnt=round((i.get_height()/totalrecCNT)*100,2)
    val=f'{userCnt:.1f}%'
    x = i.get_x()+ i.get_width() / 2 
    y = i.get_y()+i.get_height()+50
    ax.text(x, y, val, ha='center', va='center', fontsize=7, weight='semibold',color='red') 

plt.show()

# <div class="subsection_title">4.15 Managed ML products </div>

In [None]:
# All survey columns that belong to the "managed machine learning products" multi-select question.
managed_ml_cols = [
    column_name
    for column_name in survey.columns
    if 'Do you use any of the following managed machine learning products on a regular basis? (Select all that apply) - Selected Choice' in column_name
]

# Usage per managed ML product, most used first; the frame is the cell's displayed output.
survey_obj = survey_clss(survey)
managed_ml_df = survey_obj.survey_colStats_func(managed_ml_cols, 'managedMLtools').toPandas()
managed_ml_df.sort_values(by=['recCOUNT'], ascending=False, inplace=True)
managed_ml_df

It appears that most of Kagglers don't use hosted ML platform on public clouds like AWS, Azure.

# <div class="subsection_title">4.16 Where Kagglers work </div>

In [None]:

# Respondent counts per employer industry; rows with no answer are grouped under
# the label 'No Industry specified'.
industry_df = (
    survey
    .select(
        when(
            col('In what industry is your current employer/contract (or your most recent employer if retired)? - Selected Choice').isNull(),
            'No Industry specified',
        )
        .otherwise(col('In what industry is your current employer/contract (or your most recent employer if retired)? - Selected Choice'))
        .alias('CurrentIndustry')
    )
    .groupBy(col('CurrentIndustry'))
    .agg(count('*').alias('recCount'))
    .toPandas()
)

# Treemap with one tile per industry, sized by respondent count.
fig = px.treemap(industry_df, path=['CurrentIndustry'], values=industry_df['recCount'])
fig.update_traces(root_color="lightgrey")
fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
# Show the count under each industry label.
fig.data[0].texttemplate = "%{label}<br>%{value}"
fig.show()

# <div class="subsection_title">4.17 Yearly Compensation </div>

In [None]:
# Respondent counts per yearly compensation band, largest band first.
compensation_df=survey.select(col('What is your current yearly compensation (approximate $USD)?').alias('Yearlycompensation'))\
.groupBy(col('Yearlycompensation'))\
.agg(count("*").alias('recCount'))\
.toPandas()
compensation_df.sort_values(by=['recCount'], inplace=True, ascending=False)
compensation_df.reset_index(drop=True, inplace=True)

# Total number of survey rows. Computed once here: survey.count() is a Spark action,
# so calling it inside the label loop would launch one job per bar.
totalrecCNT=survey.count()

fig, ax = plt.subplots(figsize=(14,10),facecolor='#FEFBF3',constrained_layout=True)
ax.set_facecolor('#FEFBF3')
# Hide the right and top spines
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
# Draw explicitly on ax instead of relying on it being the current axes.
ax=sns.barplot(data=compensation_df,x='Yearlycompensation',y='recCount',linewidth=0,saturation=1
              ,palette= sns.color_palette("YlOrBr", 26),ax=ax)

ax.grid(which='major', axis='y',  color='#EEEEEE', lw=0.4)

#plot title

ax.set_title('Year Compensation in ML space - 2021',loc='center',fontdict={'fontsize': plt.rcParams['axes.titlesize'],
                                          'fontweight' : plt.rcParams['axes.titleweight'],
                                          'verticalalignment': 'baseline',
                                          'horizontalalignment': 'left'})


# data label: share of all respondents, written 75 y-units above each bar
for i in ax.patches:
    userCnt=i.get_height()
    val=f'{round(((userCnt/totalrecCNT)*100),2):.1f}%'
    x = i.get_x() + i.get_width() / 2 
    y = i.get_y() + i.get_height()+75
    ax.text(x, y, val, ha='center', va='center', fontsize=7, weight='semibold',color='red') 
    
ax.tick_params(axis='both', which='major', labelsize=8,width=0.3, length=1)

#Labels
ax.set_xlabel('Yearly compensation', labelpad=10,weight='semibold')
ax.set_ylabel('NoOfUsers', labelpad=8,weight='semibold')

ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha='right')

plt.show()

# <div class="subsection_title">4.18 Gender vs Yearly Compensation </div>

In [None]:
import numpy as np

# Respondent counts per yearly compensation band, men only.
compensation_men_df=survey.filter(col('What is your gender? - Selected Choice')=='Man').select(col('What is your current yearly compensation (approximate $USD)?').alias('Yearlycompensation'))\
.groupBy(col('Yearlycompensation'))\
.agg(count("*").alias('recCount-Men'))\
.toPandas()
compensation_men_df.sort_values(by=['recCount-Men'], inplace=True, ascending=False)
compensation_men_df.reset_index(drop=True, inplace=True)

# Same for women; recCount-Women holds the negated count so the bars grow downwards.
compensation_women_df=survey.filter(col('What is your gender? - Selected Choice')=='Woman').select(col('What is your current yearly compensation (approximate $USD)?').alias('Yearlycompensation'))\
.groupBy(col('Yearlycompensation'))\
.agg(count("*").alias('recCount'))\
.withColumn('recCount-Women',-col('recCount'))\
.toPandas()
compensation_women_df.sort_values(by=['recCount'], inplace=True, ascending=False)
compensation_women_df.reset_index(drop=True, inplace=True)

# Align both genders on the compensation band. The two frames above are sorted
# independently, so plotting each against its own row index would put different
# bands of the two genders into the same x slot. A band present for only one
# gender gets a zero count for the other.
compensation_gender_df = compensation_women_df.merge(compensation_men_df, on='Yearlycompensation', how='outer')
count_cols = ['recCount', 'recCount-Women', 'recCount-Men']
compensation_gender_df[count_cols] = compensation_gender_df[count_cols].fillna(0).astype(int)
# Respondents who gave no compensation answer get an explicit label instead of an empty tick.
compensation_gender_df['Yearlycompensation'] = compensation_gender_df['Yearlycompensation'].fillna('Not specified')
# Keep the previous ordering: bands sorted by the women's count, largest first.
compensation_gender_df = compensation_gender_df.sort_values(by='recCount', ascending=False).reset_index(drop=True)

fig, ax = plt.subplots(figsize=(14,10),facecolor='#FEFBF3',constrained_layout=True)
ax.set_facecolor('#FEFBF3')
# Hide the right and top spines
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['left'].set_visible(False)

# Women's bars go below the axis, men's above; the legend labels match the data they show.
positions = np.arange(len(compensation_gender_df))
ax.bar(positions, compensation_gender_df['recCount-Women'], width=0.7, color='#40E0D0', alpha=0.8, label='Female')
ax.bar(positions, compensation_gender_df['recCount-Men'], width=0.7, color='#FF6A33', alpha=0.8, label='Male')

ax.xaxis.set_ticks(positions)
ax.set_xticklabels(compensation_gender_df['Yearlycompensation'], rotation=90)
# The y tick labels are hidden; the counts are written on the bars instead.
ax.set_yticklabels([])

ax.set_title('Year Compensation vs Gender',loc='center',fontdict={'fontsize': plt.rcParams['axes.titlesize'],
                                          'fontweight' : plt.rcParams['axes.titleweight'],
                                          'verticalalignment': 'baseline',
                                          'horizontalalignment': 'left'})

# data label: absolute count just beyond the end of each bar
for i in ax.patches:
    userCnt=(i.get_height())
    if userCnt==0:
        # Nothing to label for a band that one gender never selected.
        continue
    x = i.get_x() + i.get_width() / 2 
    if(userCnt>0):
        y = i.get_y() + (i.get_height())+75
    else:
        y = i.get_y() + i.get_height()-200
    ax.text(x, y, int(abs(userCnt)), ha='center', va='center', fontsize=9, weight='semibold',color='black') 

ax.legend()
plt.show()

**References**
I have learned a lot by reviewing other Kagglers work and have used those ideas/knowledge in this notebook. 

1. Notebook - https://www.kaggle.com/datafan07/what-takes-to-be-a-data-scientist-story-of-robert

2. Notebook - https://www.kaggle.com/corneliuskristianto/machine-learning-and-data-science-survey-analysis

3. Notebook - https://www.kaggle.com/spitfire2nd/enthusiast-to-data-professional-what-changes#3.-What-should-I-focus-on?

**Thank You** for reading this notebook. My thanks to all Kagglers who have shared their work, inspired me to do the same, and given me the opportunity to learn from it.

Please provide comments for improvements and **upvote** if you liked my effort.