In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import plotly.express as px
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=True)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
data_df = pd.read_csv('/kaggle/input/kaggle-survey-2021/kaggle_survey_2021_responses.csv',low_memory=False)
data_df.head()

In [None]:
data_df.shape

In [None]:
data_df.columns

# Data Cleaning

### 1) Split the original dataframe into smaller dataframes
a) general_df: 'Q1','Q2','Q3','Q4','Q5','Q6'<br>
b) tech_df: 'Q6','Q7','Q8','Q9','Q10'<br>
c) comp_df: 'Q11','Q12','Q13'<br>
d) ml_df: 'Q14','Q15','Q16' <br>

### 2) Give Columns meaningful names and prepare the data

### a) general_df
Q1- What is your age (# years)? <br>
Q2- What is your gender? <br>
Q3- In which country do you currently reside? <br>
Q4- What is the highest level of formal education that you have attained or plan to attain within the next 2
years? <br>
Q5- Select the title most similar to your current role (or most recent title if retired): <br>
Q6- For how many years have you been writing code and/or programming?

In [None]:
#split dataframe
#general qs data frame
general_df = data_df[['Q1','Q2','Q3','Q4','Q5','Q6']].copy()
general_df.head()

In [None]:
# Map the question codes to readable column names, then discard the row with
# index label 0 (presumably the question-text row rather than a response).
readable_names = {
    'Q1': 'age',
    'Q2': 'gender',
    'Q3': 'country',
    'Q4': 'education',
    'Q5': 'job_role',
    'Q6': 'coding_exp',
}
general_df = general_df.rename(columns=readable_names).drop(index=0)
general_df.head()

In [None]:
general_df.info()

In [None]:
general_df.age.unique()

In [None]:
general_df.gender.unique()

In [None]:
# Collapse the smaller gender categories into a single "Other" bucket.
# The result is assigned back to the column instead of calling
# replace(inplace=True) on the selected Series: that chained form is not
# guaranteed to modify general_df (it warns in pandas 2.x and is a no-op
# under Copy-on-Write).
general_df['gender'] = general_df['gender'].replace(
    ['Nonbinary','Prefer not to say','Prefer to self-describe'], 'Other')

In [None]:
general_df.country.mode()

In [None]:
general_df.education.unique()

In [None]:
# Shorten the verbose education labels.
# Assign the result back to the column: replace(inplace=True) on an attribute-
# selected Series is chained mutation and may not update general_df (it warns
# in pandas 2.x and has no effect under Copy-on-Write).
general_df['education'] = general_df['education'].replace({
    'Some college/university study without earning a bachelor’s degree':'college and no degree',
    'No formal education past high school':'high school',
    'I prefer not to answer':'no answer',
})
general_df.education.unique()

In [None]:
general_df.job_role.unique()

In [None]:
general_df.coding_exp.unique()

In [None]:
# Express "never written code" as a duration so it reads like the other buckets.
# Assigned back to the column because replace(inplace=True) on a selected
# Series is chained mutation and is not guaranteed to update general_df.
general_df['coding_exp'] = general_df['coding_exp'].replace('I have never written code','0 years')

In [None]:
general_df.coding_exp.unique()

### b) tech_df
Q6- For how many years have you been writing code and/or programming? <br>
Q7- What programming languages do you use on a regular basis? (Select all that apply) <br>
Q8- What programming language would you recommend an aspiring data scientist to learn first? <br>
Q9- Which of the following integrated development environments (IDE's) do you use on a regular basis?(Select all that apply) <br>
Q10- Which of the following hosted notebook products do you use on a regular basis? (Select all that apply) <br>

In [None]:
# programming languages, IDE, host notebooks
tech_df = data_df.loc[:,data_df.columns.str.contains('^Q6|^Q7|^Q8|^Q9|^Q10')].copy()
tech_df

In [None]:
tech_df.columns

In [None]:
def get_header(col_names,df,keyword):
    """Build prefixed column labels from the text stored in row 0 of ``df``.

    For every column in ``col_names`` the cell at index label 0 is read, the
    text after the last ``" - "`` separator is taken as the answer option, and
    ``keyword`` is prepended to it.

    Parameters
    ----------
    col_names : list of str
        Columns of ``df`` to build labels for.
    df : pandas.DataFrame
        Frame whose row with index label 0 holds strings such as
        ``"<question> - Selected Choice - Python"`` (assumed format).
    keyword : str
        Prefix added to every option name, e.g. ``'lang_'``.

    Returns
    -------
    list of str
        One ``keyword + option`` label per entry of ``col_names``, in order.
    """
    vals = df.loc[0, col_names]
    # Split on the spaced " - " separator, not on a bare hyphen, so that
    # hyphenated option names (e.g. "Scikit-learn") are kept whole instead of
    # being truncated to the part after the hyphen.
    return [keyword + val.rsplit(" - ", 1)[-1].strip() for val in vals]

In [None]:
q7_col_names = ['Q7_Part_1', 'Q7_Part_2', 'Q7_Part_3', 'Q7_Part_4', 'Q7_Part_5',
                   'Q7_Part_6', 'Q7_Part_7', 'Q7_Part_8', 'Q7_Part_9', 'Q7_Part_10',
                   'Q7_Part_11', 'Q7_Part_12', 'Q7_OTHER']
q7_header = get_header(q7_col_names,tech_df,'lang_')

q7_header

In [None]:
q9_col_names = ['Q9_Part_1', 'Q9_Part_2',
               'Q9_Part_3', 'Q9_Part_4', 'Q9_Part_5', 'Q9_Part_6', 'Q9_Part_7',
               'Q9_Part_8', 'Q9_Part_9', 'Q9_Part_10', 'Q9_Part_11', 'Q9_Part_12',
               'Q9_OTHER']
q9_header = get_header(q9_col_names,tech_df,'ide_')

q9_header

In [None]:
q10_col_names = ['Q10_Part_1', 'Q10_Part_2', 'Q10_Part_3', 'Q10_Part_4',
                'Q10_Part_5', 'Q10_Part_6', 'Q10_Part_7', 'Q10_Part_8', 'Q10_Part_9',
                'Q10_Part_10', 'Q10_Part_11', 'Q10_Part_12', 'Q10_Part_13',
                'Q10_Part_14', 'Q10_Part_15', 'Q10_Part_16', 'Q10_OTHER']
q10_header = get_header(q10_col_names,tech_df,'host_')

q10_header

In [None]:
col_names = q7_col_names + q9_col_names + q10_col_names + ['Q6','Q8']
col_header = q7_header + q9_header + q10_header + ['coding_exp','rec_lang']
tech_header = dict(zip(col_names,col_header))

In [None]:
tech_df.rename(columns=tech_header,inplace=True)
tech_df.head()

In [None]:
tech_df.drop(0,inplace = True)
tech_df.head()

### c) comp_df
Q11- What type of computing platform do you use most often for your data science projects? <br>
Q12- Which types of specialized hardware do you use on a regular basis? (Select all that apply) <br>
Q13- Approximately how many times have you used a TPU (tensor processing unit)?

In [None]:
# computing platform and hardware
# Select the computing-platform / hardware questions (Q11, Q12, Q13).
# Every alternative is anchored with ^ so only columns that *start* with one of
# these question codes match (previously only Q11 was anchored).
comp_df = data_df.loc[:,data_df.columns.str.contains('^Q11|^Q12|^Q13')].copy()
comp_df.columns

In [None]:
q12_col_names = ['Q12_Part_1', 'Q12_Part_2', 'Q12_Part_3', 'Q12_Part_4',
                   'Q12_Part_5', 'Q12_OTHER']
q12_header = get_header(q12_col_names,comp_df,'hw_')

q12_header

In [None]:
col_names = ['Q11'] + q12_col_names + ['Q13']
col_new = ['comp_platform'] + q12_header + ['TPU']
comp_header = dict(zip(col_names,col_new))
comp_header

In [None]:
comp_df.rename(columns = comp_header, inplace = True)
comp_df.columns

In [None]:
comp_df.drop(0,inplace = True)
comp_df.head()

### d) ml_df
Q14- What data visualization libraries or tools do you use on a regular basis? (Select all that apply) <br>
Q15- For how many years have you used machine learning methods? <br>
Q16- Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply)

In [None]:
# dataframe for ML
# Select the visualisation / machine-learning questions (Q14, Q15, Q16).
# Every alternative is anchored with ^ so only columns that *start* with one of
# these question codes match (previously only Q14 was anchored).
ml_df = data_df.loc[:,data_df.columns.str.contains('^Q14|^Q15|^Q16')].copy()
ml_df.columns

In [None]:
q14_col_names = ['Q14_Part_1', 'Q14_Part_2', 'Q14_Part_3', 'Q14_Part_4', 'Q14_Part_5',
                   'Q14_Part_6', 'Q14_Part_7', 'Q14_Part_8', 'Q14_Part_9', 'Q14_Part_10',
                   'Q14_Part_11', 'Q14_OTHER']
q14_header = get_header(q14_col_names,ml_df,'vis_')

q14_header

In [None]:
q16_col_names = ['Q16_Part_1', 'Q16_Part_2',
                   'Q16_Part_3', 'Q16_Part_4', 'Q16_Part_5', 'Q16_Part_6', 'Q16_Part_7',
                   'Q16_Part_8', 'Q16_Part_9', 'Q16_Part_10', 'Q16_Part_11', 'Q16_Part_12',
                   'Q16_Part_13', 'Q16_Part_14', 'Q16_Part_15', 'Q16_Part_16',
                   'Q16_Part_17', 'Q16_OTHER']
q16_header = get_header(q16_col_names,ml_df,'fmw_')

q16_header

In [None]:
q16_header[0] = 'fmw_sklearn'
q16_header

In [None]:
col_names = ['Q15'] + q14_col_names + q16_col_names
col_new = ['ml_years'] + q14_header + q16_header
ml_header = dict(zip(col_names,col_new))
ml_header

In [None]:
ml_df.rename(columns = ml_header,inplace = True)
ml_df.drop(0,inplace = True)
ml_df.head()

# Data visualization

### a) general_df

In [None]:
general_df.head()

In [None]:
dict(general_df.age.value_counts())

In [None]:
# Tabulate respondents per age segment (computed once, then reused) and add
# each segment's share of the total as a percentage.
age_counts = general_df.age.value_counts()
data = {
    'Age Segment': list(age_counts.index),
    'Number of people': list(age_counts.values),
}
age_df = pd.DataFrame(data)
age_total = age_df['Number of people'].sum()
age_df['percent'] = age_df['Number of people'] / age_total * 100
age_df

In [None]:
# Respondent count per (gender, age) pair; the non-null 'country' entries are
# what gets counted, and the result is ordered from largest to smallest group.
age_gender_df = (
    general_df[['gender','age','country']]
    .groupby(['gender','age'], as_index=False)
    .count()
    .rename(columns={'country':'Gender Count','age':'Age Segment','gender':'Gender'})
    .sort_values(by='Gender Count', ascending=False)
)

In [None]:
gender_df = age_gender_df[['Gender','Gender Count']].groupby('Gender',as_index=False).sum().copy()

In [None]:
gender_df.sort_values(by='Gender Count',inplace=True,ascending=False)

In [None]:
plt.figure(figsize=(7,7));
plt.title('Gender Count', fontsize=23)
plt.pie(data = gender_df,x='Gender Count',autopct='%.0f%%'
        , textprops={'fontsize': 15}, colors = ['#82E0AA','#FA8072','#808B96']);
plt.legend(labels = ['Men','Women','Other'], fontsize = 12);

<h1><center> The pie chart above shows that the majority of participants were men (79%), followed by women (19%)</center></h1>

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(15, 7), sharey=True);
fig.suptitle('Age And Gender',fontsize=25);

#Age
s=sns.barplot(ax=axes[0],data = age_df, x='Age Segment',y='Number of people',color='#16A085');
axes[0].set_title('Respondents Age', fontsize = 15);

#Age and gender(men and women)
sns.barplot(ax=axes[1],data = age_gender_df, x='Age Segment', y='Gender Count', hue='Gender'
            , palette = np.array(['#82E0AA','#FA8072','#808B96']) );
axes[1].set_title('Age vs Gender', fontsize = 15);


<h1><center> After examining age segments we could see that almost 75% of the participants are between 18 to 35 years old</center></h1>

In [None]:
# Respondent count per country (non-null 'age' entries are counted), ordered
# from the most to the least represented country.
country_df = (
    general_df[['country','age']]
    .groupby(['country'], as_index=False)
    .count()
    .rename(columns={'age':'country_count'})
    .sort_values(by='country_count', ascending=False)
)

In [None]:
fig = px.treemap(country_df, path=[px.Constant('country'),'country'], values='country_count',color = 'country_count'
                 , color_continuous_scale=px.colors.sequential.Blugrn, title = 'Respondents Countries')
fig.data[0].textinfo = "label+value+percent parent"
fig.show()

<h1><center> As for the Respondents' Countries, India was the country of most respondents with a percentage of 29%, followed by USA with a percentage of 10% </center></h1>

In [None]:
# Respondent count per (coding experience, gender) pair, largest groups first.
coding_gender_df = (
    general_df[['coding_exp','gender','age']]
    .groupby(['coding_exp','gender'], as_index=False)
    .count()
    .rename(columns={'age':'Gender_Count'})
    .sort_values(by='Gender_Count', ascending=False)
)

# Share of all counted respondents that each group represents, shown as bar text.
group_share = coding_gender_df['Gender_Count'] / coding_gender_df['Gender_Count'].sum() * 100
coding_gender_df['percent'] = group_share.round(2)

fig = px.bar(
    coding_gender_df,
    x='coding_exp',
    y='Gender_Count',
    color='gender',
    title='Coding Experience vs Gender',
    labels={"coding_exp": "Coding Experience", "Gender_Count": "Gender Count"},
    color_discrete_sequence=['#82E0AA','#FA8072','#808B96'],
    text='percent',
)
fig.show()

<h1><center> Almost 55% of the people who took the survey had either less than one year or between 1 and 3 years of coding experience</center></h1>

In [None]:
general_df.job_role.unique()

In [None]:
# Respondents per job title, ordered from the most to the least common title.
job_counts = general_df.job_role.value_counts()
data = {
    'job/role': list(job_counts.index),
    'Number of people': list(job_counts.values),
}
job_df = pd.DataFrame(data).sort_values(by='Number of people', ascending=False)


In [None]:
# Bar chart of respondents per job title; each bar is annotated with its share
# of all respondents in job_df, in percent.
total_people = job_df['Number of people'].sum()

plt.figure(figsize=(18, 9))
plt.xticks(rotation=70, fontsize=12)
plt.yticks(fontsize=12)
barplot = plt.bar(job_df['job/role'], job_df['Number of people'], color='#16A085');
for bar in barplot:
    bar_height = bar.get_height()
    share = round((int(bar_height) / total_people) * 100, 2)
    # Place the label a quarter of the way across the bar, just above its top.
    plt.text(bar.get_x() + bar.get_width() / 4, bar_height, share, va='bottom', fontsize=12)
plt.xlabel('Job Title', fontsize=15);
plt.ylabel('Number of Respondents', fontsize=15);
plt.title('Number Of People per Job/Role', fontsize=25);

<h1><center> 26% of the survey respondents were students followed by Data Scientists that were nearly half of the number of students with a percentage of almost 14% </center></h1>

In [None]:
job_sal_df = data_df[['Q5','Q25']].copy().drop(0)
job_sal_df.rename(columns = {'Q5':'job','Q25':'salary'},inplace = True)
job_sal_df['count_val'] = 1
job_sal_df.head()

In [None]:
job_sal_df = job_sal_df.groupby(['job','salary'],as_index = False).count().copy()
job_sal_df.head()

In [None]:
# Reshape the (job, salary) counts into a job x salary grid for the heatmap.
# pivot() is called with keyword arguments: pandas 2.x no longer accepts
# index/columns/values positionally.
job_sal = job_sal_df.pivot(index="job", columns="salary", values="count_val")
plt.figure(figsize = (14,7))
sns.heatmap(job_sal, linewidths=.7, cmap=sns.color_palette("viridis"));
plt.title('Job Title vs. Salary',fontsize = 25);
plt.xlabel('Salary Range',fontsize = 15);
plt.ylabel('Job Title',fontsize = 15);

### b) tech_df

In [None]:
lang_df = tech_df.loc[:,tech_df.columns.str.contains('^coding|^lang')].copy()
lang_df.drop(['lang_None','lang_Other'],axis=1,inplace = True)

In [None]:
lang_exp_df = lang_df.groupby('coding_exp').count().copy()
lang_exp_df.drop('I have never written code',inplace = True)

In [None]:
plt.figure(figsize = (14,7))
hmp = sns.heatmap(lang_exp_df.transpose(), linewidths=.7, cmap=sns.color_palette("viridis"),annot = True,fmt='.0f');
plt.yticks(np.arange(0.5,lang_exp_df.shape[1]),labels=lang_exp_df.columns.str.replace('lang_',""),rotation=0);
plt.xlabel('Coding Experience',fontsize = 15);
plt.ylabel('Language',fontsize=15);
plt.title('# language users and experience', fontsize=25);

In [None]:
lang_df['count_row'] = lang_df.loc[:,lang_df.columns.str.contains('^lang')].count(axis=1)
lang_df

In [None]:
lang_df = lang_df[lang_df['coding_exp'] != 'I have never written code'].copy()
lang_df[['coding_exp','count_row']].head(10)
b = px.box(lang_df,x='coding_exp',y='count_row',title = 'Distribution of languages vs coding experience',
           labels={"coding_exp": "Coding Experience",
                     "count_row": "Languages Distribution"}
           ,color_discrete_sequence = ['#16A085']);
b.show();

<h1><center>The boxplot above shows the distribution of how many languages are used in each coding experience segment</center></h1>

In [None]:
tech_df['rec_lang'].value_counts()

In [None]:
# How many respondents recommended each language as the one to learn first.
rec_counts = tech_df['rec_lang'].value_counts()
rec_df = pd.DataFrame({
    'lang': list(rec_counts.index),
    'Recommended': list(rec_counts.values),
})
rec_df = rec_df.sort_values(by='Recommended', ascending=False)

# NOTE(review): rows are removed by hard-coded index labels 10 and 7, i.e. by
# their rank in the value counts. Presumably these are the 'None' and 'Other'
# answers -- confirm against the data; a label-based filter would be sturdier.
rec_df = rec_df.drop(index=10)
rec_df = rec_df.drop(index=7)

In [None]:
# Number of respondents using each language (non-null cells per lang_* column),
# most used first, with the 'lang_' prefix stripped from the names.
lang_usage = lang_df.loc[:, lang_df.columns.str.contains('^lang')].count(axis=0)
df = pd.DataFrame({
    'lang': list(lang_usage.index),
    'Used': list(lang_usage.values),
})
df = df.sort_values(by='Used', ascending=False)
df = df.replace({'lang_': ""}, regex=True)

In [None]:
rec_df = rec_df.merge(df,how='left',on='lang')

In [None]:
fig = px.bar(rec_df, x='lang', y=['Used','Recommended'],title = 'Used and Recommended languages',barmode='group',
             labels={"lang": "Language", "value": "Count of Respondents"}
             , color_discrete_sequence = ['#82E0AA','#FA8072'],text_auto=True)
fig.update_layout(legend_title = "")
fig.show()

<h1><center>This shows that Python is still popular and highly recommended among users. In contrast, SQL is recommended by only 1338 respondents, while more than 10k are using it</center></h1>


In [None]:
# Per respondent, count how many lang_* columns are filled in.
# NOTE(review): tech_df still contains lang_None and lang_Other at this point,
# so those selections are included in the count -- confirm this is intended.
df = (
    tech_df.loc[:, tech_df.columns.str.contains('^lang')]
    .count(axis=1)
    .to_frame('Number of Languages')
)

plt.figure(figsize=(10, 5))
plt.xlabel('Languages Number', fontsize=15)
plt.ylabel('Frequency', fontsize=15)
plt.title('Languages Number Distribution', fontsize=20)
sns.histplot(df, palette=["#16A085"], binwidth=0.5);
# Bins are 0.5 wide, so shift the ticks by 0.25 to centre each label on its bar.
tick_values = np.arange(0, 13, 1)
plt.xticks(tick_values + 0.25, labels=tick_values);

In [None]:
# Keep coding experience plus the IDE columns, minus the None/Other answers,
# and shorten the two longest IDE names.
ide_df = (
    tech_df.loc[:, tech_df.columns.str.contains('^coding|^ide')]
    .drop(['ide_None','ide_Other'], axis=1)
    .rename(columns={
        'ide_Jupyter (JupyterLab, Jupyter Notebooks, etc)': 'ide_Jupyter Lab',
        'ide_Visual Studio Code (VSCode)': 'ide_Visual Studio Code',
    })
)

# Users of each IDE per coding-experience segment, excluding non-coders.
ide_exp_df = (
    ide_df.groupby('coding_exp')
    .count()
    .drop('I have never written code')
)
ide_exp_df

In [None]:
plt.figure(figsize = (14,7))
hmp = sns.heatmap(ide_exp_df.transpose(), linewidths=.7, cmap=sns.color_palette("viridis"),annot = True,fmt='.0f');
plt.yticks(np.arange(0.5,ide_exp_df.shape[1]),labels=ide_exp_df.columns.str.replace('ide_',""),rotation=0);
plt.xlabel('Coding Experience',fontsize=15);
plt.ylabel('IDE',fontsize=15);
plt.title('# IDE Users vs. Experience',fontsize=25);

In [None]:
# Number of users per IDE (non-null cells per ide_* column), most used first,
# with the 'ide_' prefix stripped from the names.
ide_usage = ide_df.loc[:, ide_df.columns.str.contains('^ide')].count(axis=0)
df = (
    ide_usage.rename_axis('IDE')
    .reset_index(name='Users')
    .sort_values(by='Users', ascending=False)
    .replace({'ide_': ""}, regex=True)
)
df

In [None]:
fig = px.treemap(df, values='Users', path=[px.Constant('IDE'),'IDE'],title = 'Number of users of IDE',
             color = 'Users', color_continuous_scale=px.colors.sequential.Blugrn)
fig.data[0].textinfo = "label+value+percent parent"
fig.show()

<h1><center>• Jupyter Notebook is the most popular IDE across all coding experience segments</center></h1>
<h1><center>• Top 3 IDEs: 26% of the respondents use Jupyter Lab,
    followed by Visual Studio Code with 16% of respondents and PyCharm with 12%</center></h1>


In [None]:
ide_df['count_row'] = ide_df.loc[:,ide_df.columns.str.contains('^ide')].count(axis=1)
ide_df = ide_df[ide_df['coding_exp'] != 'I have never written code'].copy()
b = px.box(ide_df,x='coding_exp',y='count_row',title = 'Distribution of IDEs vs coding experience',
           labels={"coding_exp": "Coding Experience",
                     "count_row": "IDEs Distribution"}
           ,color_discrete_sequence = ['#16A085']);
b.show();

In [None]:
ide_df

In [None]:
host_df = tech_df.loc[:,tech_df.columns.str.contains('^coding|^host')].copy()
host_df.drop(['host_None','host_Other'],axis=1,inplace = True)
host_exp_df = host_df.groupby('coding_exp').count().copy()
host_exp_df.drop('I have never written code',inplace = True)
host_exp_df.transpose()

In [None]:
plt.figure(figsize = (12,7))
hmp = sns.heatmap(host_exp_df.transpose(), linewidths=.7, cmap=sns.color_palette("viridis"),annot = True,fmt='.0f');
plt.yticks(np.arange(0.5,host_exp_df.shape[1]),labels=host_exp_df.columns.str.replace('host_',""),rotation=0);
plt.xlabel('Coding Experience', fontsize=15);
plt.ylabel('Hosted Notebook',fontsize=15);
plt.title('# Hosted NB users vs experience',fontsize=25);

In [None]:
# Number of users per hosted-notebook product (non-null cells per host_* column),
# most used first, with the 'host_' prefix stripped from the names.
host_usage = host_df.loc[:, host_df.columns.str.contains('^host')].count(axis=0)
df = (
    host_usage.rename_axis('Host')
    .reset_index(name='Users')
    .sort_values(by='Users', ascending=False)
    .replace({'host_': ""}, regex=True)
)
# Each product's share of all selections, shown as the bar text.
df['percent'] = (df['Users'] / df['Users'].sum() * 100).round(2)

fig = px.bar(
    df,
    x='Users',
    y='Host',
    title='Used Hosted Notebook',
    orientation='h',
    labels={"Users": "Number of Users", "Host": "Hosted Notebook"},
    color_discrete_sequence=['#16A085'],
    text='percent',
)
fig.show()

<h1><center>The most popular hosted notebooks are Colab Notebooks, used by nearly 33% of the respondents, followed by Kaggle Notebooks with almost 32% (that's a really small difference!)</center></h1>

In [None]:
# Per respondent, count how many host_* columns are filled in, then drop the
# respondents who have never written code.
host_df['count_row'] = host_df.loc[:,host_df.columns.str.contains('^host')].count(axis=1)
host_df = host_df[host_df['coding_exp'] != 'I have never written code'].copy()
# The title now names hosted notebooks; it previously repeated the IDE plot's title.
b = px.box(host_df,x='coding_exp',y='count_row',title = 'Distribution of hosted notebooks vs coding experience',
           labels={"coding_exp": "Coding Experience",
                     "count_row": "Host Notebook Distribution"}
           ,color_discrete_sequence = ['#16A085']);
b.show();

### c) comp_df

In [None]:
comp_df.head()

In [None]:
comp_df.comp_platform.unique()

In [None]:
# Respondents per computing platform, most common first.
platform_counts = comp_df.comp_platform.value_counts()
df = (
    platform_counts.rename_axis('Platform')
    .reset_index(name='Users count')
    .sort_values(by='Users count', ascending=False)
)
df

In [None]:
fig = px.pie(df, values='Users count', names='Platform',title = 'Used Platform'
             , color_discrete_sequence = px.colors.qualitative.Dark2)
fig.show()

<h1><center>Almost 86% of the respondents are using either a laptop or a personal computer</center></h1>

In [None]:
# Number of users per hardware type (non-null cells per hw_* column), most used
# first, with the 'hw_' prefix stripped from the names.
hw_usage = comp_df.loc[:, comp_df.columns.str.contains('^hw')].count(axis=0)
df = (
    hw_usage.rename_axis('HardWare')
    .reset_index(name='Users')
    .sort_values(by='Users', ascending=False)
    .replace({'hw_': ""}, regex=True)
)
df

In [None]:
# Tabulate how many respondents gave each TPU-usage answer.
tpu_counts = comp_df.TPU.value_counts()
df = pd.DataFrame({'TPU':list(tpu_counts.index)
                   ,'# of times used':list(tpu_counts.values)})


fig, axes = plt.subplots(1, 2, figsize=(15, 7), sharey=False);
fig.suptitle('TPU number of times used',fontsize=20);

# Left panel: donut chart of the answer shares.
axes[0].pie(data = df,x='# of times used',autopct='%.0f%%'
        , textprops={'fontsize': 15}, colors = np.array(sns.color_palette("Set2"))
        ,wedgeprops=dict(width=0.3, edgecolor='white'),pctdistance=0.85);
axes[0].legend(labels = list(tpu_counts.index), bbox_to_anchor =(-0.1, -0.1),loc ="lower left");

# Right panel: draw the bars first and rotate the tick labels afterwards.
# Calling set_xticklabels(get_xticklabels()) on the still-empty axes pins the
# placeholder labels and does not reliably rotate the final category labels.
sns.barplot(ax=axes[1],data = df, x='TPU', y='# of times used', color = '#16A085');
axes[1].tick_params(axis='x', labelrotation=30);


### d) ml_df

In [None]:
ml_df.columns

In [None]:
ml_df.ml_years.unique()

In [None]:
ml_df.shape

In [None]:
general_df.shape

In [None]:
ml_df['job_role'] = general_df['job_role']
ml_df[['job_role']].head()

In [None]:
# Count respondents per (ML experience, job role) pair and lay the counts out
# as an ml_years x job_role grid; the "do not use ML" answer is relabelled
# as '0 years' before pivoting.
df = (
    ml_df[['ml_years','job_role']]
    .groupby(['ml_years','job_role'])
    .size()
    .reset_index(name='job_count')
    .replace('I do not use machine learning methods', '0 years')
    .pivot(index='ml_years', columns='job_role', values='job_count')
)

In [None]:
plt.figure(figsize = (14,7))
hmp = sns.heatmap(df.transpose(), linewidths=.7, cmap=sns.color_palette("viridis"),annot = True,fmt='.0f');
plt.xlabel('Ml years', fontsize = 15);
plt.ylabel('Job role', fontsize = 15);
plt.title('# ML years vs job role',fontsize = 25)
plt.xticks(rotation = 320);

<h1><center>I don't think that we can say that most users of ML methods are students as we previously mentioned that 26% of the participants were students and any conclusions based on participants' roles wouldn't be very accurate</center></h1>


In [None]:
ml_df.columns

In [None]:
# Number of users per visualisation library (non-null cells per vis_* column),
# excluding the None/Other answers, most used first, prefix stripped.
vis_cols = ml_df.loc[:, ml_df.columns.str.contains('^job_role|^vis')].drop(['vis_None','vis_Other'], axis=1)
vis_usage = vis_cols.loc[:, vis_cols.columns.str.contains('^vis')].count(axis=0)
df = (
    vis_usage.rename_axis('Visualization Library')
    .reset_index(name='Users')
    .sort_values(by='Users', ascending=False)
    .replace({'vis_': ""}, regex=True)
)
# Each library's share of all selections, shown as the bar text.
df['percent'] = (df['Users'] / df['Users'].sum() * 100).round(2)

fig = px.bar(
    df,
    x='Users',
    y='Visualization Library',
    title='Number of users of Vis. Libraries',
    orientation='h',
    labels={"Users": "Number of Users"},
    color_discrete_sequence=['#16A085'],
    text='percent',
)
fig.show()

In [None]:
# Users of each ML framework per job role (None/Other answers excluded):
# rows are job roles, columns are frameworks.
df = (
    ml_df.loc[:, ml_df.columns.str.contains('^job_role|^fmw')]
    .drop(['fmw_None','fmw_Other'], axis=1)
    .groupby('job_role')
    .count()
)

plt.figure(figsize=(15, 7))
hmp = sns.heatmap(
    df,
    linewidths=.7,
    cmap=sns.color_palette("viridis"),
    annot=True,
    fmt='.0f',
).set_title('# Framework users and job role', fontsize=20);
# Relabel the x ticks with the framework names minus their 'fmw_' prefix.
framework_names = df.columns.str.replace('fmw_', "")
plt.xticks(np.arange(0.5, df.shape[1]), labels=framework_names, rotation=30);
plt.xlabel('Framework', fontsize=15);
plt.ylabel('Job Role', fontsize=15);

In [None]:
# Number of users per ML framework (non-null cells per fmw_* column), excluding
# the None/Other answers, most used first, with the 'fmw_' prefix stripped.
fmw_cols = ml_df.loc[:, ml_df.columns.str.contains('^job_role|^fmw')].drop(['fmw_None','fmw_Other'], axis=1)
fmw_usage = fmw_cols.loc[:, fmw_cols.columns.str.contains('^fmw')].count(axis=0)
df = (
    fmw_usage.rename_axis('FrameWork')
    .reset_index(name='Users')
    .sort_values(by='Users', ascending=False)
    .replace({'fmw_': ""}, regex=True)
)

In [None]:
fig = px.treemap(df, values='Users', path=[px.Constant('FrameWork'),'FrameWork'],title = 'Number of users of Framework',
             color = 'Users', color_continuous_scale=px.colors.sequential.Blugrn)
fig.data[0].textinfo = "label+value+percent parent"
fig.show()

In [None]:
# Build 'alg_<option>' labels for the Q17 columns from the text in row 0:
# everything after the last "- Selected Choice -" marker is the option name.
q17_mask = data_df.columns.str.contains('^Q17')
vals = list(data_df.loc[0, q17_mask])
header = []
for val in vals:
    option = val.split("- Selected Choice -")[-1].strip()
    header.append('alg_' + option)

header

In [None]:
# Take the Q17 columns, label them with the alg_* names built in `header`,
# drop the row with index label 0, and attach each respondent's job role.
q17_cols = data_df.loc[:, data_df.columns.str.contains('^Q17')].columns
df = (
    data_df.loc[:, q17_cols]
    .rename(columns=dict(zip(q17_cols, header)))
    .drop(0)
)
df['job_role'] = general_df.job_role

In [None]:
# Users of each ML algorithm per job role, excluding the None/Other answers.
df.drop(['alg_None','alg_Other'],axis=1,inplace = True)
df = df.groupby('job_role').count().copy()
plt.figure(figsize = (15,7))
# The frame is transposed for plotting, so rows (y axis) are algorithms and
# columns (x axis) are job roles.
hmp = sns.heatmap(df.transpose(), linewidths=.7, cmap=sns.color_palette("viridis")
                  ,annot = True,fmt='.0f').set_title('# Algorithm users and job role', fontsize = 20);
plt.yticks(np.arange(0.5,df.shape[1]),labels=df.columns.str.replace('alg_',""),rotation=0,fontsize=12);
plt.xticks(fontsize=12);
# Axis titles follow the transposed layout (they were previously swapped).
plt.xlabel('Job Role',fontsize=15);
plt.ylabel('Algorithm',fontsize=15);

In [None]:
# Respondents per industry (Q20), skipping the row with index label 0.
# The count column is named explicitly through reset_index(name=...) instead of
# renaming column 0: pandas >= 2.0 calls the value_counts result 'count', so
# the old rename was a no-op and no 'Respondents' column was produced.
df = (
    data_df['Q20']
    .drop(0)
    .value_counts()
    .rename_axis('Industry')
    .reset_index(name='Respondents')
)

In [None]:
fig = px.treemap(df, values='Respondents', path=[px.Constant('Industry'),'Industry'],title = 'Number of Respondents of Industry',
             color = 'Respondents', color_continuous_scale=px.colors.sequential.Blugrn)
fig.data[0].textinfo = "label+value+percent parent"
fig.show()

<h1><center>ML and Data Science are still most popular in the fields of Computer/Technology (25%) and Academics/Education (20%) </center></h1>