In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Introduction
### Welcome to my notebook!
#### We will be exploring the 2021 Kaggle DS & ML Survey dataset. 
#### I wanted to look at what different industries require, to help students and people who are not yet working understand the next step towards the career they want.



## Objectives
> 1. To explore various aspects of the top 3 industries.
> 2. To help get a clear understanding on how to gain entry into any of these industries. 

### We will address the following questions:
>  1. Which are the top 3 most popular industries?
>  2. Which job title is most common among the top 3?
>  3. Which of the top 3 industries have more high-paying jobs?
>  4. Which languages are preferred in the top 3?
>  5. What is the education background of the people per industry?

#### To start off with this, let's first import and take a look at our data.

In [None]:
# Read the raw survey responses and preview the first rows.
survey_csv_path = '../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv'
data = pd.read_csv(survey_csv_path)

data.head()


#### The first row is a series of questions so it would be better to remove it before going any further.

In [None]:
# The first row of the CSV is dropped (the notebook text says it holds the
# question wording rather than a response); `data` itself is left untouched.
question_row_label = data.index[0]
df = data.drop(index=question_row_label)
df.head()

#### We can start working on it now!

## Which industries are the most popular currently?


In [None]:
# Order the industries (Q20) by respondent count so the largest is drawn first.
industry_order = df['Q20'].value_counts().index
industry_palette = sns.color_palette('rocket', 18)

plt.figure(figsize=(8, 5))
sns.countplot(y='Q20', data=df, order=industry_order, palette=industry_palette)
plt.xlabel('No. of people working')
plt.ylabel(' ')
plt.title('Industry vs No. of people working in them', fontsize=20)


*  The top three most popular industries according to this data are:
>       1. Technology
>       2. Education
>       3. Finance
*  There is a considerable gap between the number of people working in these three industries and the number working in any of the others.

#### We will now look at various aspects of the top 3 industries that we have seen namely- Computer/Technology, Education and Finance.

## Which job title is most common?

#### Let's look at the distribution of various roles.


In [None]:
# Overall distribution of job titles (Q5), most common title first.
# (A stray `df['Q5']` expression that was neither assigned nor displayed
# has been removed.)
role_order = df['Q5'].value_counts().index

plt.figure(figsize=(8, 5))
ax = sns.countplot(data=df, y='Q5', order=role_order,
                   palette=sns.color_palette('rocket', 15))
ax.set_title('Over-all Distribution of Roles', fontsize=20)
ax.set_ylabel(' ')

#### Among the people who filled out this survey, students form the largest group. Among working professionals, most are Data Scientists, followed by Software Engineers.
#### Data Analyst also is one of the top roles.

In [None]:
def plot_role_counts(industry_df, title, palette_name):
    """Draw a horizontal count plot of job titles (Q5) for one industry.

    Parameters
    ----------
    industry_df : pandas.DataFrame
        Survey rows for a single industry; must contain a 'Q5' column.
    title : str
        Title shown above the chart.
    palette_name : str
        Name of the seaborn palette used to colour the bars.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the chart was drawn on.
    """
    # Most common role first.
    role_order = industry_df['Q5'].value_counts().index
    plt.figure(figsize=(8, 5))
    ax = sns.countplot(data=industry_df, y='Q5', order=role_order,
                       palette=sns.color_palette(palette_name, 13))
    ax.set_title(title, fontsize=20)
    ax.set_ylabel(' ')
    return ax


# Removing students and unemployed people. Rows with a missing Q5 are kept,
# exactly as with the original `!=` comparisons.
df = df[~df['Q5'].isin(['Student', 'Currently not employed'])]

# One frame per industry; later cells reuse df1 / df2 / df3.
df1 = df[df['Q20'] == 'Computers/Technology']
df2 = df[df['Q20'] == 'Academics/Education']
df3 = df[df['Q20'] == 'Accounting/Finance']

# Plotting distribution of roles in each industry
ax = plot_role_counts(df1, 'Roles in Computer/Tech', 'flare')
ax = plot_role_counts(df2, 'Roles in Education', 'summer_r')
ax = plot_role_counts(df3, 'Roles in Finance', 'winter_r')


*  In each industry, these roles have the highest number of people:
> 1.  **Computer/Tech** : Software Engineers, Data Scientists and ML Engineers.
> 2.  **Education** : Research Scientist and Data Scientists.
> 3.  **Finance** : Data Scientists, Data Analysts and Business Analysts.
> 
*  An interesting thing to note is that Data Scientist as a role **is the only role that is common** to all the three industries as one of the top three roles. 
*  Also if we consider top 5 roles, Data Analyst is another common role amongst all three.

## Which industry has the most high-paying jobs?

In [None]:
# Display bucket -> the raw Q25 answer strings that are summed into it.
# Dict order is the order in which the bars are drawn.
SALARY_BUCKETS = {
    '0-9,999': ['$0-999', '1,000-1,999', '2,000-2,999', '3,000-3,999',
                '4,000-4,999', '5,000-7,499', '7,500-9,999'],
    '10,000-19,999': ['10,000-14,999', '15,000-19,999'],
    '20,000-29,999': ['20,000-24,999', '25,000-29,999'],
    '30,000-39,999': ['30,000-39,999'],
    '40,000-49,999': ['40,000-49,999'],
    '50,000-59,999': ['50,000-59,999'],
    '60,000-69,999': ['60,000-69,999'],
    '70,000-79,999': ['70,000-79,999'],
    '80,000-89,999': ['80,000-89,999'],
    '90,000-99,999': ['90,000-99,999'],
    '>100,000': ['100,000-124,999', '125,000-149,999', '150,000-199,999',
                 '200,000-249,999', '250,000-299,999', '300,000-499,999',
                 '$500,000-999,999', '>$1,000,000'],
}


def bucket_salary_counts(raw_counts):
    """Aggregate per-answer Q25 counts into the coarser SALARY_BUCKETS.

    Parameters
    ----------
    raw_counts : pandas.Series
        Result of ``value_counts()`` on a Q25 column (index = answer string).

    Returns
    -------
    pandas.DataFrame
        Columns 'Salary' (bucket label) and 'Count', one row per bucket.
    """
    # reindex(..., fill_value=0) instead of .loc[[...]]: value_counts() omits
    # answers nobody gave, and .loc with a missing label raises KeyError.
    rows = [
        [bucket, raw_counts.reindex(answers, fill_value=0).sum()]
        for bucket, answers in SALARY_BUCKETS.items()
    ]
    return pd.DataFrame(rows, columns=['Salary', 'Count'])


def plot_salary_distribution(salary_frame, title, palette_name):
    """Draw a horizontal bar chart of respondent counts per salary bucket.

    Parameters
    ----------
    salary_frame : pandas.DataFrame
        Output of ``bucket_salary_counts``.
    title : str
        Title shown above the chart.
    palette_name : str
        Name of the seaborn palette used to colour the bars.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the chart was drawn on.
    """
    plt.figure(figsize=(8, 5))
    ax = sns.barplot(data=salary_frame, x='Count', y='Salary',
                     palette=sns.color_palette(palette_name, len(salary_frame)))
    ax.set_title(title, fontsize=20)
    ax.set_ylabel(' ')
    return ax


## Computer/Technology ##
a2 = df1['Q25'].value_counts()
frame = bucket_salary_counts(a2)
ax = plot_salary_distribution(frame, 'Salary Distribution in Computer/Tech', 'flare')

## Academics/Education ##
b2 = df2['Q25'].value_counts()
frame1 = bucket_salary_counts(b2)
ax = plot_salary_distribution(frame1, 'Salary Distribution in Academics', 'summer_r')

## Finance ##
c2 = df3['Q25'].value_counts()
frame2 = bucket_salary_counts(c2)
ax = plot_salary_distribution(frame2, 'Salary Distribution in Finance', 'winter_r')

*  All three industries have maximum people in the 0-9,999 dollar range.
*  The field of Technology has the highest number of jobs that pay above 100,000 dollars (close to 500).
* The Finance industry has about 250 jobs that pay over 100,000 dollars while Education has below 250. 

## Which languages are most used currently?

In [None]:
# Q7 answer column -> language label, in the order the bars are drawn.
# Keeping column and label together avoids the two parallel lists drifting apart.
LANGUAGE_COLUMNS = {
    'Q7_Part_1': 'Python',
    'Q7_Part_2': 'R',
    'Q7_Part_3': 'SQL',
    'Q7_Part_4': 'C',
    'Q7_Part_5': 'C++',
    'Q7_Part_6': 'Java',
    'Q7_Part_7': 'JavaScript',
    'Q7_Part_8': 'Julia',
    'Q7_Part_9': 'Swift',
    'Q7_Part_10': 'Bash',
    'Q7_Part_11': 'MATLAB',
    'Q7_Part_12': 'None',
    'Q7_OTHER': 'Other',
}

lab = list(LANGUAGE_COLUMNS.values())


def count_language_users(industry_df):
    """Return the non-null count of every Q7 column, in LANGUAGE_COLUMNS order.

    ``count()`` tallies non-null cells; presumably a cell is non-null only when
    the respondent selected that language -- TODO confirm against the dataset.
    """
    return [industry_df[column].count() for column in LANGUAGE_COLUMNS]


def plot_language_counts(counts, title, palette_name):
    """Draw a bar chart of language counts for one industry.

    Parameters
    ----------
    counts : list
        One count per language, in LANGUAGE_COLUMNS order.
    title : str
        Title shown above the chart.
    palette_name : str
        Name of the seaborn palette used to colour the bars.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the chart was drawn on.
    """
    plt.figure(figsize=(10, 5))
    ax = sns.barplot(x=lab, y=counts,
                     palette=sns.color_palette(palette_name, len(lab)))
    # Tilt the language names so neighbouring labels do not overlap.
    ax.tick_params(axis='x', labelrotation=30)
    ax.set_title(title, fontsize=20)
    ax.set_ylabel('Count ')
    return ax


#In Computer/Tech
a = count_language_users(df1)
ax = plot_language_counts(a, 'Languages in Comp/Tech', 'flare')

#In Academics
a1 = count_language_users(df2)
ax = plot_language_counts(a1, 'Languages in Education', 'summer_r')

#In Finance
a2 = count_language_users(df3)
ax = plot_language_counts(a2, 'Languages in Finance', 'winter_r')


*    Python is undoubtedly the most used language in all three.
*   SQL is used more than R in all three.
*   R is preferred over other languages (except Python) in Academics and Finance; however Tech uses various other languages like C++, Java and JavaScript over R.

## Which education background is prevalent in the top 3 industries?

In [None]:
# Long Q4 answers and the short labels shown on the charts (paired by position).
EDUCATION_LONG_LABELS = ['No formal education past high school',
                         'Some college/university study without earning a bachelor’s degree']
EDUCATION_SHORT_LABELS = ['High School', 'No Degree']


def prepare_education(industry_df):
    """Drop 'I prefer not to answer' Q4 rows and shorten two long answers.

    Returns a new frame. The replacement is not done in place because the
    input is itself a filtered slice, and in-place edits on a slice trigger
    pandas' SettingWithCopy warning.
    """
    answered = industry_df[industry_df['Q4'] != 'I prefer not to answer']
    # Renaming some values (applied frame-wide, as before).
    return answered.replace(EDUCATION_LONG_LABELS, EDUCATION_SHORT_LABELS)


def plot_education_counts(industry_df, title, palette_name):
    """Draw a count plot of education level (Q4) for one industry.

    Parameters
    ----------
    industry_df : pandas.DataFrame
        Output of ``prepare_education`` for a single industry.
    title : str
        Title shown above the chart.
    palette_name : str
        Name of the seaborn palette used to colour the bars.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the chart was drawn on.
    """
    # Most common education level first.
    level_order = industry_df['Q4'].value_counts().index
    plt.figure(figsize=(10, 4))
    ax = sns.countplot(data=industry_df, x='Q4', order=level_order,
                       palette=sns.color_palette(palette_name, 6))
    # Tilt the category names so neighbouring labels do not overlap.
    ax.tick_params(axis='x', labelrotation=30)
    ax.set_title(title, fontsize=20)
    ax.set_xlabel(' ')
    ax.set_ylabel('No. of people')
    return ax


#Removing entries of people who haven't answered Q4
df1 = prepare_education(df1)
df2 = prepare_education(df2)
df3 = prepare_education(df3)

#Technology
ax = plot_education_counts(df1, 'Education Background in Comp/Tech', 'flare')

#Education
ax = plot_education_counts(df2, 'Education Background in Education', 'summer_r')

#Finance
ax = plot_education_counts(df3, 'Education Background in Finance', 'winter_r')

* In Tech and Finance, the number of people with a Master's degree is the highest while those with a Bachelor's degree come second.
* Education is the only field where Doctoral degree-holders are the largest group, with Master's degree-holders a close second.

## Conclusion
* If your main goal is to keep your options open regarding industry choice, then go for skills related to Data Science/Data Analysis, as they are the most fluid roles.
* If your main goal is to have a steady source of income with room for growth, then aim for roles in the Computer/Technology industry, as that industry has many high-paying jobs.
* If you are thinking of learning a programming language, then go for Python, as it is currently the most used language.
* When it comes to educational background, a doctorate degree is preferred mostly in the teaching field while others prefer either a Bachelor's or Master's degree.

## Limitations
* The data is only limited to the people who have participated in the survey. Hence any conclusions formed are limited only to the participants of the survey.

## References
1. https://seaborn.pydata.org/generated/seaborn.countplot.html
2. https://seaborn.pydata.org/generated/seaborn.barplot.html
3. https://medium.com/swlh/visual-storytelling-with-seaborn-28dc548dbc4b