In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
import seaborn as sns
import matplotlib.patches as mpatches
# Use seaborn's "whitegrid" theme for every figure drawn in this notebook.
sns.set_style("whitegrid")

**Let's have a look at the dataset**

In [None]:
# Read the raw survey export and report its size before any reshaping.
data = pd.read_csv(
    '/kaggle/input/kaggle-survey-2020/kaggle_survey_2020_responses.csv',
    low_memory=False,
)
print(f"The dimensions of the dataset: {data.shape}")

# Promote the first row to the column labels (later cells look columns up
# through these labels), then remove that row from the responses.
data.columns = data.iloc[0]
data = data.drop(index=data.index[0])
data.head()

In [None]:
# Keep the column labels in two forms: a one-column frame and a plain list
# that later cells index by position (e.g. questions[2]).
question_df = pd.DataFrame(data.columns, columns=['questions'])
questions = data.columns.tolist()
print(questions[:15])

**AGE GROUPS**

In [None]:
# Respondents per age bracket, ordered by bracket label.
age_groups = data[data.columns[1]].value_counts().sort_index()

# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally (the first positional slot is `data`).
sns.barplot(x=age_groups, y=age_groups.index)
plt.title("Respondents by age group")
plt.xlabel("Number of respondents")
plt.ylabel("Age group")

# The notebook counts the 22-24 and 25-29 brackets as "millennials".
mill = age_groups["22-24"] + age_groups["25-29"]
mill_percentage = (mill/age_groups.sum())*100
print(f"Millennials in the DS community : {mill}")
print(f"% of Millennials in the DS community : {mill_percentage}")
plt.show()

* 39% of the people who answered the survey were millennials.

**GENDER**

In [None]:
# Respondents per gender answer, most common first.
gender = data[data.columns[2]].value_counts()

# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally (the first positional slot is `data`).
sns.barplot(x=gender, y=gender.index)
plt.title("Respondents by gender")
plt.xlabel("Number of respondents")
plt.ylabel("Gender")

man = gender['Man']
woman = gender['Woman']
# Surplus of men expressed relative to the number of women.
diff_p = ((man-woman)/woman)*100
print(f"Men are more than women in this field by {diff_p}%")
plt.show()

* Men clearly dominate the DS community in numbers: among respondents there are about 307% more men than women.

**Gender Based Age Analysis**

In [None]:
# Split the responses by the answer given in the third column
# ('Man' / 'Woman'); respondents with any other answer are in neither frame.
Male = data.loc[data[questions[2]] == 'Man']
Female = data.loc[data[questions[2]] == 'Woman']

In [None]:
# Back-to-back bar chart: men extend to the right of zero, women to the left.
fig, ax = plt.subplots()

m_age_groups = Male[Male.columns[1]].value_counts().sort_index()
f_age_groups = Female[Female.columns[1]].value_counts().sort_index()

# Align both series on the same set of age brackets so that a bracket missing
# for one gender becomes a zero-length bar instead of shifting the other bars.
age_order = sorted(set(m_age_groups.index) | set(f_age_groups.index))
m_age_groups = m_age_groups.reindex(age_order, fill_value=0)
f_age_groups = f_age_groups.reindex(age_order, fill_value=0)

# x/y are passed by keyword (required by seaborn >= 0.12) and both layers are
# drawn on the axes created above.
sns.barplot(x=m_age_groups, y=m_age_groups.index, color="cyan", ax=ax)
sns.barplot(x=-1 * f_age_groups, y=f_age_groups.index, color="salmon", ax=ax)

# Female counts were negated only for layout, so show magnitudes on the axis.
# Tick positions are pinned before relabelling so labels cannot drift away
# from their ticks; the view limits are restored because set_xticks may
# widen them.
ticks =  ax.get_xticks()
xlim = ax.get_xlim()
ax.set_xticks(ticks)
ax.set_xlim(xlim)
ax.set_xticklabels([int(abs(tick)) for tick in ticks])
ax.set_xlabel("Number of respondents")
ax.set_ylabel("Age group")
ax.set_title("Age distribution by gender")

# Legend handles: salmon bars are women, cyan bars are men.
red_patch = mpatches.Patch(color='salmon', label='Female')
black_patch = mpatches.Patch(color='cyan', label='Male')
ax.legend(handles=[red_patch, black_patch])

plt.tight_layout()
plt.show()

**COUNTRY of ORIGIN**

In [None]:
# Taller default figure: there is one bar per country.
plt.rcParams['figure.figsize']=10,10

# Respondents per country, most common first.
country = data[data.columns[3]].value_counts()
perce = (country['India']/country.sum())*100
print(f"The percentage of Indians in the DS Community {perce}")

# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally (the first positional slot is `data`).
sns.barplot(x=country, y=country.index)
plt.title("Respondents by country")
plt.xlabel("Number of respondents")
plt.ylabel("Country")
plt.show()

* Close to 30% of all the respondents who answered the survey are Indians. Indians definitely win the numbers game.
* It's also interesting to see that Nigeria is very close to China, Germany and the UK, which is surprising considering that 40 percent of Nigeria's population lives below the poverty line. DS education seems to be growing there.

 **QUALIFICATION**

In [None]:
# Respondents per education level, most common first.
ITEM = data[data.columns[4]].value_counts()

# Combined share of the two most common levels. value_counts() sorts in
# descending order, so these are the first two entries; the message below
# assumes they are the master's and bachelor's degrees.
perc = (ITEM.iloc[:2].sum()/ITEM.sum())*100
print(f"Masters and Bachelor graduates constitute {perc}% of the total demographic.")

# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally (the first positional slot is `data`).
sns.barplot(x=ITEM, y=ITEM.index)
plt.title("Respondents by highest qualification")
plt.xlabel("Number of respondents")
plt.ylabel("Qualification")
plt.show()

* 80.32% of respondents hold a master's or bachelor's degree, which is expected.

**Programming Languages Used**

In [None]:
plt.rcParams['figure.figsize']=10,6

# Select every column whose label contains the question text. `like=` does a
# literal substring match, so the trailing "?" is not interpreted as a regex
# quantifier.
prog_lang = data.filter(like="What programming languages do you use on a regular basis?")

# describe() on these columns yields, per column, the number of non-null
# answers ('count') and the most frequent value ('top'). Rows are picked by
# label rather than by position.
desc = prog_lang.describe()
prog_count = desc.loc['count'].values
prog_names = desc.loc['top'].values

prog_df = pd.DataFrame({"Language":prog_names,"Count":prog_count})
# describe() returns object dtype here; store the counts as integers so that
# sorting and plotting treat them as numbers.
prog_df["Count"] = prog_df["Count"].astype(int)
prog_df = prog_df.set_index('Language').sort_values(by="Count", ascending=False)

# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally (the first positional slot is `data`).
sns.barplot(x=prog_df.Count, y=prog_df.index)
plt.title('What programming languages do you use on a regular basis?')
plt.show()

* It's not surprising to see Python as the most used language, as most of the tools and frameworks are written for Python. SQL is used extensively for data storage and retrieval, so I guess that is why it is also widely used. R, C, C++ and Java have almost the same usage.

**Platform used to Learn Data Science**

In [None]:
plt.rcParams['figure.figsize']=10,6

# Select every column whose label contains the question text. `like=` does a
# literal substring match, so the trailing "?" is not interpreted as a regex
# quantifier.
item = data.filter(like="On which platforms have you begun or completed data science courses?")

# describe() on these columns yields, per column, the number of non-null
# answers ('count') and the most frequent value ('top'). Rows are picked by
# label rather than by position.
desc = item.describe()
item_count = desc.loc['count'].values
item_names = desc.loc['top'].values

item_df = pd.DataFrame({"Platform":item_names,"Count":item_count})
# describe() returns object dtype here; store the counts as integers so that
# sorting and plotting treat them as numbers.
item_df["Count"] = item_df["Count"].astype(int)
item_df = item_df.set_index('Platform').sort_values(by="Count", ascending=False)

# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally (the first positional slot is `data`).
sns.barplot(x=item_df.Count, y=item_df.index)
plt.title('On which platforms have you begun or completed data science courses?')
plt.show()

* The popularity of Coursera is very much expected, as most people start their career through Andrew Ng's course.