In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Exploratory Data Analysis
# Read the survey responses CSV into a DataFrame. The relative path presumably
# matches the Kaggle input directory layout — TODO confirm when running elsewhere.
survey_df = pd.read_csv('../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv')
# Preview the first five rows.
survey_df.head()

In [None]:
# Number of missing entries in each column (isna is an alias of isnull)
survey_df.isna().sum()

In [None]:
# Print the frame summary: column names, non-null counts, dtypes, memory usage
survey_df.info()

In [None]:
# Summary statistics; pandas chooses which columns to include based on dtype
survey_df.describe()

In [None]:
# Show every column label in the frame
survey_df.columns

In [None]:
# Show the data type of each column
survey_df.dtypes

In [None]:
# Drop the first row (presumably the question-text row that sits right below
# the CSV header — confirm against the raw file).
# The row is removed by index label rather than by position so that this cell
# is idempotent: read_csv gave the frame a default 0..n-1 index, so once
# label 0 is gone a re-run is a no-op instead of silently deleting the next
# respondent.
survey_df = survey_df.drop(index=0, errors='ignore')
# Preview only; there is no need to render the whole frame.
survey_df.head()

In [None]:
def plot_bar(title, col, palette):
    """Draw a horizontal count plot of one ``survey_df`` column.

    Bars follow the order of ``value_counts()`` for the column, i.e. they are
    sorted by descending frequency.

    Parameters
    ----------
    title : str
        Text used as the figure title.
    col : str
        Name of the ``survey_df`` column to tally.
    palette : str or sequence
        Palette forwarded to ``sns.countplot``.
    """
    fig, ax = plt.subplots(figsize=(10, 10))
    fig.suptitle(title, fontsize=20, font='Arial')
    # value_counts() already sorts by frequency, so its index is the bar order.
    bar_order = survey_df[col].value_counts().index
    sns.countplot(
        data=survey_df,
        y=col,
        order=bar_order,
        palette=palette,
        linewidth=3,
        ax=ax,
    )
    plt.show()

What is your age?

In [None]:
# Age distribution (Q1).
# Pass an explicit, sorted category order: without it countplot draws string
# categories in the order they first appear in the data, which scrambles the
# age axis. A plain string sort is sufficient as long as every bin starts with
# a two-digit age — TODO confirm against the actual Q1 values.
age_order = sorted(survey_df['Q1'].dropna().unique())
plt.figure(figsize=(10,10))
plt.title('Distribution of Age', fontsize=20)
sns.countplot(x='Q1', data=survey_df, order=age_order, palette='Set2')
plt.show()

Respondents aged 18 to 29 account for the largest share of responses
 in the 2021 Kaggle survey

Gender Distribution

In [None]:
# Donut chart of the gender answers (Q2).
title = 'Distribution of Gender'
fig, ax = plt.subplots(figsize=(16,8))
fig.suptitle(title, fontsize=20, font='Arial')
# Tally Q2 once and derive both the labels and the wedge sizes from the same
# Series so they can never get out of step.
gender_counts = survey_df['Q2'].value_counts()
labels  = list(gender_counts.index)
sizes   = gender_counts.values
# ax.pie needs exactly one explode offset per wedge. Keep the hand-tuned
# offsets for the first five wedges and reuse the last one for any extras, so
# the cell does not fail if Q2 has a different number of categories.
base_explode = (0.05, 0.4, 0.4, 0.5, 0.6)
explode = tuple(base_explode[min(i, len(base_explode) - 1)] for i in range(len(sizes)))
ax.pie(sizes, explode=explode, labels=labels, autopct='%1.0f%%', pctdistance=0.7, startangle=60, colors=['#f1c40f', '#e67e22', '#e74c3c', '#2ecc71', '#3498db'])
# A white disc over the centre turns the pie into a donut.
ax.add_artist(plt.Circle((0,0),0.4,fc='white'))
plt.show()

In which country do you currently reside?

In [None]:
# Five countries of residence (Q3) with the most respondents
countries = survey_df['Q3'].value_counts()
top_countries = countries.head(5)

plt.figure(figsize=(10, 10))
top_countries.plot.bar(color='#3498db')
plt.title('Top 5 Countries', fontsize=20)
plt.show()

[Q6] Years of experience writing code

In [None]:

# Horizontal count plot of the Q6 answers, bars ordered by frequency
plot_bar('Years of Experiences of Writing code', 'Q6', 'Set1')

Most people have 1-3 years of experience writing code

[Q4] What is the highest level of formal education that you have attained or plan to attain within the next 2 years?

In [None]:
# How many respondents left Q4 unanswered
survey_df['Q4'].isna().sum()

In [None]:
# Horizontal count plot of the Q4 answers, bars ordered by frequency.
# Title fixed: it previously read "Highes" and had an unclosed parenthesis.
plot_bar('Highest level of formal education (attained or planned within the next 2 years)',
        'Q4', "BrBG_r")

[Q5] Top 10 job titles most similar to Kagglers' current roles

In [None]:
# Ten most frequently reported job titles (Q5)
top_10_job = survey_df['Q5'].value_counts().head(10)

# Vertical bar chart with the x tick labels rotated by 60 degrees
plt.figure(figsize=(10, 10))
plt.title('Top 10 Job Titles', fontsize=20)
plt.xticks(rotation=60)
sns.barplot(x=top_10_job.index, y=top_10_job.values, palette='Set2')
plt.show()

[Q7] Programming languages used on a regular basis

In [None]:
#Multiple-choice questions are spread over one column per option,
#e.g. Q7_Part_1 ... Q7_Part_12 plus Q7_OTHER
#Option counting function
def counting(question_num, parts, df=None):
    """Tally how often each option of a multiple-choice question was picked.

    The columns ``Q<question_num>_Part_1`` .. ``Q<question_num>_Part_<parts>``
    and ``Q<question_num>_OTHER`` are inspected. For each column the most
    frequent non-null value is used as the category label and its number of
    occurrences as the value.

    Parameters
    ----------
    question_num : int or str
        Question number used to build the column names.
    parts : int
        Number of ``_Part_<n>`` columns to read (1-based, inclusive).
    df : pandas.DataFrame, optional
        Frame to read from. Defaults to the notebook-level ``survey_df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Category`` and ``Value``, sorted by ``Value`` descending.
        Columns that contain no answers at all are left out.

    Raises
    ------
    KeyError
        If one of the expected columns is missing from the frame.
    """
    if df is None:
        df = survey_df

    columns = [f'Q{question_num}_Part_{part}' for part in range(1, parts + 1)]
    columns.append(f'Q{question_num}_OTHER')

    categories = []
    values = []
    for column in columns:
        # Count once per column and reuse the result for label and value.
        counts = df[column].value_counts()
        if counts.empty:
            # Nobody picked this option; there is no label to report.
            continue
        categories.append(counts.index[0])
        # .iloc is explicit positional access; counts[0] would be treated as
        # a label lookup on this string-indexed Series.
        values.append(counts.iloc[0])

    combined_langdf = pd.DataFrame({'Category': categories, 'Value': values})
    return combined_langdf.sort_values(['Value'], ascending=False)

In [None]:
def plot_pie(title, df, colors):
    """Render a pie chart from a frame with ``Category`` and ``Value`` columns.

    Parameters
    ----------
    title : str
        Text used as the figure title.
    df : pandas.DataFrame
        Frame whose ``Value`` column gives the wedge sizes and whose
        ``Category`` column gives the wedge labels.
    colors : sequence
        Wedge colors forwarded to ``Axes.pie``.
    """
    fig, ax = plt.subplots(figsize=(10, 10))
    fig.suptitle(title, fontsize=20, font='Arial')
    # Percent labels are rounded to whole numbers and drawn inside the wedges.
    pie_options = dict(autopct='%1.0f%%', pctdistance=0.7, startangle=60)
    ax.pie(df['Value'], labels=df['Category'], colors=colors, **pie_options)
    plt.show()

In [None]:
#Plot bar chart
lang_df = counting(7, 12)
plot_pie('Programming Languages used on regular basis', lang_df, ['#f1c40f', '#e67e22', '#e74c3c', '#2ecc71', '#3498db'])

[Q8] What programming language would you recommend an aspiring data scientist to learn first?

In [None]:
# Horizontal count plot of the Q8 answers, bars ordered by frequency
plot_bar('What programming language would you recommend an aspiring data scientist to learn first?', 'Q8', 'BrBG_r')

According to the bar chart, Python is the language Kagglers most often recommend an aspiring data scientist learn first, followed by R and then SQL.

[Q9] Which of the following integrated development environments (IDEs) do you use on a regular basis?

In [None]:
# Tally Q9_Part_1..Q9_Part_12 plus Q9_OTHER, then chart the shares as a pie
ide_df = counting(9, 12)
plot_pie("Integrated Development Environments (IDE's) used on regular basis", ide_df, ['#f1c40f', '#e67e22', '#e74c3c', '#2ecc71', '#E23198'])

[Q10] Which of the following hosted notebook products do you use on a regular basis?

In [None]:
def plot_bar_multiple_choice(title, df, colors, horizontal=False):
    """Bar chart for a frame with ``Category`` and ``Value`` columns.

    Parameters
    ----------
    title : str
        Text used as the plot title.
    df : pandas.DataFrame
        Frame providing the ``Category`` labels and ``Value`` bar lengths.
    colors : str or sequence
        Palette forwarded to ``sns.barplot``.
    horizontal : bool, optional
        When truthy, values go on the x axis and categories on the y axis;
        otherwise categories are on x and values on y (the default).
    """
    plt.figure(figsize=(10, 10))
    plt.title(title, fontsize=20)
    # Swapping which column feeds x and which feeds y flips the bars.
    x_data, y_data = df['Category'], df['Value']
    if horizontal:
        x_data, y_data = y_data, x_data
    sns.barplot(x=x_data, y=y_data, palette=colors)
    plt.show()



In [None]:
# Tally Q10_Part_1..Q10_Part_12 plus Q10_OTHER, then draw horizontal bars
hosted_notebook_df = counting(10, 12)
plot_bar_multiple_choice('Hosted Notebook', hosted_notebook_df, 'Blues', True)

[Q11] What type of computing platform do you use most often for your data science projects?

In [None]:
# Horizontal count plot of the Q11 answers, bars ordered by frequency
title = 'Type of computing platform most often use for data science projects'
plot_bar(title, "Q11", "Blues_r")

[Q12] Which types of specialized hardware do you use on a regular basis?

In [None]:
# Tally Q12_Part_1..Q12_Part_5 plus Q12_OTHER, then draw horizontal bars
hardware_df = counting(12, 5)
plot_bar_multiple_choice('Hardware', hardware_df, 'Greens_r', True)

[Q13] Approximately how many times have you used a TPU (tensor processing unit)?

In [None]:
# Horizontal count plot of the Q13 answers, bars ordered by frequency
title = 'Times used a TPU (Tensor processing unit)'
plot_bar(title, "Q13", 'brg')

[Q14] What data visualization libraries or tools do you use on a regular basis?

In [None]:
# Tally Q14_Part_1..Q14_Part_11 plus Q14_OTHER, then draw horizontal bars
libraries_df = counting(14, 11)
plot_bar_multiple_choice('Most Libraries for Data Scientists', libraries_df, 'Purples_r', True)

[Q15] For how many years have you used machine learning methods?

In [None]:
# Horizontal count plot of the Q15 answers, bars ordered by frequency
title = 'Years of using machine learning methods'
plot_bar(title, "Q15", 'brg')