In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder

In [None]:
# Parse the whole file in one pass (low_memory=False) instead of in chunks.
data = pd.read_csv(
    '../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv',
    low_memory=False,
)
# The first row of the file is kept separately as `descriptions`;
# every remaining row is treated as a survey response.
descriptions = data.iloc[0]
data = data.iloc[1:]

In [None]:
# Display the first CSV row as a table: one line per column name with that row's value.
descriptions.reset_index()

In [None]:
# Preview the first rows of the responses (the first CSV row was split off above).
data.head()

In [None]:
def same_cols(q, columns=None):
    """Return the column names that belong to question ``q``.

    A column belongs to ``q`` when its name is exactly ``q`` or starts with
    ``q`` followed by an underscore (e.g. ``'Q7_Part_1'`` for ``'Q7'``).
    Plain substring matching is deliberately avoided: it would make ``'Q1'``
    also pick up ``'Q10'`` ... ``'Q19'``.

    Parameters
    ----------
    q : str
        Question identifier such as ``'Q7'`` or ``'Q27_A'``.
    columns : iterable of str, optional
        Column names to search. Defaults to the index of the module-level
        ``descriptions`` series.

    Returns
    -------
    list of str
        Matching column names, in their original order.
    """
    if columns is None:
        columns = descriptions.index
    prefix = q + '_'
    return [k for k in columns if k == q or k.startswith(prefix)]

def plot_multi_cols(qn):
    """Draw a horizontal bar chart of answer counts for a multi-column question.

    The columns returned by ``same_cols(qn)`` are one-hot encoded; each
    indicator column is labelled with the text after its last underscore and
    its non-zero entries are counted. Bars are sorted by ascending count and
    annotated with their width.
    """
    dummies = pd.get_dummies(data[same_cols(qn)])
    # Indicator names end in the answer text; keep only that trailing part.
    labels = [name.split('_')[-1].strip() for name in dummies.columns]
    counts = np.count_nonzero(dummies, axis=0)

    # Column 0 holds the counts, column 1 the labels.
    counts_frame = pd.DataFrame(counts)
    labels_frame = pd.DataFrame(labels, columns=[1])
    table = pd.concat([counts_frame, labels_frame], axis=1)
    table = table.fillna(0).sort_values(0)

    ax = sns.barplot(x=table[0], y=table[1])
    for bar in ax.patches:
        width = bar.get_width()
        # Place the value just past the end of the bar.
        ax.annotate(f'\n{width}', (width + 0.5, bar.get_y() + 0.5), color='black')
        
def plot_one_col(q):
    """Draw a horizontal count plot for the single column ``q`` of ``data``.

    Answers are ordered from most to least frequent and every bar is
    annotated with its width.
    """
    answer_order = data[q].value_counts().index
    edge_colors = sns.color_palette("dark", 3)
    ax = sns.countplot(
        y=q,
        data=data,
        order=answer_order,
        linewidth=1,
        edgecolor=edge_colors,
    )
    for bar in ax.patches:
        width = bar.get_width()
        # Place the value just past the end of the bar.
        ax.annotate(f'\n{width}', (width + 0.5, bar.get_y() + 0.5), color='black')

#### Q1
What is your age (# years)?

In [None]:
# Count plot of the single 'Q1' column, answers ordered from most to least frequent.
plot_one_col('Q1')

#### Q2
What is your gender? - Selected Choice

In [None]:
# Count plot of the single 'Q2' column, answers ordered from most to least frequent.
plot_one_col('Q2')

#### Q3
In which country do you currently reside?

In [None]:
# Count plot of the single 'Q3' column, answers ordered from most to least frequent.
plot_one_col('Q3')

#### Q4
What is the highest level of formal education that you have attained or plan to attain within the next 2 years?

In [None]:
# Count plot of the single 'Q4' column, answers ordered from most to least frequent.
plot_one_col('Q4')

#### Q5
Select the title most similar to your current role (or most recent title if retired): - Selected Choice

In [None]:
# Count plot of the single 'Q5' column, answers ordered from most to least frequent.
plot_one_col('Q5')

#### Q6
For how many years have you been writing code and/or programming?

In [None]:
# Count plot of the single 'Q6' column, answers ordered from most to least frequent.
plot_one_col('Q6')

#### Q7
What programming languages do you use on a regular basis? (Select all that apply)

In [None]:
# Bar chart of per-answer counts over the columns same_cols('Q7') selects, smallest count first.
plot_multi_cols('Q7')

#### Q8
What programming language would you recommend an aspiring data scientist to learn first? - Selected Choice

In [None]:
# Count plot of the single 'Q8' column, answers ordered from most to least frequent.
plot_one_col('Q8')

#### Q9
Which of the following integrated development environments (IDE's) do you use on a regular basis?  (Select all that apply) - Selected Choice

In [None]:
# Bar chart of per-answer counts over the columns same_cols('Q9') selects, smallest count first.
plot_multi_cols('Q9')

#### Q10
Which of the following hosted notebook products do you use on a regular basis?  (Select all that apply) - Selected Choice

In [None]:
# Bar chart of per-answer counts over the columns same_cols('Q10') selects, smallest count first.
plot_multi_cols('Q10')

#### Q11
What type of computing platform do you use most often for your data science projects? - Selected Choice

In [None]:
# Count plot of the single 'Q11' column, answers ordered from most to least frequent.
plot_one_col('Q11')

#### Q12
Which types of specialized hardware do you use on a regular basis?  (Select all that apply) - Selected Choice

In [None]:
# Bar chart of per-answer counts over the columns same_cols('Q12') selects, smallest count first.
plot_multi_cols('Q12')

#### Q13
Approximately how many times have you used a TPU (tensor processing unit)?

In [None]:
# Count plot of the single 'Q13' column, answers ordered from most to least frequent.
plot_one_col('Q13')

#### Q14
What data visualization libraries or tools do you use on a regular basis?  (Select all that apply) - Selected Choice

In [None]:
# Bar chart of per-answer counts over the columns same_cols('Q14') selects, smallest count first.
plot_multi_cols('Q14')

#### Q15
For how many years have you used machine learning methods?

In [None]:
# Count plot of the single 'Q15' column, answers ordered from most to least frequent.
plot_one_col('Q15')

#### Q16
Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Selected Choice

In [None]:
# Bar chart of per-answer counts over the columns same_cols('Q16') selects, smallest count first.
plot_multi_cols('Q16')

#### Q17
Which of the following ML algorithms do you use on a regular basis? (Select all that apply): - Selected Choice

In [None]:
# Bar chart of per-answer counts over the columns same_cols('Q17') selects, smallest count first.
plot_multi_cols('Q17')

#### Q18
Which categories of computer vision methods do you use on a regular basis?  (Select all that apply) - Selected Choice

In [None]:
# Bar chart of per-answer counts over the columns same_cols('Q18') selects, smallest count first.
plot_multi_cols('Q18')

#### Q19
Which of the following natural language processing (NLP) methods do you use on a regular basis?  (Select all that apply) - Selected Choice

In [None]:
# Bar chart of per-answer counts over the columns same_cols('Q19') selects, smallest count first.
plot_multi_cols('Q19')

#### Q20
In what industry is your current employer/contract (or your most recent employer if retired)? - Selected Choice

In [None]:
# Count plot of the single 'Q20' column, answers ordered from most to least frequent.
plot_one_col('Q20')

#### Q21
What is the size of the company where you are employed?

In [None]:
# Count plot of the single 'Q21' column, answers ordered from most to least frequent.
plot_one_col('Q21')

#### Q22
Approximately how many individuals are responsible for data science workloads at your place of business?

In [None]:
# Count plot of the single 'Q22' column, answers ordered from most to least frequent.
plot_one_col('Q22')

#### Q23
Does your current employer incorporate machine learning methods into their business?

In [None]:
# Count plot of the single 'Q23' column, answers ordered from most to least frequent.
plot_one_col('Q23')

#### Q24
Select any activities that make up an important part of your role at work: (Select all that apply) - Selected Choice

In [None]:
# Bar chart of per-answer counts over the columns same_cols('Q24') selects, smallest count first.
plot_multi_cols('Q24')

#### Q25
What is your current yearly compensation (approximate $USD)?

In [None]:
# Count plot of the single 'Q25' column, answers ordered from most to least frequent.
plot_one_col('Q25')

#### Q26
Approximately how much money have you (or your team) spent on machine learning and/or cloud computing services at home (or at work) in the past 5 years (approximate $USD)?

In [None]:
# Count plot of the single 'Q26' column, answers ordered from most to least frequent.
plot_one_col('Q26')

#### Q27_A
Which of the following cloud computing platforms do you use on a regular basis? (Select all that apply) - Selected Choice

In [None]:
# Bar chart of per-answer counts over the columns same_cols('Q27_A') selects, smallest count first.
plot_multi_cols('Q27_A')

#### Q27_B
Which of the following cloud computing platforms do you hope to become more familiar with in the next 2 years? - Selected Choice

In [None]:
# Bar chart of per-answer counts over the columns same_cols('Q27_B') selects, smallest count first.
plot_multi_cols('Q27_B')

#### Q28
Of the cloud platforms that you are familiar with, which has the best developer experience (most enjoyable to use)? - Selected Choice

In [None]:
# Count plot of the single 'Q28' column, answers ordered from most to least frequent.
plot_one_col('Q28')

#### Q29_A
Do you use any of the following cloud computing products on a regular basis? (Select all that apply) - Selected Choice

In [None]:
# Bar chart of per-answer counts over the columns same_cols('Q29_A') selects, smallest count first.
plot_multi_cols('Q29_A')

#### Q29_B
In the next 2 years, do you hope to become more familiar with any of these specific cloud computing products? (Select all that apply)

In [None]:
# Bar chart of per-answer counts over the columns same_cols('Q29_B') selects, smallest count first.
plot_multi_cols('Q29_B')

#### Q30_A
Do you use any of the following data storage products on a regular basis? (Select all that apply) - Selected Choice

In [None]:
# Bar chart of per-answer counts over the columns same_cols('Q30_A') selects, smallest count first.
plot_multi_cols('Q30_A')

#### Q30_B
In the next 2 years, do you hope to become more familiar with any of these specific data storage products? (Select all that apply)

###### No answers found

#### Q31_A
Do you use any of the following managed machine learning products on a regular basis? (Select all that apply)

In [None]:
# Bar chart of per-answer counts over the columns same_cols('Q31_A') selects, smallest count first.
plot_multi_cols('Q31_A')

#### Q31_B
In the next 2 years, do you hope to become more familiar with any of these managed machine learning products? (Select all that apply)

In [None]:
# Bar chart of per-answer counts over the columns same_cols('Q31_B') selects, smallest count first.
plot_multi_cols('Q31_B')

#### Q32_A
Which of the following big data products (relational databases, data warehouses, data lakes, or similar) do you use on a regular basis? (Select all that apply)

In [None]:
# Bar chart of per-answer counts over the columns same_cols('Q32_A') selects, smallest count first.
plot_multi_cols('Q32_A')

#### Q32_B
Which of the following big data products (relational databases, data warehouses, data lakes, or similar) do you hope to become more familiar with in the next 2 years?

In [None]:
# Bar chart of per-answer counts over the columns same_cols('Q32_B') selects, smallest count first.
plot_multi_cols('Q32_B')

#### Q33
Which of the following big data products (relational database, data warehouse, data lake, or similar) do you use most often?

In [None]:
# Count plot of the single 'Q33' column, answers ordered from most to least frequent.
plot_one_col('Q33')

#### Q34_A
Which of the following business intelligence tools do you use on a regular basis? (Select all that apply)

In [None]:
# Bar chart of per-answer counts over the columns same_cols('Q34_A') selects, smallest count first.
plot_multi_cols('Q34_A')

#### Q34_B
Which of the following business intelligence tools do you hope to become more familiar with in the next 2 years?

In [None]:
# Bar chart of per-answer counts over the columns same_cols('Q34_B') selects, smallest count first.
plot_multi_cols('Q34_B')

#### Q35
Which of the following business intelligence tools do you use most often?

In [None]:
# Count plot of the single 'Q35' column, answers ordered from most to least frequent.
plot_one_col('Q35')

#### Q36_A
Do you use any automated machine learning tools (or partial AutoML tools) on a regular basis?  (Select all that apply)

In [None]:
# Bar chart of per-answer counts over the columns same_cols('Q36_A') selects, smallest count first.
plot_multi_cols('Q36_A')

#### Q36_B
Which categories of automated machine learning tools (or partial AutoML tools) do you hope to become more familiar with in the next 2 years?

In [None]:
# Bar chart of per-answer counts over the columns same_cols('Q36_B') selects, smallest count first.
plot_multi_cols('Q36_B')

#### Q37_A
Which of the following automated machine learning tools (or partial AutoML tools) do you use on a regular basis?

In [None]:
# Bar chart of per-answer counts over the columns same_cols('Q37_A') selects, smallest count first.
plot_multi_cols('Q37_A')

#### Q37_B
Which specific automated machine learning tools (or partial AutoML tools) do you hope to become more familiar with in the next 2 years?

In [None]:
# Bar chart of per-answer counts over the columns same_cols('Q37_B') selects, smallest count first.
plot_multi_cols('Q37_B')

#### Q38_A
Do you use any tools to help manage machine learning experiments? (Select all that apply) 

In [None]:
# Bar chart of per-answer counts over the columns same_cols('Q38_A') selects, smallest count first.
plot_multi_cols('Q38_A')

#### Q38_B
In the next 2 years, do you hope to become more familiar with any of these tools for managing ML experiments?

In [None]:
# Bar chart of per-answer counts over the columns same_cols('Q38_B') selects, smallest count first.
plot_multi_cols('Q38_B')

#### Q39
Where do you publicly share your data analysis or machine learning applications? (Select all that apply)

In [None]:
# Bar chart of per-answer counts over the columns same_cols('Q39') selects, smallest count first.
plot_multi_cols('Q39')

#### Q40
On which platforms have you begun or completed data science courses? (Select all that apply) - Selected Choice

In [None]:
# Bar chart of per-answer counts over the columns same_cols('Q40') selects, smallest count first.
plot_multi_cols('Q40')

#### Q41
What is the primary tool that you use at work or school to analyze data? (Include text response)

In [None]:
# Bar chart of per-answer counts over the columns same_cols('Q41') selects, smallest count first.
# NOTE(review): 'Q41' is also indexed as a single column (data[['Q6','Q41']]) in a later
# pivot cell, so plot_one_col may be the more consistent helper here — confirm.
plot_multi_cols('Q41')

#### Q42
Who/what are your favorite media sources that report on data science topics? (Select all that apply)

In [None]:
# Bar chart of per-answer counts over the columns same_cols('Q42') selects, smallest count first.
plot_multi_cols('Q42')

# Part 2

#### Age (Q1) with Title (Q5)

A simple question: within each age group, which job titles are the most frequent?

In [None]:
# Respondent counts per age group (Q1), split by title (Q5).
fig, ax = plt.subplots(figsize=(6, 10))
sns.countplot(data=data, y='Q1', hue='Q5', ax=ax)
# Anchor the legend's upper-left corner just outside the right edge of the axes.
ax.legend(loc='upper left', bbox_to_anchor=(1.05, 1), ncol=1)
ax.set_xlabel("Frequent Title")
ax.set_ylabel("Age group")

#### Education (Q4) with Compensation (Q25)

In general, as the figure shows, the higher the education, the higher the salary. In the $0-999 bracket most respondents hold a bachelor's degree, while the master's degree is the most frequent in high-salary positions.

In [None]:
# Respondent counts per compensation bracket (Q25), split by education (Q4).
fig, ax = plt.subplots(figsize=(6, 10))
sns.countplot(data=data, y='Q25', hue='Q4', ax=ax)
# Anchor the legend's upper-left corner just outside the right edge of the axes.
ax.legend(loc='upper left', bbox_to_anchor=(1.05, 1), ncol=1)
ax.set_xlabel("Count education")
ax.set_ylabel("Compensation")

#### Gender (Q2) with Education (Q4)

Throughout the world, genders other than men have more limited access to education, and to higher education in particular. As a result, fewer women and people of other genders show up in high positions.

In [None]:
# Respondent counts per gender (Q2), split by education (Q4).
ax = sns.countplot(data=data, y='Q2', hue='Q4')
# Anchor the legend's upper-left corner just outside the right edge of the axes.
ax.legend(loc='upper left', bbox_to_anchor=(1.05, 1), ncol=1)
ax.set_xlabel("# of high education")
ax.set_ylabel("Gender")

#### Gender (Q2) with Compensation (Q25)

There has always been a wage gap between men, women, and others, with men receiving the highest salaries. This survey also shows that men more often report high salaries.

In [None]:
# Respondent counts per gender (Q2), split by compensation bracket (Q25).
ax = sns.countplot(data=data, y='Q2', hue='Q25')
# Two-column legend, anchored just outside the right edge of the axes.
ax.legend(loc='upper left', bbox_to_anchor=(1.05, 1), ncol=2)
ax.set_xlabel("# of high compensation")
ax.set_ylabel("Gender")

#### Number of employees (Q21) with Number of responsible individuals for AI (Q22)

Most small companies do not have dedicated AI employees, while larger companies have the highest number of individuals responsible for AI tasks.

In [None]:
# Respondent counts per data-science team size (Q22), split by company size (Q21).
ax = sns.countplot(data=data, y='Q22', hue='Q21')
# Anchor the legend's upper-left corner just outside the right edge of the axes.
ax.legend(loc='upper left', bbox_to_anchor=(1.05, 1), ncol=1)
ax.set_xlabel("# employee in a company")
ax.set_ylabel("Number of Individuals Responsilbe for AI")

#### Enjoyable cloud platform (Q28) with Title (Q5)

In [None]:
# Respondent counts per title (Q5), split by preferred cloud platform (Q28).
fig, ax = plt.subplots(figsize=(6, 10))
sns.countplot(data=data, y='Q5', hue='Q28', ax=ax)
# Anchor the legend's upper-left corner just outside the right edge of the axes.
ax.legend(loc='upper left', bbox_to_anchor=(1.05, 1), ncol=1)
ax.set_xlabel("# Cloud platforms (enjoyable)")
ax.set_ylabel("Title")

In [None]:
# Cross-tabulate title (Q5, rows) against cloud platform (Q28, columns):
# count each pair, pivot to a grid, fill absent pairs with 0 and shade by value.
(
    data[['Q5', 'Q28']]
    .groupby(['Q5', 'Q28'])
    .size()
    .reset_index()
    .pivot(index='Q5', columns='Q28', values=0)
    .fillna(0)
    .style.background_gradient()
)

#### Experience (Q6) with Most often used big data product (Q33)

Here we want to answer: which big data products are used most at each level of experience?

In [None]:
# Cross-tabulate big data product (Q33, rows) against coding experience (Q6, columns):
# count each pair, pivot to a grid, fill absent pairs with 0 and shade by value.
(
    data[['Q6', 'Q33']]
    .groupby(['Q6', 'Q33'])
    .size()
    .reset_index()
    .pivot(index='Q33', columns='Q6', values=0)
    .fillna(0)
    .style.background_gradient()
)

#### Experience (Q6) with primary tool to analyze data (Q41)


In [None]:
# Cross-tabulate primary analysis tool (Q41, rows) against coding experience (Q6, columns):
# count each pair, pivot to a grid, fill absent pairs with 0 and shade by value.
(
    data[['Q6', 'Q41']]
    .groupby(['Q6', 'Q41'])
    .size()
    .reset_index()
    .pivot(index='Q41', columns='Q6', values=0)
    .fillna(0)
    .style.background_gradient()
)

# Extracting Information using PCA

Principal component analysis (PCA) is a matrix decomposition used to reduce the dimensionality of a dataset.

In [None]:
# Take the first five questions and one-hot encode them as the PCA input.
dsr = data.loc[:, ['Q1', 'Q2', 'Q3', 'Q4', 'Q5']]
data_dump = pd.get_dummies(dsr)
data_dump.shape

In [None]:
# Project the one-hot encoded Q1-Q5 answers onto their first two principal components.
# random_state is fixed so the projection is reproducible even when scikit-learn
# picks a randomized SVD solver for a matrix of this size.
pca = PCA(n_components=2, random_state=0)
points = pca.fit_transform(np.array(data_dump))

In [None]:
# Colour the Q1-Q5 PCA projection by the answers to Q2 and to Q4 (one figure each).
for qn in ['Q2','Q4']:
    le = LabelEncoder()
    # Stringify first so a missing answer (NaN) becomes the class 'nan' instead of
    # making LabelEncoder fail on a mix of str and float values.
    lte = le.fit_transform([str(k) for k in data[qn]])
    # le.classes_[i] is the answer encoded as integer i, so taking the labels from it
    # keeps them aligned with the colours; data[qn].unique() is in order of appearance
    # and would pair labels with the wrong handles.
    lgd = list(le.classes_)
    plt.figure(figsize=(10,4))
    result = plt.scatter(points[:,0], points[:,1], c=lte, marker='*')
    plt.title(descriptions[qn])
    # num=None yields one handle per distinct encoded value, in ascending order.
    plt.legend(handles=result.legend_elements(num=None)[0], labels=lgd)

# Applying PCA on all answers

In [None]:
# One-hot encode every column of the responses, not just Q1-Q5.
# NOTE(review): if the file contains free-form or otherwise high-cardinality columns,
# each distinct value becomes its own indicator column — check the shape shown below.
data_dump_all=pd.get_dummies(data)
data_dump_all.shape

In [None]:
# Project the one-hot encoding of all answers onto its first two principal components.
# random_state is fixed so the projection is reproducible even when scikit-learn
# picks a randomized SVD solver for a matrix of this size.
pca2 = PCA(n_components=2, random_state=0)
points_all = pca2.fit_transform(np.array(data_dump_all))

In [None]:
# Colour the all-answers PCA projection by the answer to Q25.
qn = 'Q25'
le = LabelEncoder()
# Stringify so missing answers (NaN) are encoded as the class 'nan'.
lte = le.fit_transform([str(k) for k in np.array(data[qn])])
# le.classes_[i] is the answer encoded as integer i, so taking the labels from it
# keeps them aligned with the colours; data[qn].unique() is in order of appearance
# and would pair labels with the wrong handles.
lgd = list(le.classes_)
plt.figure(figsize=(10,8))
rel = plt.scatter(points_all[:,0], points_all[:,1], c=lte, marker='*')
plt.title(descriptions[qn])
# num=None yields one handle per distinct encoded value; the default "auto" switches
# to a tick locator once there are many classes and then returns fewer handles than labels.
plt.legend(handles=rel.legend_elements(num=None)[0], labels=lgd)

In [None]:
# Colour the all-answers PCA projection by the answer to Q21.
qn = 'Q21'
le = LabelEncoder()
# Stringify so missing answers (NaN) are encoded as the class 'nan'.
lte = le.fit_transform([str(k) for k in np.array(data[qn])])
# le.classes_[i] is the answer encoded as integer i, so taking the labels from it
# keeps them aligned with the colours; data[qn].unique() is in order of appearance
# and would pair labels with the wrong handles.
lgd = list(le.classes_)
plt.figure(figsize=(10,8))
rel = plt.scatter(points_all[:,0], points_all[:,1], c=lte, marker='*')
plt.title(descriptions[qn])
# num=None yields one handle per distinct encoded value, in ascending order.
plt.legend(handles=rel.legend_elements(num=None)[0], labels=lgd)

# Thank You for Reading!





In [None]:
# Write the Q1-Q5 subset (dsr) to op.csv in the working directory, without the index column.
dsr.to_csv("op.csv",index=False)