# Importing required libraries for getting useful insights

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import matplotlib.patches as patches
from pylab import text
import matplotlib.image as mpimg
from matplotlib.offsetbox import AnnotationBbox, OffsetImage
import nltk
import plotly
from nltk import corpus
import warnings
warnings.filterwarnings("ignore")


%matplotlib inline
# Global plot theme: monospace font on a white grid. sns.set() runs first
# because it resets matplotlib's rcParams; the font family is re-applied after.
sns.set(font="'Source Code Pro', monospace")
sns.set_style("whitegrid")
plt.rcParams["font.family"] = "'Source Code Pro', monospace"

# Two hand-picked hex palettes used by the charts below.
treasure_colors = [
    "#703728",
    "#c86b25",
    "#dc9555",
    "#fed56f",
    "#c89a37",
]
pirate_colors = [
    "#010307",
    "#395461",
    "#449FAF",
    "#B1F4FC",
    "#F4D499",
    "#835211",
]

# Preview each palette as a strip of swatches.
for palette in (treasure_colors, pirate_colors):
    sns.palplot(sns.color_palette(palette))

# Combined palette: every treasure colour followed by the pirate colours,
# skipping the first pirate entry.
all_colors = [*treasure_colors, *pirate_colors[1:]]
import plotly

# Reading the responses file

In [None]:
# Load the raw survey responses from the Kaggle input directory.
# NOTE(review): this export appears to carry the question wording as its first
# data row, so per-column counts may include one non-answer value — confirm.
data= pd.read_csv('../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv')


## EDA of survey responses

In [None]:
# Overview of the frame: column names, dtypes and non-null counts.
data.info()

In [None]:
# Per-column summary statistics as produced by pandas' describe().
data.describe()

In [None]:
# (rows, columns) of the frame, followed by its total number of cells.
data.shape, data.size

In [None]:
# Remove fully duplicated rows, modifying `data` in place.
data.drop_duplicates(inplace= True)

In [None]:
# Number of missing values in each column.
data.isnull().sum()

## Below are some basic problem statements that make the survey data understandable

## **Q1) Which age-group is more involved in data science?**

In [None]:
# Respondents per age bracket (Q1); value_counts() puts the most common
# bracket first, so bars run from most to least frequent.
age_counts = data['Q1'].value_counts()
age_counts.plot.bar(color='pink', width=0.5)


## **Q2) How many years of programming background is required for getting into the data science field?**

In [None]:
# Pie chart of respondents by years of coding experience (Q6).
#
# value_counts() orders answers by frequency, so wedge labels are taken from
# the counted index itself rather than from a hand-written list: a fixed list
# only lines up with the wedges if it happens to match that frequency order.
question_text = 'For how many years have you been writing code and/or programming?'
experience_counts = data['Q6'].value_counts()

# The question wording shows up as a value in this column; it is not an
# answer, so it is removed when present (and silently skipped when absent).
experience_counts = experience_counts.drop(labels=[question_text], errors='ignore')

labels = list(experience_counts.index)

# Pull the smallest (last) wedge slightly out of the pie. The list is sized
# from the data so the chart keeps working if the set of answers changes.
explode = [0.0] * len(experience_counts)
if explode:
    explode[-1] = 0.1

plt.pie(
    experience_counts,
    labels=labels,
    autopct='%1.1f%%',
    wedgeprops={'edgecolor': 'white', 'linewidth': 2},
    rotatelabels=True,  # boolean flag: rotate each label to its wedge angle
    explode=explode,
    radius=1.5,
    shadow=True,
)
plt.show()

## **Q3) What are the highest pay scales in data analysis?**

In [None]:
# Wording of the yearly-compensation survey question.
q = "What is your current yearly compensation (approximate $USD)?"

# Compensation brackets listed from lowest to highest pay.
order = [
    "$0-999",
    "1,000-1,999",
    "2,000-2,999",
    "3,000-3,999",
    "4,000-4,999",
    "5,000-7,499",
    "7,500-9,999",
    "10,000-14,999",
    "15,000-19,999",
    "20,000-24,999",
    "25,000-29,999",
    "30,000-39,999",
    "40,000-49,999",
    "50,000-59,999",
    "60,000-69,999",
    "70,000-79,999",
    "80,000-89,999",
    "90,000-99,999",
    "100,000-124,999",
    "125,000-149,999",
    "150,000-199,999",
    "200,000-249,999",
    "250,000-299,999",
    "300,000-500,000",
    "> $500,000",
]

In [None]:
# Horizontal count plot of compensation brackets (Q24), lowest bracket on top.
plt.figure(figsize=(16, 12))
# NOTE(review): figure.dpi is changed after the figure above already exists,
# so presumably it only takes effect for figures created later — confirm intent.
plt.rcParams['figure.dpi'] = 360
ax = sns.countplot(
    y=data['Q24'],
    order=order,
    orient="v",
    palette="YlOrBr_r",
    saturation=1,
)

accent = pirate_colors[1]

# Outline a rectangle around the rows between y=15.5 and y=24.5:
# top edge, bottom edge, right edge, then left edge.
box_edges = [
    ([0, 650], [15.5, 15.5]),
    ([0, 650], [24.5, 24.5]),
    ([650, 650], [15.5, 24.5]),
    ([3, 3], [15.5, 24.5]),
]
for xs, ys in box_edges:
    plt.plot(xs, ys, lw=2, color=accent)

# Curved arrow on the right-hand side running from the low-pay rows to the
# high-pay rows, with a caption at each end.
style = "Simple, tail_width=5, head_width=16, head_length=23"
kw = dict(arrowstyle=style, color=accent)
arrow = patches.FancyArrowPatch(
    (1850, 1),
    (1850, 24),
    connectionstyle="arc3,rad=-.15",
    **kw,
)
plt.gca().add_patch(arrow)

plt.text(1890, 1.5, 'less pay', size=14, color=accent)
plt.text(1650, 23, 'more pay', size=14, color=accent)

# Axis labels and title; the x tick marks are hidden entirely.
ax.set_xlabel("Number of respondents", size=18, color=pirate_colors[0])
ax.set_ylabel("Pay Scale", size=18, color=pirate_colors[0])
ax.set_title(
    "--Pay distribution around respondents--",
    size=22,
    color=treasure_colors[0],
    weight='bold',
)
plt.xticks([])
plt.yticks(fontsize=11)
sns.despine(left=True, bottom=True);

## **Q4) Which country has the most data science professionals?**

In [None]:
# Ten most frequent countries of residence (Q3), largest first.
country_counts = data['Q3'].value_counts()
countries_with_most_developers = country_counts.nlargest(10).sort_values(ascending=False)

plt.figure(figsize=(12, 8))
splot = sns.barplot(
    x=countries_with_most_developers.values,
    y=countries_with_most_developers.index,
)

# Title, tick sizes and axis label.
plt.title('Countries with the most professionals', fontsize=18, weight='bold')
plt.yticks(fontsize=14)
plt.xticks(fontsize=14)
plt.xlabel('Count', fontsize=14)
plt.style.use('ggplot')

# Write each bar's count just past its right-hand end.
for bar in splot.patches:
    bar_length = bar.get_width()
    plt.text(
        150 + bar.get_width(),
        bar.get_y() + 0.55 * bar.get_height(),
        int(bar_length),
        ha='center',
        va='center',
    )

# Headline annotation placed inside the plot area.
plt.annotate('India has the most Data Science professionals' , xy=(2000, 4), fontsize=24, color='navy');

## **Q5) Which are the most used programming languages for Data Science?** 

In [None]:
# Pie chart of programming-language usage (all columns whose name contains 'Q7').
#
# Each selected column is treated as one answer option: its most frequent
# value names the slice and its non-null count sizes it.
df_language = data[[col for col in data.columns if 'Q7' in col]]
df_language_all = pd.Series(dtype='int')
for col in df_language.columns:
    answers = df_language[col].value_counts()
    if answers.empty:
        # A column with no answers has no label to take; skip it instead of
        # failing on answers.index[0].
        continue
    df_language_all[answers.index[0]] = df_language[col].count()

# Ascending sort puts the least-used languages first, which are the slices
# that get pulled out of the pie below.
df_language_all = df_language_all.sort_values(ascending=True)
n_slices = len(df_language_all)

plt.figure(figsize=(4,4))
# Offsets for the four smallest slices, padded/truncated to the real number of
# slices so the chart does not break when the number of Q7 columns changes.
expl = ([0.5, 0.3, 0.1, 0.05] + [0.0] * n_slices)[:n_slices]
# Request one colour per slice; the default palette size is smaller than the
# number of languages, which made distinct wedges share a colour.
slice_colors = sns.color_palette('viridis', n_colors=n_slices)
df_language_all.plot(kind='pie', explode=expl, colors=slice_colors, textprops={'fontsize': 4}, autopct='%1.1f%%')
plt.ylabel('')
plt.title('Programming Languages used', fontsize= 8)
plt.show()

## **Q6) Best Visualization libraries used by professionals**

In [None]:
# Usage tally for the 'Q14' columns: for every such column, the most frequent
# value becomes the label and the number of non-null entries becomes the count.
df_lib = data[[name for name in data.columns if 'Q14' in name]]
df_lib_all = pd.Series(dtype='int')
for _, answers in df_lib.items():
    top_answer = answers.value_counts().index[0]
    df_lib_all[top_answer] = answers.count()

In [None]:
from wordcloud import WordCloud
from PIL import Image


# Define a function to plot word cloud
def cloud(wordcloud):
    """Show ``wordcloud`` as an image on a new 18x13 figure with the axes hidden."""
    canvas_size = (18, 13)
    plt.figure(figsize=canvas_size)
    plt.imshow(wordcloud)
    plt.axis("off")



# Word cloud whose word sizes follow the per-library counts in df_lib_all.
cloud_settings = dict(
    width=400,
    height=300,
    random_state=1,
    background_color='Pink',
    mode="RGBA",
    max_words=70,
    collocations=False,
    repeat=True,
)
wordcloud = WordCloud(**cloud_settings).generate_from_frequencies(df_lib_all)

# Draw it.
cloud(wordcloud)

## **Q7) Which gender has more older people working in data science?**

In [None]:
# Restrict the analysis to respondents answering 'Man' or 'Woman' for Q2,
# then count respondents per (age bracket, gender) pair.
gender_df = data[data['Q2'].isin(['Man','Woman'])]
pair_counts = gender_df.groupby(['Q1', 'Q2']).size().reset_index()
table = pair_counts.pivot(columns='Q2', index='Q1', values=0)
plt.style.use('ggplot')

# One stacked horizontal bar per age bracket, one segment per gender.
ax = table.plot(stacked=True, kind='barh', figsize=(12, 10), alpha=0.7)

index_list = table.index.values
total = table.values.sum()

# Label every segment with its share of all counted respondents, centred
# horizontally within the segment. Shares below 0.1% are left unlabelled.
for row_pos, age_group in enumerate(table.index):
    left_edge = 0
    for gender in table.columns:
        count = table.loc[(age_group)][gender]
        ratio = count / total
        right_edge = count + left_edge
        left_edge += count
        if ratio >= 0.001:
            plt.text(
                x=right_edge - count / 2,
                y=row_pos,
                s='%.1f' % (ratio * 100) + '%',
                va='center',
                ha='center',
                size=10,
            )

# Legend outside the axes, axis labels, tick sizes and title.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', prop={'size': 14})
plt.xlabel('Count', fontsize=16)
plt.ylabel('Age', fontsize=16)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.title('Distribution of Age and Gender', fontsize=15, weight='bold')


## **Q8) Which country, the USA or India, has the more highly qualified data professionals?**

In [None]:
# Education level (Q4) of respondents from India and the United States (Q3),
# drawn as one stacked horizontal bar per country.
us_in = data[data['Q3'].isin(['India','United States of America'])]
table = us_in.groupby(['Q4', 'Q3']).size().reset_index().pivot(columns='Q4', index='Q3', values=0)
plt.style.use('ggplot')

ax = table.plot(stacked=True, kind='barh', figsize=(16, 12), alpha=0.7)

# Not used by this chart; kept so these names still describe the current
# table in case other cells read them.
index_list = table.index.values
total = table.values.sum()

# Decorate the plot once. These calls previously sat inside a nested loop over
# every table cell, which re-applied identical settings on each iteration.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', prop={'size': 26})
plt.xlabel('Count', fontsize=40)
# The bars are indexed by country (Q3), so the y axis is labelled accordingly.
plt.ylabel('Country', fontsize=40)
plt.xticks(fontsize=26)
plt.yticks(fontsize=26)
plt.title('Educational qualifications of Indian and American data Profs', fontsize=40, weight='bold')
        