In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


# **Set Display Options**
To view complete text of Questions asked and Analysis

In [None]:
# Allow up to 250 characters per cell so long question/answer strings are not cut off at the default width.
pd.set_option('display.max_colwidth', 250)
# Allow up to 500 columns to be rendered before pandas starts eliding the middle of wide frames.
pd.set_option('display.max_columns', 500)

# **Define Functions**
This whole notebook depends on the three functions defined below
1. rinse(df): returns most frequent answers to each question in dataframe
2. group_squeeze(df): returns most frequent answers per most frequent demographics in dataframe
3. joint_squeeze(df): returns most frequent answer pairs in dataframe

In [None]:
def rinse(df):
    """Return the most frequent answer to every question (column) of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column names are used as the question labels.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that contains at least one non-null
        value, with the columns:

        * ``Question``      - the column name
        * ``Answer``        - the column's mode (the first value returned by
          ``Series.mode`` when several values tie)
        * ``Population``    - number of rows holding the most frequent value
        * ``Percentage(%)`` - ``Population`` as a share of ``len(df)``,
          rounded to two decimals

        Rows are sorted by ``Percentage(%)`` in descending order and the index
        is renumbered from 0. A frame without columns yields an empty result.
    """
    # Collect one plain record per column and build the frame once.
    # DataFrame.append was removed in pandas 2.0 and copied the whole
    # accumulated frame on every iteration.
    records = []
    for col in df.columns:
        modes = df[col].mode(dropna=True)
        records.append({
            'Question': col,
            # An all-null column has no mode; mark it missing so that the
            # dropna() below removes the row.
            'Answer': modes.iloc[0] if len(modes) else None,
            # max() of an empty value_counts() is NaN, which is dropped too.
            'Population': df[col].value_counts().max(),
        })

    unique_df = pd.DataFrame(records, columns=['Question', 'Answer', 'Population'])
    # Keep the count column numeric even when there are no records at all
    # (an empty column would otherwise be object-typed).
    unique_df['Population'] = pd.to_numeric(unique_df['Population'])
    unique_df['Percentage(%)'] = ((unique_df['Population'] / len(df)) * 100).round(2)
    unique_df = unique_df.sort_values('Percentage(%)', ascending=False)
    return unique_df.dropna().reset_index(drop=True)

In [None]:
def group_squeeze(df):
    """Return the most frequent answers within each question's top-answer group.

    For every question reported by ``rinse(df)``, the rows of ``df`` that gave
    that question's most frequent answer are selected, the question's own
    column is dropped, and ``rinse`` is run again on that subset.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column names are used as the question labels.

    Returns
    -------
    pandas.DataFrame
        Columns ``Question_Group`` and ``Answered_Group`` (the question and
        answer that define the subset) followed by ``Question``, ``Answer``,
        ``Population`` and ``Percentage(%)`` as produced by ``rinse`` on that
        subset, so the percentage is relative to the subset size. Rows are
        sorted by ``Population`` in descending order and the index is
        renumbered from 0. An empty frame with these columns is returned when
        ``rinse(df)`` reports no questions.
    """
    result_columns = ['Question_Group', 'Answered_Group',
                      'Question', 'Answer', 'Population', 'Percentage(%)']

    # One (question, top answer) pair per question; drop_duplicates keeps the
    # first row for a question, as the original first-match lookup did.
    rinse_df = rinse(df).drop_duplicates('Question')

    # Collect the per-group summaries and concatenate once.
    # DataFrame.append was removed in pandas 2.0.
    frames = []
    for feat, ans in zip(rinse_df['Question'], rinse_df['Answer']):
        ans_df = df[df[feat] == ans].drop(feat, axis=1)
        unique_dfcol = rinse(ans_df)
        unique_dfcol['Question_Group'] = feat
        unique_dfcol['Answered_Group'] = ans
        frames.append(unique_dfcol)

    if not frames:
        # pd.concat refuses an empty list; return the documented shape instead.
        return pd.DataFrame(columns=result_columns)

    unique_df = pd.concat(frames)
    unique_df = unique_df.sort_values('Population', ascending=False)
    unique_df = unique_df.dropna().reset_index(drop=True)
    # Put the two group-defining columns in front of the rinse columns.
    return unique_df[result_columns]

In [None]:
def joint_squeeze(df):
    """Return the most frequent answer pairs in ``df``.

    For every question reported by ``rinse(df)``, the rows of ``df`` that gave
    that question's most frequent answer are selected, the question's own
    column is dropped, and ``rinse`` is run again on that subset. Each
    resulting row therefore describes two answers that occur together.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column names are used as the question labels.

    Returns
    -------
    pandas.DataFrame
        Columns ``Other_Question`` and ``Other_Answer`` (the pair member that
        defined the subset) followed by ``Question``, ``Answer``,
        ``Population`` and ``Percentage(%)``. Here ``Percentage(%)`` is
        recomputed against ``len(df)``, i.e. the share of all rows that gave
        both answers. Rows are sorted by ``Percentage(%)`` in descending order,
        each unordered pair appears once, and the index is renumbered from 0.
        An empty frame with these columns is returned when ``rinse(df)``
        reports no questions.
    """
    result_columns = ['Other_Question', 'Other_Answer',
                      'Question', 'Answer', 'Population', 'Percentage(%)']

    # One (question, top answer) pair per question; drop_duplicates keeps the
    # first row for a question, as the original first-match lookup did.
    rinse_df = rinse(df).drop_duplicates('Question')

    # Collect the per-group summaries and concatenate once.
    # DataFrame.append was removed in pandas 2.0.
    frames = []
    for feat, ans in zip(rinse_df['Question'], rinse_df['Answer']):
        ans_df = df[df[feat] == ans].drop(feat, axis=1)
        unique_dfcol = rinse(ans_df)
        unique_dfcol['Other_Question'] = feat
        unique_dfcol['Other_Answer'] = ans
        frames.append(unique_dfcol)

    if not frames:
        # pd.concat refuses an empty list; return the documented shape instead.
        return pd.DataFrame(columns=result_columns)

    unique_df = pd.concat(frames)
    # Express the joint count as a share of the whole frame, not of the subset.
    unique_df['Percentage(%)'] = ((unique_df['Population'] / len(df)) * 100).round(2)
    unique_df = unique_df.sort_values('Percentage(%)', ascending=False)

    # The same pair can be reached from either side (A given B, B given A).
    # Keep the first occurrence of each unordered pair. Taking every second
    # row (iloc[::2]) is only correct when every row has its mirror directly
    # next to it after sorting; otherwise it discards unrelated rows.
    seen_pairs = set()
    keep = []
    for question, answer, other_question, other_answer in zip(
            unique_df['Question'], unique_df['Answer'],
            unique_df['Other_Question'], unique_df['Other_Answer']):
        pair = frozenset(((question, answer), (other_question, other_answer)))
        keep.append(pair not in seen_pairs)
        seen_pairs.add(pair)
    # Positional boolean mask: the concatenated index contains repeated labels.
    unique_df = unique_df[pd.Series(keep, dtype=bool).to_numpy()]

    unique_df = unique_df.dropna().reset_index(drop=True)
    # Put the two subset-defining columns in front of the rinse columns.
    return unique_df[result_columns]

# **Load Dataset**

In [None]:
# header=1 takes the column names from the second line of the CSV and discards the line above it.
# NOTE(review): presumably that second line holds the full question text - confirm against the file.
df = pd.read_csv('/kaggle/input/kaggle-survey-2021/kaggle_survey_2021_responses.csv', header=1)
# Preview the first two rows.
df.head(2)

In [None]:
df.info()

In [None]:
len(df)

# **General Stats**

In [None]:
# Most frequent answer to every question across the whole dataset.
General_Stats = rinse(df)
# Displayed transposed, so each question becomes one column of the rendered table.
General_Stats.T

**General Stats/Introduction**
* 84.16% of Data Scientists use Python on a regular basis
* 79.31% of Data Scientists are Men
* 77.82% of Data Scientists recommend that aspiring data scientists learn Python
* 67.74% of Data Scientists use Matplotlib for data visualization on a regular basis
* Amazingly, 63.36% of Data Scientists have never used a TPU (Tensor Processing Unit)
* 62.5% of Data Scientists use Jupyter Notebook (IDE) on a regular basis
* 62.49% of Data Scientists make use of Laptops
* 53.85% of Data Scientists use the Scikit-learn framework on a regular basis
* 53.33% of Data Scientists use Linear or Logistic Regression algorithms on a regular basis
* 50.95% of Data Scientists use no specialized hardware


# **Compared Stats**

In [None]:
# Build the subset of df used for the bar charts: only the questions whose
# first 45 characters of text are not shared with any other column (the
# original flags shared prefixes as "Multiple").
question_summary = rinse(df)
question_summary['Q'] = question_summary['Question'].str[:45]
prefix_is_shared = question_summary['Q'].map(question_summary['Q'].value_counts()) > 1
single_part = question_summary[~prefix_is_shared]
single_part = single_part.sort_values(['Q', 'Population'], ascending=False)
# Keep the raw answer columns of those questions, in the sorted order.
Compare_Stat = df[single_part['Question'].tolist()]


In [None]:
# One bar chart per selected question, showing its ten most common answers.
for col in Compare_Stat.columns:
    top_answers = Compare_Stat[col].value_counts().head(10)
    ax = top_answers.plot.bar(color=list('rgbkymc'))
    ax.set_title(col)
    ax.set_xlabel('answers')
    ax.set_ylabel('population')
    plt.show()

* Most data scientists use laptops, followed by personal computers
* Majority of the data scientists are students and are aged between 25-29 years
* Majority of the data scientists have a Master's degree or will attain it in the next 2 years
* Most data scientists do not spend money on cloud computing services
* More employers are exploring ML methods which may someday be put into production
* A larger section of employers are in the computer/tech industry
* Most of the data scientists' companies have fewer than 50 employees
* A higher portion of the data scientists use statistical software for analysis at work
* Most of the data scientists earn less than $1000 annually
* A larger section of the data scientists reside in India
* A larger section of the data scientists have between 1-3 years experience
* 1-2 people are mostly responsible for data science workloads at places where data scientists work

In [None]:
# Summary rows for the questions whose first 45 characters of text are shared
# with at least one other column (the original flags these as "Multiple"),
# indexed by that shared prefix.
multi_summary = rinse(df)
question_prefix = multi_summary['Question'].str[:45]
shares_prefix = question_prefix.map(question_prefix.value_counts()) > 1
Compare_Stats = (
    multi_summary.assign(Q=question_prefix)[shares_prefix]
    .sort_values(['Q', 'Population'], ascending=False)
    .set_index('Q')
)
Compare_Stats.T

**Media**
* Kaggle (notebooks, forums, etc.) - 43.79% is the most common favorite media source for data scientists, followed by YouTube (Kaggle YouTube, Cloud AI Adventures, etc.) - 40%, then Blogs (Towards Data Science, Analytics Vidhya, etc.) - 30.71%

**Cloud Platforms**
* 14.33% of data scientists use the Amazon Web Services (AWS) cloud platform on a regular basis, as against 12.1% who use Google Cloud Platform (GCP)
* 28.81% of Data Scientists hope to become more familiar with Google Cloud Platform(GCP) which is almost as many as 28.85% of data scientists who hope to become more familiar with Amazon Web Services(AWS) 


**Specialized Hardware**
* The most frequently used specialized hardware amongst data scientists is NVIDIA GPUs (30.94%), followed by Google Cloud TPUs (13.29%), then AWS Inferentia Chips (1.6%)

**Automated Machine Learning tools**
* Google Cloud AutoML(2.89%) is the most frequently used automated ML tool, followed by Azure Automated Machine Learning(1.73%), Amazon Sagemaker Autopilot(1.45%), Databricks AutoML(1.24%), H2O Driverless AI(1.22%), and DataRobot AutoML(1.04%)
* More data scientists hope to become more familiar with Google Cloud AutoML (18.55%), which is roughly 1.5 times the number of data scientists that hope to become more familiar with Azure Automated Machine Learning (12.14%)

**IDE & Frameworks**
* 83.63%(62.5% + 21.13%) of data scientists use Jupyter Notebook and other jupyter IDE
* 38.66% of data scientists use Visual Studio Code(VSCode) IDE, 28.75% use Pycharm while 18.37% use RStudio
* The most frequently used machine learning frameworks amongst data scientists include:
1. Scikit-learn - 53.85%
2. TensorFlow - 36%
3. Keras - 30.77%
4. PyTorch - 23.44%
5. Xgboost-23%, LightGBM-10.15%, CatBoost-5.82%, Huggingface-4.32%

**Hosted notebook products**
* Colab Notebooks(37.7%) is the most regularly used hosted notebook product, followed closely by Kaggle Notebooks(36.6%)

**NLP (Natural Language Processing)**
* Commonly used NLP methods include - Word embeddings/vectors(10.18%), Transformer language models(9%), Encoder-decoder model(7.79%)

# **Grouped Stats**

In [None]:
# Per-group top answers, transposed so each result row is rendered as a column.
Grouped_Stats = group_squeeze(df).T
Grouped_Stats

**Computing Devices**
* 65.43% of Matplotlib users mostly make use of Laptops for computing
* 70.17% of Laptop users never made use of tensor processing units(TPU)
* 69.21% of data scientists that have never used TPU, make use of Laptops for computing
* 84.1% of data scientists that don't use specialized hardwares, make use of Python programming language
* Meanwhile, 50.91% of Python users do not make use of specialized hardware 

**Programming Language & Libraries**
* 84.36% of data scientists who use Python on a regular basis, also recommend it to aspiring data scientists. 91.24% of data scientists who recommend python programming language to aspiring data scientist, also use it on a regular basis.
* 80.43% of Python users are Men. 85.36% of male data scientists use python on a regular basis
* 77.15% of Python users make use of Matplotlib, 55.99% of Python users make use of Seaborn
* 67.25% of Matplotlib users use Seaborn. 94.02% of Seaborn users make use of Matplotlib
* 67.2% of Matplotlib users make use of Linear or Logistic Regression algorithms
* 50.78% of Python users make use of Decision Trees or Random Forests on a regular basis
* 79% of Scikit-learn users make use of Linear or Logistic Regression algorithms


# **Joint Stats**

In [None]:
# Most frequent answer pairs, transposed so each pair is rendered as a column.
Joint_Stats = joint_squeeze(df).T
Joint_Stats

* 71% of data scientists use python and recommend it to aspiring data scientists
* 36.99% of data scientists use both python and SQL programming languages on a regular basis
* 32.28% of data scientists use VSCode IDE and recommend aspiring data scientists to learn python programming language first