In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import geopandas as gpd

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Changing Column Names

The original column names are not informative. The question each column answers is given in the first row of the dataset, so we can use that row to make the column names more understandable and easier to use.

### Changing column names that are part of a question

Many multiple-choice questions are split into parts, one per choice. If a column name contains the string "Part", that column is one choice of a question. For such columns, I split the corresponding question text using "-" as the separator. The third string in the resulting list holds the choice that corresponds to that column, so that string is taken, leading and trailing spaces are removed, and it is added to the new_columns list for further changes.

- For example, the question is of the form: "What programming languages do you use on a regular basis? - (Select all that apply) - Python"  
Splitting it on "-" gives " Python" as the third piece. The unwanted whitespace is stripped using lstrip() and rstrip().

In [None]:
# Raise the column display limit first so previews of wide frames are not elided.
pd.set_option("display.max_columns", 400)

# Load the survey responses and show their dimensions and first rows.
data = pd.read_csv("/kaggle/input/kaggle-survey-2021/kaggle_survey_2021_responses.csv")
print(data.shape)
data.head()

In [None]:
# First row of the frame as a Series indexed by the original column labels.
questions = data.iloc[0]

In [None]:
# Original column labels; the next cell checks each one for the substring "Part".
columns = data.columns

In [None]:
# Build a readable label for every column from the question text in `questions`.
#  - Columns whose original label contains "Part" are one choice of a
#    multiple-choice question; their label becomes the choice text, i.e.
#    everything after the second "-" in the question text.
#  - All other columns keep the full question text.
new_columns = []
for column in columns:
    question_text = questions[column]
    if "Part" in column:
        # maxsplit=2 stops after the second "-", so a choice that itself
        # contains a hyphen is kept whole instead of being cut at its first "-".
        choice = question_text.split("-", 2)[2]
        new_columns.append(choice.strip())
    else:
        new_columns.append(question_text)

print(new_columns)

***Changing columns whose choice is "Other" to the format QuestionKeyword_Other.***  
***That is, if the question is "What programming languages do you use on a regular basis? - (Select all that apply) - Other", I change it to Programming_Language_Other.***

In [None]:
# Show every column label that mentions "Other", then report how many there are.
other_labels = [label for label in new_columns if "Other" in label]
for label in other_labels:
    print(label)

cnt = len(other_labels)
print(cnt)

In [None]:
# Prefixes used to rename the columns whose label contains "Other".
# They are consumed by position: the n-th such column receives the n-th prefix.
other_sub = [
    "Programming_Language",
    "IDE",
    "Notebook",
    "Hardware",
    "Vis_Lib",
    "ML_Framework",
    "ML_Alg",
    "Computer_Vision",
    "NLP",
    "Activity",
    "CC_Platform",
    "CC_Product",
    "DS_Product",
    "ML_Product",
    "BigData_Products",
    "BI_Products",
    "AutoML_Tools_1",
    "AutoML_Tools_2",
    "ML_Manage",
    "Share",
    "Course_Platform",
    "Media_Src",
    "CC_Platform_Familiar",
    "CC_Prod_Familiar",
    "DS_Prod_Familiar",
    "ML_Manage_Familiar",
    "BigData_Familiar",
    "BI_Tools_Familiar",
    "AutoML_Category_Familiar",
    "AutoML_Tool_Familiar",
    "Specific_Tool_Familiar",
]

In [None]:
# Number of prefixes available; the renaming cell below indexes other_sub once per
# "Other" column, so this must be at least the count printed by the earlier cell.
print(len(other_sub))

In [None]:
# Replace each label containing "Other" with "<prefix>_Other", taking the prefixes
# from other_sub in order; every other label is carried over unchanged.
change_column = []
i = 0  # position of the next unused prefix in other_sub
for original_label in new_columns:
    renamed = original_label
    if "Other" in original_label:
        renamed = other_sub[i] + "_Other"
        i += 1
    change_column.append(renamed)

print(change_column)

In [None]:
# Work on a copy so the raw frame `data` keeps its original labels,
# then install the cleaned-up labels on the copy.
new_data = data.copy()
new_data.columns = pd.Index(change_column)

In [None]:
# Preview the first rows under the new column labels.
new_data.head()

In [None]:
# "None" is offered as a choice by several questions, so more than one column
# now carries the label "None". These columns get question-specific names in the
# cells below, following the same approach used for "Other".
new_data.loc[:, "None"]

In [None]:
# Print the question text (row 0) behind each column labelled "No / None".
first_row = new_data["No / None"].iloc[0]
for question_text in first_row:
    print(question_text)

In [None]:
# Prefixes for the columns labelled exactly "None", consumed in column order.
none_sub = [
    "Programming_Language",
    "IDE",
    "Notebook",
    "Hardware",
    "Vis_Lib",
    "ML_Framework",
    "ML_Alg",
    "Computer_Vision",
    "NLP",
    "CC_Platform",
    "BigData_Products",
    "BI_Products",
    "Course_Platform",
    "Media_Src",
    "CC_Platform_Familiar",
    "CC_Prod_Familiar",
    "ML_Manage_Familiar",
    "BigData_Familiar",
    "BI_Tools_Familiar",
    "AutoML_Category_Familiar",
    "AutoML_Tool_Familiar",
    "Specific_Tool_Familiar",
]

# Prefixes for the columns labelled exactly "No / None", consumed in column order.
# NOTE(review): "ML_Manage_Familiar" and "Specific_Tool_Familiar" appear in both
# lists; if both occurrences are consumed, two columns end up with the same
# "<prefix>_None" label - confirm against the counters printed by the renaming cell.
no_none_sub = [
    "CC_Product",
    "DS_Product",
    "ML_Manage",
    "AutoML_Tools_1",
    "AutoML_Tools_2",
    "ML_Manage_Familiar",
    "Specific_Tool_Familiar",
]

In [None]:
# Current labels of new_data (after the "Other" renaming); iterated by the next cell.
new_data_columns = new_data.columns

In [None]:
# Give every "None" / "No / None" column a question-specific label.
# i and j track how many prefixes of none_sub and no_none_sub have been used.
i = j = 0
new_data_column_change = []
for label in new_data_columns:
    if label == "None":
        label = none_sub[i] + "_None"
        i += 1
    elif label == "No / None":
        label = no_none_sub[j] + "_None"
        j += 1
    new_data_column_change.append(label)

# Report how many columns of each kind were renamed.
print(i)
print(j)

In [None]:
# Install the labels built in the previous cell on new_data.
new_data.columns = new_data_column_change

In [None]:
# Number of labels produced by the renaming loop (one per column of new_data).
len(new_data_column_change)

In [None]:
# Preview the frame after the "None" columns were renamed.
new_data.head()

In [None]:
# Row 0 holds the question text rather than a survey response, so remove it.
# The result is bound back to new_data instead of to `questions`:
# drop(..., inplace=True) returns None, so the previous assignment replaced the
# `questions` Series with None. errors="ignore" keeps the cell safe to re-run
# once the row is already gone.
new_data = new_data.drop(index = 0, errors = "ignore")

In [None]:
# Check the first two rows after dropping row 0.
new_data.head(2)

In [None]:
# Short, analysis-friendly names for the single-answer question columns.
column_renames = {
    "Duration (in seconds)": "Duration",
    "What is your age (# years)?": "Age",
    "What is your gender? - Selected Choice": "Gender",
    "In which country do you currently reside?": "Country",
    "What is the highest level of formal education that you have attained or plan to attain within the next 2 years?": "Education_Level",
    "Select the title most similar to your current role (or most recent title if retired): - Selected Choice": "Role",
    "For how many years have you been writing code and/or programming?": "Programming_Experience",
    "What programming language would you recommend an aspiring data scientist to learn first? - Selected Choice": "Language_Recommended",
    "What type of computing platform do you use most often for your data science projects? - Selected Choice": "Computing_Platform",
    "Approximately how many times have you used a TPU (tensor processing unit)?": "TPU_Usage",
    "For how many years have you used machine learning methods?": "ML_Experience",
    "General purpose image/video tools (PIL, cv2, skimage, etc)": "General_Img_Tools",
    "Image segmentation methods(U": "Image_Segmentation",
    "Object detection methods (YOLOv3, RetinaNet, etc)": "Object_Detection",
    "Image classification and other general purpose networks (VGG, Inception, ResNet, ResNeXt, NASNet, EfficientNet, etc)": "Img_Classification",
}

# The image-segmentation key above is a fragment of the choice text, and rename()
# silently ignores keys that match no column. Matching on the stable prefix makes
# the rename work regardless of where (or whether) the label was cut or how it
# is spaced.
column_renames.update(
    {
        label: "Image_Segmentation"
        for label in new_data.columns
        if isinstance(label, str) and label.startswith("Image segmentation methods")
    }
)

new_data = new_data.rename(columns = column_renames)

In [None]:
# Check the first two rows under the shortened column names.
new_data.head(2)

### **Visualizing**

In [None]:
# Take both the slice sizes and the slice labels from the same value_counts()
# result. value_counts() orders categories by frequency while unique() returns
# them in order of first appearance, so pairing the two could attach a count to
# the wrong category.
gender_counts = new_data["Gender"].value_counts()
fig = px.pie(values = gender_counts.to_numpy(),names = gender_counts.index,title = "Gender Distribution")
fig.show()

In [None]:
# Take both the slice sizes and the slice labels from the same value_counts()
# result. value_counts() orders categories by frequency while unique() returns
# them in order of first appearance, so pairing the two could attach a count to
# the wrong age bracket.
age_counts = new_data["Age"].value_counts()
fig = px.pie(values = age_counts.to_numpy(),names = age_counts.index,title = "Age Distribution")
fig.show()

***About 80 percent of the respondents are Male ML & DS Enthusiasts***.

In [None]:
# Split the frame by column position: the first seven columns go to
# personal_info, the next thirteen (positions 7 through 19) to an independent
# copy that the following cells modify.
personal_info = new_data.iloc[:, 0:7]
programming_languages = new_data.iloc[:, 7:20].copy()
programming_languages.head(2)

In [None]:
# Recode the two special columns to "Yes"/"No": the listed answer text becomes
# "Yes" and a missing value becomes "No".
for flag_column, answer_text in (
    ("Programming_Language_None", "None"),
    ("Programming_Language_Other", "Other"),
):
    programming_languages[flag_column] = programming_languages[flag_column].map(
        {answer_text: "Yes", np.nan: "No"}
    )

In [None]:
# Every column except the last two (already recoded in the previous cell) is
# named after the answer it stores: that answer becomes "Yes", a missing value "No".
languages = list(programming_languages.columns[:-2])
for language in languages:
    yes_no = {language: "Yes", np.nan: "No"}
    programming_languages[language] = programming_languages[language].map(yes_no)
programming_languages.head(2)

In [None]:
responses = new_data.shape[0]  # total number of rows in new_data
languages = list(programming_languages.columns)
# Number of "Yes" answers in each column, in column order.
languages_count = [
    (programming_languages[language] == "Yes").sum() for language in languages
]
languages_count

In [None]:
# One slice per column of programming_languages, sized by its "Yes" count.
fig = px.pie(
    names = languages,
    values = languages_count,
    title = "Programming Languages Used",
)
fig.show()

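***Python accounts for more than 33% of all language selections (respondents could select several languages, so this is a share of selections rather than of respondents).***   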

*Now let's see which language is most popular in each age group.*

In [None]:
# Draw one pie chart per distinct Age value, showing the "Yes" counts of every
# language column among the rows that belong to that age category.
age_values = personal_info["Age"].unique()
for a_age in age_values:
    in_age_group = personal_info["Age"] == a_age
    temp = programming_languages[in_age_group]
    languages_count = [(temp[language] == "Yes").sum() for language in languages]
    fig = px.pie(
        names = languages,
        values = languages_count,
        title = "Programming Languages Used by people belonging to age category " + a_age,
    )
    fig.show()

Regardless of the age group, Python is the most popular language.  
In the 18-21 and 22-24 age groups, R is not as widely used as Python.

***Further steps:***
* More EDA to be done
* Draw insights from the visualisations and the data

### If you have liked the work done so far, please upvote to show your appreciation
### Feel free to leave comments on any improvements to be made or on anything I have done wrong.