In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import seaborn as sns
import wordcloud

# from mpl_toolkits.basemap import Basemap
from os import path
from PIL import Image
import missingno as msno
%matplotlib inline

# Data Check

In [None]:
# Load the three survey files. The multiple-choice file is read as ISO-8859-1;
# the other two are read with the pandas default encoding.
df_multipleChoice = pd.read_csv("../input/multipleChoiceResponses.csv",  encoding="ISO-8859-1", low_memory=False)
df_freeform = pd.read_csv("../input/freeformResponses.csv", low_memory=False)
# The schema is indexed by its "Column" field so rows can be looked up by feature name.
df_schema = pd.read_csv("../input/schema.csv", index_col="Column")

# Column labels of each response file, reused by the NaN report and the meta table.
multiple_choice_columns = df_multipleChoice.columns
freeform_columns = df_freeform.columns

 ## Data Check - (1) multipleChoiceResponses.csv

In [None]:
def check_NaN_percentage(df, df_columns):
    """Print, for each listed column, its NaN percentage and its non-NaN count.

    Args:
        df: DataFrame to inspect.
        df_columns: iterable of column labels of ``df`` to report on.

    Returns:
        None. The report is written to stdout, one line per column after a header line.
    """
    row_template = "column: {:>20}\t Percent of NaN value: {:.2f}% (Total not NaN response: {})"
    print("--------------------NaN value percentage--------------------")
    for col in df_columns:
        series = df[col]
        n_rows = len(series)
        n_missing = series.isnull().sum()
        print(row_template.format(col, 100 * n_missing / n_rows, n_rows - n_missing))

In [None]:
# Report the NaN share of every column of the multiple-choice responses.
check_NaN_percentage(df_multipleChoice, multiple_choice_columns)

In [None]:
# Nullity matrix of the multiple-choice responses (all rows, all columns).
msno.matrix(
    df=df_multipleChoice,
    figsize=(20, 14),
    color=(0.8, 0.5, 0.2),
)

There are many NaN values in every feature of the multiple-choice CSV file.
The white space in the missingno plot represents NaN values, so we should keep the percentage of NaN values in mind. I'll show the response rate (100% minus the NaN percentage) in each plot title! I used the missingno package, which is very useful for visualizing NaN values. More detail is [here.](https://github.com/ResidentMario/missingno) :) Thanks to Aleksey Bilogur.

 ## Data Check - (2) freeformResponses.csv

In [None]:
# Report the NaN share of every column of the freeform responses.
check_NaN_percentage(df_freeform, freeform_columns)

In [None]:
# Nullity matrix of the freeform responses (all rows, all columns).
msno.matrix(
    df=df_freeform,
    figsize=(20, 14),
    color=(0.8, 0.5, 0.2),
)

There are many NaN values in every feature of the freeform CSV file as well.

 ## Data Check - (3) schema.csv

In [None]:
# Check that the column counts of the two response files add up to the number of schema rows.
len(multiple_choice_columns) + len(freeform_columns) == df_schema.shape[0]

In [None]:
# Preview the first rows of the schema table.
df_schema.head()

In [None]:
# Sum of the per-value counts of the 'Asked' column, i.e. how many schema rows have an 'Asked' value.
df_schema['Asked'].value_counts().sum()

The schema.csv file lists every feature together with the question that was asked for it.

 # Data analysis

 ## Data analysis - Preparation

In [None]:
# Every feature name listed in the schema (the schema index).
all_features = df_schema.index

In [None]:
def make_meta(all_features):
    """Build one metadata record per survey feature.

    Reads the notebook-level frames ``df_multipleChoice``, ``df_freeform`` and
    ``df_schema`` plus ``multiple_choice_columns``.

    Args:
        all_features: iterable of feature names; each must be a row label of
            ``df_schema`` and a column of one of the two response frames.

    Returns:
        list of dicts with the keys 'feature', 'WhichForm', 'target', 'Question',
        'Response_rate' (percentage of non-NaN answers) and 'dtype'.
    """
    def describe(feature):
        # A feature that is not a multiple-choice column is looked up in the freeform frame.
        if feature in multiple_choice_columns:
            form_name, responses = "Multiple_choice", df_multipleChoice[feature]
        else:
            form_name, responses = "FreeForm", df_freeform[feature]
        nan_percent = 100 * responses.isnull().sum() / len(responses)
        return {
            "feature": feature,
            "WhichForm": form_name,
            "target": df_schema.loc[feature, 'Asked'],
            "Question": df_schema.loc[feature, 'Question'],
            # Response rate is the complement of the rounded NaN percentage.
            "Response_rate": 100 - np.round(nan_percent, 1),
            "dtype": str(responses.dtype),
        }

    return [describe(feature) for feature in all_features]
# Collect the per-feature records and index the resulting table by feature name.
data = make_meta(all_features)
meta = pd.DataFrame(
    data,
    columns=['feature', 'WhichForm', 'target', 'Question', 'Response_rate', 'dtype'],
).set_index('feature')

In [None]:
# Display the per-feature metadata table.
meta

## Data analysis - (1) for 'all asked' features & Continuous

In [None]:
# Multiple-choice features with 'All' as their schema 'Asked' value and a float64 column.
asked_of_all = meta['target'].eq('All')
in_multiple_choice = meta['WhichForm'].eq('Multiple_choice')
is_float = meta['dtype'].eq('float64')
feature_all_float = meta.loc[asked_of_all & in_multiple_choice & is_float]

In [None]:
# Display the selected float features.
feature_all_float

In [None]:
# Distribution of the reported ages; missing values are dropped before plotting.
fig = plt.figure(figsize=(10, 10))
sns.set(font_scale=2)
age_values = df_multipleChoice['Age'].dropna()
sns.distplot(age_values)
age_question = meta.loc['Age', 'Question']
age_response_rate = meta.loc['Age', 'Response_rate']
plt.title("{}\nResponse Rate: {}%".format(age_question, age_response_rate))

In [None]:
# One distribution plot per remaining float feature. The question text is cut at
# its midpoint so the title spans two lines.
for feature in feature_all_float.index[1:]:
    fig = plt.figure(figsize=(10, 10))
    sns.set(font_scale=2)
    sns.distplot(df_multipleChoice[feature].dropna())
    question = meta.loc[feature, 'Question']
    midpoint = int(len(question) / 2)
    plt.title("{}\n{}\nResponse rate: {}%".format(question[:midpoint],
                                                  question[midpoint:],
                                                  meta.loc[feature, 'Response_rate']))

As you can see, respondents tend to learn data analysis through 'SelfTaught' and 'OnlineCourse'.

### Deeper analysis by Age

To see how the learning trends depend on age, I categorize Age into 5 groups.

In [None]:
def CategorizeAge(df):
    """Return a copy of ``df`` with a 'CategorizedAge' label column derived from 'Age'.

    The bins are contiguous and half-open, ``lower <= Age < upper``:
    '0~18' is [0, 19), '19~25' is [19, 26), '26~40' is [26, 41),
    '41~60' is [41, 61) and '61~' is 61 and above.
    The first bin previously stopped at 18 while the second started at 19, so
    ages from 18 up to (but excluding) 19 received no label; they now fall in '0~18'.

    Args:
        df: DataFrame with a numeric 'Age' column. It is not modified.

    Returns:
        A new DataFrame with the same rows plus 'CategorizedAge' (object dtype).
        Rows whose 'Age' is NaN or negative keep NaN in that column.
    """
    age_bins = [
        (0, 19, '0~18'),     # before university
        (19, 26, '19~25'),   # during university
        (26, 41, '26~40'),   # hard worker
        (41, 61, '41~60'),   # more experienced
        (61, None, '61~'),   # master :) -- no upper bound
    ]
    # Work on a copy so the caller's frame (often a filtered slice) is never written to.
    df = df.copy()
    age = df['Age']
    labels = pd.Series(np.nan, index=df.index, dtype=object)
    for lower, upper, label in age_bins:
        in_bin = age >= lower
        if upper is not None:
            in_bin = in_bin & (age < upper)
        labels.loc[in_bin] = label
    df['CategorizedAge'] = labels
    return df

In [None]:
# Keep only respondents who reported an age, then attach the age-group labels.
# .copy() makes the filtered frame independent, so the new column is not written
# into a slice of the original frame (avoids SettingWithCopyWarning).
# NOTE(review): this rebinds df_multipleChoice, so every later cell only sees
# respondents with a non-null Age -- confirm that this is intended.
df_multipleChoice = CategorizeAge(df_multipleChoice.loc[df_multipleChoice['Age'].notnull()].copy())

In [None]:
# Respondent count per age group and each group's share of the total, in percent.
Categorized_Age = df_multipleChoice.groupby('CategorizedAge')['Age'].count().to_frame()
group_share = Categorized_Age['Age'] / Categorized_Age['Age'].sum()
Categorized_Age['Percent'] = 100 * np.round(group_share, 3)

In [None]:
# Display the per-age-group counts and percentages.
Categorized_Age

# My suggestion to Kaggle
The share of younger data analysts is about 30%. If possible, how about holding a competition only for younger Kagglers? The purpose of this suggestion is to encourage younger Kagglers :)! Also, how about holding separate competitions for each age group?

In [None]:
# For each float feature at positions 1..5 of feature_all_float, overlay one
# density curve per age group on a shared axis.
# The group labels must match the ones written by CategorizeAge; the oldest group
# is '61~' (filtering on '60~' matched no rows, so that curve was never drawn).
age_groups = ['0~18', '19~25', '26~40', '41~60', '61~']
for count in range(1, 6):  # position 0 is skipped, as in the original loop
    fig, ax = plt.subplots(figsize=(5, 5))

    target_feature = feature_all_float.index[count]
    answered = df_multipleChoice[target_feature].notnull()
    for age_group in age_groups:
        values = df_multipleChoice.loc[(df_multipleChoice['CategorizedAge'] == age_group) & answered, target_feature]
        if values.empty:
            continue  # nothing to draw for this age group
        sns.distplot(values, hist=False, label=age_group, ax=ax)
    # Title and legend so each figure can be read on its own.
    ax.set_title(target_feature)
    ax.legend()

As you can see, the learning style is similar across the age groups in all graphs. There is a bug in the 4th plot, but on my computer its trend is similar to the others.

## Data analysis - (2) for 'all asked' features & Object (multiple choice question)

In [None]:
# Multiple-choice features with 'All' as their schema 'Asked' value and a non-float64 column.
asked_of_all = meta['target'].eq('All')
in_multiple_choice = meta['WhichForm'].eq('Multiple_choice')
not_float = meta['dtype'].ne('float64')
feature_all_object = meta.loc[asked_of_all & in_multiple_choice & not_float]

In [None]:
# Display the selected non-float features.
feature_all_object

In [None]:
# (rows, columns) of the selection; the row count is the number of features to plot.
feature_all_object.shape

I plotted all of the features. Yes, 36 graphs! They will show "who is the Kaggler". Let's start!

In [None]:
# Horizontal count plot of the 10 most frequent answers of the first selected feature.
count = 0
fig = plt.figure(figsize=(20, 20))
target_feature = feature_all_object.index[count]
# Keep only the rows whose answer is among the 10 most frequent ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:10]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)
plt.title("{}\nResponse rate: {:.1f}%".format(meta.loc[target_feature, 'Question'],
                                          meta.loc[target_feature, 'Response_rate']))
# Label each bar with its share of the plotted answers (bar length / total plotted).
for p in ax.patches:
    corners = p.get_bbox().get_points()
    x = corners[1, 0]
    y = corners[:, 1]
    ax.annotate('{:.1f}%'.format(100. * x / ncount), (x, y.mean() + 0.1),
                ha='center', va='bottom')

count = count + 1

"Male" accounts for by far the largest share of respondents.

In [None]:
# Pin the feature position explicitly instead of reading the counter left by the
# previous cell, so re-running this cell (or running it out of order) plots the same feature.
count = 1
fig = plt.figure(figsize=(20, 20))
target_feature = feature_all_object.index[count]
# Keep only the rows whose answer is among the 100 most frequent ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:100]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)
plt.title("{}\nResponse rate: {:.1f}%".format(meta.loc[target_feature, 'Question'],
                                          meta.loc[target_feature, 'Response_rate']))
# Label each bar with its share of the plotted answers (bar length / total plotted).
for p in ax.patches:
    corners = p.get_bbox().get_points()
    x = corners[1, 0]
    y = corners[:, 1]
    ax.annotate('{:.1f}%'.format(100. * x / ncount), (x, y.mean() + 0.1),
                ha='center', va='bottom')

count = count + 1  # leave the counter where later cells expect it

The "United States" and "India" have large distributions.

In [None]:
# Pin the feature position explicitly instead of reading the counter left by the
# previous cell, so re-running this cell (or running it out of order) plots the same feature.
count = 2
fig = plt.figure(figsize=(20, 20))
target_feature = feature_all_object.index[count]
# Keep only the rows whose answer is among the 10 most frequent ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:10]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)
plt.title("{}\nResponse rate: {:.1f}%".format(meta.loc[target_feature, 'Question'],
                                          meta.loc[target_feature, 'Response_rate']))
# Label each bar with its share of the plotted answers (bar length / total plotted).
for p in ax.patches:
    corners = p.get_bbox().get_points()
    x = corners[1, 0]
    y = corners[:, 1]
    ax.annotate('{:.1f}%'.format(100. * x / ncount), (x, y.mean() + 0.1),
                ha='center', va='bottom')

count = count + 1  # leave the counter where later cells expect it

Most respondents are employed full-time.

In [None]:
# Pin the feature position explicitly instead of reading the counter left by the
# previous cell, so re-running this cell (or running it out of order) plots the same feature.
count = 3
fig = plt.figure(figsize=(20, 20))
target_feature = feature_all_object.index[count]
# Keep only the rows whose answer is among the 10 most frequent ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:10]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)
plt.title("{}\nResponse rate: {:.1f}%".format(meta.loc[target_feature, 'Question'],
                                          meta.loc[target_feature, 'Response_rate']))
# Label each bar with its share of the plotted answers (bar length / total plotted).
for p in ax.patches:
    corners = p.get_bbox().get_points()
    x = corners[1, 0]
    y = corners[:, 1]
    ax.annotate('{:.1f}%'.format(100. * x / ncount), (x, y.mean() + 0.1),
                ha='center', va='bottom')

count = count + 1  # leave the counter where later cells expect it

Many people are eager to learn "Tensorflow" and "Python" next year.
I'm going to look more deeply at this question based on "Age".

In [None]:
# Top-10 answers to the "tool next year" question, one bar plot per age group.
# The feature is named explicitly so the cell does not depend on whichever
# `target_feature` the previous cell happened to leave behind.
target_feature = 'MLToolNextYearSelect'
for category in ['0~18', '19~25', '26~40', '41~60', '61~']:
    answers = df_multipleChoice.loc[df_multipleChoice['CategorizedAge'] == category, target_feature].dropna()
    if answers.empty:
        continue  # no respondent of this age group answered the question
    fig = plt.figure(figsize=(10, 10))
    top_counts = answers.value_counts().head(10)
    # Name the count column after the feature explicitly: pandas >= 2.0 names the
    # value_counts() result 'count', which made the column lookup by feature name fail.
    temp_results = top_counts.rename(target_feature).to_frame()
    # Share of each answer among all answers given by this age group, rounded to whole percent.
    temp_results['Percent'] = np.round(100 * top_counts / len(answers))
    ax = sns.barplot(x=temp_results[target_feature].values, y=temp_results.index)
    plt.title("{}\nAge category: {}".format(meta.loc[target_feature, 'Question'],
                                              category))

As you can see, "Tensorflow" ranks first in all age groups.
The younger the respondents, the higher Python ranks compared to the other tools. The tools that are useful for commercial and job purposes, such as Google, Amazon, Microsoft and Spark, are more dominant in the working age groups (20~60).

# My suggestion to Kaggle
If possible, how about holding a competition restricted to Tensorflow, or to one other specific tool?

In [None]:
# Pin the feature position explicitly instead of reading the counter left by the
# previous cell, so re-running this cell (or running it out of order) plots the same feature.
count = 4
fig = plt.figure(figsize=(20, 20))
target_feature = feature_all_object.index[count]
# Keep only the rows whose answer is among the 10 most frequent ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:10]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)
plt.title("{}\nResponse rate: {:.1f}%".format(meta.loc[target_feature, 'Question'],
                                          meta.loc[target_feature, 'Response_rate']))
# Label each bar with its share of the plotted answers (bar length / total plotted).
for p in ax.patches:
    corners = p.get_bbox().get_points()
    x = corners[1, 0]
    y = corners[:, 1]
    ax.annotate('{:.1f}%'.format(100. * x / ncount), (x, y.mean() + 0.1),
                ha='center', va='bottom')

count = count + 1  # leave the counter where later cells expect it

Deep learning and neural nets have attracted many Kagglers! Many Kagglers will learn them!

In [None]:
# Pin the feature position explicitly instead of reading the counter left by the
# previous cell, so re-running this cell (or running it out of order) plots the same feature.
count = 5
fig = plt.figure(figsize=(20, 20))
target_feature = feature_all_object.index[count]
# Keep only the rows whose answer is among the 10 most frequent ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:10]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)
plt.title("{}\nResponse rate: {:.1f}%".format(meta.loc[target_feature, 'Question'],
                                          meta.loc[target_feature, 'Response_rate']))
# Label each bar with its share of the plotted answers (bar length / total plotted).
for p in ax.patches:
    corners = p.get_bbox().get_points()
    x = corners[1, 0]
    y = corners[:, 1]
    ax.annotate('{:.1f}%'.format(100. * x / ncount), (x, y.mean() + 0.1),
                ha='center', va='bottom')

count = count + 1  # leave the counter where later cells expect it

Python is fun, efficient, and powerful! How about learning python?

In [None]:
# Pin the feature position explicitly instead of reading the counter left by the
# previous cell, so re-running this cell (or running it out of order) plots the same feature.
count = 6
fig = plt.figure(figsize=(20, 20))
target_feature = feature_all_object.index[count]
# Keep only the rows whose answer is among the 6 most frequent ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:6]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)
plt.title("{}\nResponse rate: {:.1f}%".format(meta.loc[target_feature, 'Question'],
                                          meta.loc[target_feature, 'Response_rate']))
# Label each bar with its share of the plotted answers (bar length / total plotted).
for p in ax.patches:
    corners = p.get_bbox().get_points()
    x = corners[1, 0]
    y = corners[:, 1]
    ax.annotate('{:.1f}%'.format(100. * x / ncount), (x, y.mean() + 0.1),
                ha='center', va='bottom')

count = count + 1  # leave the counter where later cells expect it

In [None]:
# Pin the feature position explicitly instead of reading the counter left by the
# previous cell, so re-running this cell (or running it out of order) plots the same feature.
count = 7
fig = plt.figure(figsize=(12, 12))
target_feature = feature_all_object.index[count]
# Keep only the rows whose answer is among the 10 most frequent ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:10]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)
plt.title("{}\nResponse rate: {:.1f}%".format(meta.loc[target_feature, 'Question'],
                                          meta.loc[target_feature, 'Response_rate']))
# Label each bar with its share of the plotted answers (bar length / total plotted).
for p in ax.patches:
    corners = p.get_bbox().get_points()
    x = corners[1, 0]
    y = corners[:, 1]
    ax.annotate('{:.1f}%'.format(100. * x / ncount), (x, y.mean() + 0.1),
                ha='center', va='bottom')

count = count + 1  # leave the counter where later cells expect it

Kaggle is very helpful for learning data science skills! If we sum up "Kaggle" and "Kaggle, Online courses", Kaggle is the most useful and efficient way to become a data scientist! Actually, I'm learning data science through "Kaggle". Thanks! :)

The many "How useful" plots are below. Let's start!

In [None]:
# Pin the feature position explicitly instead of reading the counter left by the
# previous cell, so re-running this cell (or running it out of order) plots the same feature.
count = 8
fig = plt.figure(figsize=(12, 12))
target_feature = feature_all_object.index[count]
# Keep only the rows whose answer is among the 10 most frequent ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:10]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)
plt.title("{}\nResponse rate: {:.1f}%".format(meta.loc[target_feature, 'Question'],
                                          meta.loc[target_feature, 'Response_rate']))
# Label each bar with its share of the plotted answers (bar length / total plotted).
for p in ax.patches:
    corners = p.get_bbox().get_points()
    x = corners[1, 0]
    y = corners[:, 1]
    ax.annotate('{:.1f}%'.format(100. * x / ncount), (x, y.mean() + 0.1),
                ha='center', va='bottom')

count = count + 1  # leave the counter where later cells expect it

In [None]:
# Pin the feature position explicitly instead of reading the counter left by the
# previous cell, so re-running this cell (or running it out of order) plots the same feature.
count = 9
fig = plt.figure(figsize=(12, 12))
target_feature = feature_all_object.index[count]
# Keep only the rows whose answer is among the 10 most frequent ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:10]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)
plt.title("{}\nResponse rate: {:.1f}%".format(meta.loc[target_feature, 'Question'],
                                          meta.loc[target_feature, 'Response_rate']))
# Label each bar with its share of the plotted answers (bar length / total plotted).
for p in ax.patches:
    corners = p.get_bbox().get_points()
    x = corners[1, 0]
    y = corners[:, 1]
    ax.annotate('{:.1f}%'.format(100. * x / ncount), (x, y.mean() + 0.1),
                ha='center', va='bottom')

count = count + 1  # leave the counter where later cells expect it

In [None]:
# Pin the feature position explicitly instead of reading the counter left by the
# previous cell, so re-running this cell (or running it out of order) plots the same feature.
count = 10
fig = plt.figure(figsize=(12, 12))
target_feature = feature_all_object.index[count]
# Keep only the rows whose answer is among the 10 most frequent ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:10]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)
plt.title("{}\nResponse rate: {:.1f}%".format(meta.loc[target_feature, 'Question'],
                                          meta.loc[target_feature, 'Response_rate']))
# Label each bar with its share of the plotted answers (bar length / total plotted).
for p in ax.patches:
    corners = p.get_bbox().get_points()
    x = corners[1, 0]
    y = corners[:, 1]
    ax.annotate('{:.1f}%'.format(100. * x / ncount), (x, y.mean() + 0.1),
                ha='center', va='bottom')

count = count + 1  # leave the counter where later cells expect it

In [None]:
# Pin the feature position explicitly instead of reading the counter left by the
# previous cell, so re-running this cell (or running it out of order) plots the same feature.
count = 11
fig = plt.figure(figsize=(12, 12))
target_feature = feature_all_object.index[count]
# Keep only the rows whose answer is among the 10 most frequent ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:10]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)
plt.title("{}\nResponse rate: {:.1f}%".format(meta.loc[target_feature, 'Question'],
                                          meta.loc[target_feature, 'Response_rate']))
# Label each bar with its share of the plotted answers (bar length / total plotted).
for p in ax.patches:
    corners = p.get_bbox().get_points()
    x = corners[1, 0]
    y = corners[:, 1]
    ax.annotate('{:.1f}%'.format(100. * x / ncount), (x, y.mean() + 0.1),
                ha='center', va='bottom')

count = count + 1  # leave the counter where later cells expect it

In [None]:
# Pin the feature position explicitly instead of reading the counter left by the
# previous cell, so re-running this cell (or running it out of order) plots the same feature.
count = 12
fig = plt.figure(figsize=(12, 12))
target_feature = feature_all_object.index[count]
# Keep only the rows whose answer is among the 10 most frequent ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:10]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)
plt.title("{}\nResponse rate: {:.1f}%".format(meta.loc[target_feature, 'Question'],
                                          meta.loc[target_feature, 'Response_rate']))
# Label each bar with its share of the plotted answers (bar length / total plotted).
for p in ax.patches:
    corners = p.get_bbox().get_points()
    x = corners[1, 0]
    y = corners[:, 1]
    ax.annotate('{:.1f}%'.format(100. * x / ncount), (x, y.mean() + 0.1),
                ha='center', va='bottom')

count = count + 1  # leave the counter where later cells expect it

In [None]:
# Pin the feature position explicitly instead of reading the counter left by the
# previous cell, so re-running this cell (or running it out of order) plots the same feature.
count = 13
fig = plt.figure(figsize=(12, 12))
target_feature = feature_all_object.index[count]
# Keep only the rows whose answer is among the 10 most frequent ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:10]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)
plt.title("{}\nResponse rate: {:.1f}%".format(meta.loc[target_feature, 'Question'],
                                          meta.loc[target_feature, 'Response_rate']))
# Label each bar with its share of the plotted answers (bar length / total plotted).
for p in ax.patches:
    corners = p.get_bbox().get_points()
    x = corners[1, 0]
    y = corners[:, 1]
    ax.annotate('{:.1f}%'.format(100. * x / ncount), (x, y.mean() + 0.1),
                ha='center', va='bottom')

count = count + 1  # leave the counter where later cells expect it

In [None]:
# Pin the feature position explicitly instead of reading the counter left by the
# previous cell, so re-running this cell (or running it out of order) plots the same feature.
count = 14
fig = plt.figure(figsize=(12, 12))
target_feature = feature_all_object.index[count]
# Keep only the rows whose answer is among the 10 most frequent ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:10]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)
plt.title("{}\nResponse rate: {:.1f}%".format(meta.loc[target_feature, 'Question'],
                                          meta.loc[target_feature, 'Response_rate']))
# Label each bar with its share of the plotted answers (bar length / total plotted).
for p in ax.patches:
    corners = p.get_bbox().get_points()
    x = corners[1, 0]
    y = corners[:, 1]
    ax.annotate('{:.1f}%'.format(100. * x / ncount), (x, y.mean() + 0.1),
                ha='center', va='bottom')

count = count + 1  # leave the counter where later cells expect it

In [None]:
# Pin the feature position explicitly instead of reading the counter left by the
# previous cell, so re-running this cell (or running it out of order) plots the same feature.
count = 15
fig = plt.figure(figsize=(12, 12))
target_feature = feature_all_object.index[count]
# Keep only the rows whose answer is among the 10 most frequent ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:10]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)
plt.title("{}\nResponse rate: {:.1f}%".format(meta.loc[target_feature, 'Question'],
                                          meta.loc[target_feature, 'Response_rate']))
# Label each bar with its share of the plotted answers (bar length / total plotted).
for p in ax.patches:
    corners = p.get_bbox().get_points()
    x = corners[1, 0]
    y = corners[:, 1]
    ax.annotate('{:.1f}%'.format(100. * x / ncount), (x, y.mean() + 0.1),
                ha='center', va='bottom')

count = count + 1  # leave the counter where later cells expect it

In [None]:
# Pin the feature position explicitly instead of reading the counter left by the
# previous cell, so re-running this cell (or running it out of order) plots the same feature.
count = 16
fig = plt.figure(figsize=(12, 12))
target_feature = feature_all_object.index[count]
# Keep only the rows whose answer is among the 10 most frequent ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:10]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)
plt.title("{}\nResponse rate: {:.1f}%".format(meta.loc[target_feature, 'Question'],
                                          meta.loc[target_feature, 'Response_rate']))
# Label each bar with its share of the plotted answers (bar length / total plotted).
for p in ax.patches:
    corners = p.get_bbox().get_points()
    x = corners[1, 0]
    y = corners[:, 1]
    ax.annotate('{:.1f}%'.format(100. * x / ncount), (x, y.mean() + 0.1),
                ha='center', va='bottom')

count = count + 1  # leave the counter where later cells expect it

In [None]:
# Pin the feature position explicitly instead of reading the counter left by the
# previous cell, so re-running this cell (or running it out of order) plots the same feature.
count = 17
fig = plt.figure(figsize=(12, 12))
target_feature = feature_all_object.index[count]
# Keep only the rows whose answer is among the 10 most frequent ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:10]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)
plt.title("{}\nResponse rate: {:.1f}%".format(meta.loc[target_feature, 'Question'],
                                          meta.loc[target_feature, 'Response_rate']))
# Label each bar with its share of the plotted answers (bar length / total plotted).
for p in ax.patches:
    corners = p.get_bbox().get_points()
    x = corners[1, 0]
    y = corners[:, 1]
    ax.annotate('{:.1f}%'.format(100. * x / ncount), (x, y.mean() + 0.1),
                ha='center', va='bottom')

count = count + 1  # leave the counter where later cells expect it

In [None]:
# Pin the feature position explicitly instead of reading the counter left by the
# previous cell, so re-running this cell (or running it out of order) plots the same feature.
count = 18
fig = plt.figure(figsize=(12, 12))
target_feature = feature_all_object.index[count]
# Keep only the rows whose answer is among the 10 most frequent ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:10]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)
plt.title("{}\nResponse rate: {:.1f}%".format(meta.loc[target_feature, 'Question'],
                                          meta.loc[target_feature, 'Response_rate']))
# Label each bar with its share of the plotted answers (bar length / total plotted).
for p in ax.patches:
    corners = p.get_bbox().get_points()
    x = corners[1, 0]
    y = corners[:, 1]
    ax.annotate('{:.1f}%'.format(100. * x / ncount), (x, y.mean() + 0.1),
                ha='center', va='bottom')

count = count + 1  # leave the counter where later cells expect it

In [None]:
# Pin the feature position explicitly instead of reading the counter left by the
# previous cell, so re-running this cell (or running it out of order) plots the same feature.
count = 19
fig = plt.figure(figsize=(12, 12))
target_feature = feature_all_object.index[count]
# Keep only the rows whose answer is among the 10 most frequent ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:10]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)
plt.title("{}\nResponse rate: {:.1f}%".format(meta.loc[target_feature, 'Question'],
                                          meta.loc[target_feature, 'Response_rate']))
# Label each bar with its share of the plotted answers (bar length / total plotted).
for p in ax.patches:
    corners = p.get_bbox().get_points()
    x = corners[1, 0]
    y = corners[:, 1]
    ax.annotate('{:.1f}%'.format(100. * x / ncount), (x, y.mean() + 0.1),
                ha='center', va='bottom')

count = count + 1  # leave the counter where later cells expect it

In [None]:
# Pin the feature position explicitly instead of reading the counter left by the
# previous cell, so re-running this cell (or running it out of order) plots the same feature.
count = 20
fig = plt.figure(figsize=(12, 12))
target_feature = feature_all_object.index[count]
# Keep only the rows whose answer is among the 10 most frequent ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:10]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)
plt.title("{}\nResponse rate: {:.1f}%".format(meta.loc[target_feature, 'Question'],
                                          meta.loc[target_feature, 'Response_rate']))
# Label each bar with its share of the plotted answers (bar length / total plotted).
for p in ax.patches:
    corners = p.get_bbox().get_points()
    x = corners[1, 0]
    y = corners[:, 1]
    ax.annotate('{:.1f}%'.format(100. * x / ncount), (x, y.mean() + 0.1),
                ha='center', va='bottom')

count = count + 1  # leave the counter where later cells expect it

In [None]:
# Pin the feature position explicitly instead of reading the counter left by the
# previous cell, so re-running this cell (or running it out of order) plots the same feature.
count = 21
fig = plt.figure(figsize=(12, 12))
target_feature = feature_all_object.index[count]
# Keep only the rows whose answer is among the 10 most frequent ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:10]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)
plt.title("{}\nResponse rate: {:.1f}%".format(meta.loc[target_feature, 'Question'],
                                          meta.loc[target_feature, 'Response_rate']))
# Label each bar with its share of the plotted answers (bar length / total plotted).
for p in ax.patches:
    corners = p.get_bbox().get_points()
    x = corners[1, 0]
    y = corners[:, 1]
    ax.annotate('{:.1f}%'.format(100. * x / ncount), (x, y.mean() + 0.1),
                ha='center', va='bottom')

count = count + 1  # leave the counter where later cells expect it

In [None]:
# Pin the feature position explicitly instead of reading the counter left by the
# previous cell, so re-running this cell (or running it out of order) plots the same feature.
count = 22
fig = plt.figure(figsize=(12, 12))
target_feature = feature_all_object.index[count]
# Keep only the rows whose answer is among the 10 most frequent ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:10]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)
plt.title("{}\nResponse rate: {:.1f}%".format(meta.loc[target_feature, 'Question'],
                                          meta.loc[target_feature, 'Response_rate']))
# Label each bar with its share of the plotted answers (bar length / total plotted).
for p in ax.patches:
    corners = p.get_bbox().get_points()
    x = corners[1, 0]
    y = corners[:, 1]
    ax.annotate('{:.1f}%'.format(100. * x / ncount), (x, y.mean() + 0.1),
                ha='center', va='bottom')

count = count + 1  # leave the counter where later cells expect it

In [None]:
# Pin the feature position explicitly instead of reading the counter left by the
# previous cell, so re-running this cell (or running it out of order) plots the same feature.
count = 23
fig = plt.figure(figsize=(12, 12))
target_feature = feature_all_object.index[count]
# Keep only the rows whose answer is among the 10 most frequent ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:10]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)
plt.title("{}\nResponse rate: {:.1f}%".format(meta.loc[target_feature, 'Question'],
                                          meta.loc[target_feature, 'Response_rate']))
# Label each bar with its share of the plotted answers (bar length / total plotted).
for p in ax.patches:
    corners = p.get_bbox().get_points()
    x = corners[1, 0]
    y = corners[:, 1]
    ax.annotate('{:.1f}%'.format(100. * x / ncount), (x, y.mean() + 0.1),
                ha='center', va='bottom')

count = count + 1  # leave the counter where later cells expect it

In [None]:
# Pin the feature position explicitly instead of reading the counter left by the
# previous cell, so re-running this cell (or running it out of order) plots the same feature.
count = 24
fig = plt.figure(figsize=(12, 12))
target_feature = feature_all_object.index[count]
# Keep only the rows whose answer is among the 10 most frequent ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:10]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)
plt.title("{}\nResponse rate: {:.1f}%".format(meta.loc[target_feature, 'Question'],
                                          meta.loc[target_feature, 'Response_rate']))
# Label each bar with its share of the plotted answers (bar length / total plotted).
for p in ax.patches:
    corners = p.get_bbox().get_points()
    x = corners[1, 0]
    y = corners[:, 1]
    ax.annotate('{:.1f}%'.format(100. * x / ncount), (x, y.mean() + 0.1),
                ha='center', va='bottom')

count = count + 1  # leave the counter where later cells expect it

In [None]:
# Pin the feature position explicitly instead of reading the counter left by the
# previous cell, so re-running this cell (or running it out of order) plots the same feature.
count = 25
fig = plt.figure(figsize=(12, 12))
target_feature = feature_all_object.index[count]
# Keep only the rows whose answer is among the 10 most frequent ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:10]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)
plt.title("{}\nResponse rate: {:.1f}%".format(meta.loc[target_feature, 'Question'],
                                          meta.loc[target_feature, 'Response_rate']))
# Label each bar with its share of the plotted answers (bar length / total plotted).
for p in ax.patches:
    corners = p.get_bbox().get_points()
    x = corners[1, 0]
    y = corners[:, 1]
    ax.annotate('{:.1f}%'.format(100. * x / ncount), (x, y.mean() + 0.1),
                ha='center', va='bottom')

count = count + 1  # leave the counter where later cells expect it

As you can see in the many "LearningPlatformUsefulness" plots above, all of the platforms and resources tend to be rated "Very useful" or "Somewhat useful". Kaggle has the following result - "Very useful": 62%, "Somewhat useful": 37%, "Not Useful": 0.3%. The platforms & resources that have a similar result to Kaggle are conferences, newsletters, online courses, personal projects, Stack Overflow, and tutoring/mentoring. I think Kaggle offers a mixture of those similar resources for free! Wow! Kaggle does not require money; rather, Kaggle and the competition hosts give us prizes. Wow! (But Kaggle requires a lot of my time. :))

In [None]:
# Horizontal count plot of the ten most frequent answers for the feature at
# position `count`; every bar is labelled with its share of the plotted answers.
fig = plt.figure(figsize=(12, 12))
target_feature = feature_all_object.index[count]

# Keep only the rows whose answer is one of the ten most common ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:10]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]

# One tally provides both the bar order and the percentage denominator.
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)

question = meta.loc[target_feature, 'Question']
response_rate = meta.loc[target_feature, 'Response_rate']
plt.title("{}\nResponse rate: {:.1f}%".format(question, response_rate))

for bar in ax.patches:
    corners = bar.get_bbox().get_points()   # [[x0, y0], [x1, y1]]
    bar_end = corners[1, 0]                 # x1: where the bar ends
    bar_mid = corners[:, 1].mean()          # midpoint between y0 and y1
    ax.annotate('{:.1f}%'.format(100. * bar_end / ncount),
                (bar_end, bar_mid + 0.1), ha='center', va='bottom')

# Advance to the next feature for the following cell.
count += 1

In [None]:
# Horizontal count plot of the ten most frequent answers for the feature at
# position `count`; every bar is labelled with its share of the plotted answers.
fig = plt.figure(figsize=(12, 12))
target_feature = feature_all_object.index[count]

# Keep only the rows whose answer is one of the ten most common ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:10]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]

# One tally provides both the bar order and the percentage denominator.
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)

question = meta.loc[target_feature, 'Question']
response_rate = meta.loc[target_feature, 'Response_rate']
plt.title("{}\nResponse rate: {:.1f}%".format(question, response_rate))

for bar in ax.patches:
    corners = bar.get_bbox().get_points()   # [[x0, y0], [x1, y1]]
    bar_end = corners[1, 0]                 # x1: where the bar ends
    bar_mid = corners[:, 1].mean()          # midpoint between y0 and y1
    ax.annotate('{:.1f}%'.format(100. * bar_end / ncount),
                (bar_end, bar_mid + 0.1), ha='center', va='bottom')

# Advance to the next feature for the following cell.
count += 1

# My Question
What makes someone become a data scientist?

In [None]:
# Horizontal count plot of the ten most frequent answers for the feature at
# position `count`; every bar is labelled with its share of the plotted answers.
fig = plt.figure(figsize=(12, 12))
target_feature = feature_all_object.index[count]

# Keep only the rows whose answer is one of the ten most common ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:10]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]

# One tally provides both the bar order and the percentage denominator.
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)

question = meta.loc[target_feature, 'Question']
response_rate = meta.loc[target_feature, 'Response_rate']
plt.title("{}\nResponse rate: {:.1f}%".format(question, response_rate))

for bar in ax.patches:
    corners = bar.get_bbox().get_points()   # [[x0, y0], [x1, y1]]
    bar_end = corners[1, 0]                 # x1: where the bar ends
    bar_mid = corners[:, 1].mean()          # midpoint between y0 and y1
    ax.annotate('{:.1f}%'.format(100. * bar_end / ncount),
                (bar_end, bar_mid + 0.1), ha='center', va='bottom')

# Advance to the next feature for the following cell.
count += 1

In [None]:
# Horizontal count plot of the ten most frequent answers for the feature at
# position `count`; every bar is labelled with its share of the plotted answers.
fig = plt.figure(figsize=(12, 12))
target_feature = feature_all_object.index[count]

# Keep only the rows whose answer is one of the ten most common ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:10]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]

# One tally provides both the bar order and the percentage denominator.
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)

question = meta.loc[target_feature, 'Question']
response_rate = meta.loc[target_feature, 'Response_rate']
plt.title("{}\nResponse rate: {:.1f}%".format(question, response_rate))

for bar in ax.patches:
    corners = bar.get_bbox().get_points()   # [[x0, y0], [x1, y1]]
    bar_end = corners[1, 0]                 # x1: where the bar ends
    bar_mid = corners[:, 1].mean()          # midpoint between y0 and y1
    ax.annotate('{:.1f}%'.format(100. * bar_end / ncount),
                (bar_end, bar_mid + 0.1), ha='center', va='bottom')

# Advance to the next feature for the following cell.
count += 1

As a Ph.D. student of chemical engineering, I am delighted that there are many engineers on Kaggle!

In [None]:
# Horizontal count plot of the ten most frequent answers for the feature at
# position `count`; every bar is labelled with its share of the plotted answers.
fig = plt.figure(figsize=(12, 12))
target_feature = feature_all_object.index[count]

# Keep only the rows whose answer is one of the ten most common ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:10]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]

# One tally provides both the bar order and the percentage denominator.
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)

question = meta.loc[target_feature, 'Question']
response_rate = meta.loc[target_feature, 'Response_rate']
plt.title("{}\nResponse rate: {:.1f}%".format(question, response_rate))

for bar in ax.patches:
    corners = bar.get_bbox().get_points()   # [[x0, y0], [x1, y1]]
    bar_end = corners[1, 0]                 # x1: where the bar ends
    bar_mid = corners[:, 1].mean()          # midpoint between y0 and y1
    ax.annotate('{:.1f}%'.format(100. * bar_end / ncount),
                (bar_end, bar_mid + 0.1), ha='center', va='bottom')

# Advance to the next feature for the following cell.
count += 1

In [None]:
# Horizontal count plot of the ten most frequent answers for the feature at
# position `count`; every bar is labelled with its share of the plotted answers.
fig = plt.figure(figsize=(12, 12))
target_feature = feature_all_object.index[count]

# Keep only the rows whose answer is one of the ten most common ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:10]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]

# One tally provides both the bar order and the percentage denominator.
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)

question = meta.loc[target_feature, 'Question']
response_rate = meta.loc[target_feature, 'Response_rate']
plt.title("{}\nResponse rate: {:.1f}%".format(question, response_rate))

for bar in ax.patches:
    corners = bar.get_bbox().get_points()   # [[x0, y0], [x1, y1]]
    bar_end = corners[1, 0]                 # x1: where the bar ends
    bar_mid = corners[:, 1].mean()          # midpoint between y0 and y1
    ax.annotate('{:.1f}%'.format(100. * bar_end / ncount),
                (bar_end, bar_mid + 0.1), ha='center', va='bottom')

# Advance to the next feature for the following cell.
count += 1

Amazingly, many Kagglers said "I haven't started working yet". Imagine the era when they start working! They will be well-trained, so they will show high performance! How about recruiting them? (Also, me :))

In [None]:
# Horizontal count plot of the ten most frequent answers for the feature at
# position `count`; every bar is labelled with its share of the plotted answers.
fig = plt.figure(figsize=(12, 12))
target_feature = feature_all_object.index[count]

# Keep only the rows whose answer is one of the ten most common ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:10]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]

# One tally provides both the bar order and the percentage denominator.
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)

question = meta.loc[target_feature, 'Question']
response_rate = meta.loc[target_feature, 'Response_rate']
plt.title("{}\nResponse rate: {:.1f}%".format(question, response_rate))

for bar in ax.patches:
    corners = bar.get_bbox().get_points()   # [[x0, y0], [x1, y1]]
    bar_end = corners[1, 0]                 # x1: where the bar ends
    bar_mid = corners[:, 1].mean()          # midpoint between y0 and y1
    ax.annotate('{:.1f}%'.format(100. * bar_end / ncount),
                (bar_end, bar_mid + 0.1), ha='center', va='bottom')

# Advance to the next feature for the following cell.
count += 1

I admire those who are "Self-taught". Their passion will change the world!

In [None]:
# Horizontal count plot of the ten most frequent answers for the feature at
# position `count`; every bar is labelled with its share of the plotted answers.
fig = plt.figure(figsize=(12, 12))
target_feature = feature_all_object.index[count]

# Keep only the rows whose answer is one of the ten most common ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:10]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]

# One tally provides both the bar order and the percentage denominator.
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)

question = meta.loc[target_feature, 'Question']
response_rate = meta.loc[target_feature, 'Response_rate']
plt.title("{}\nResponse rate: {:.1f}%".format(question, response_rate))

for bar in ax.patches:
    corners = bar.get_bbox().get_points()   # [[x0, y0], [x1, y1]]
    bar_end = corners[1, 0]                 # x1: where the bar ends
    bar_mid = corners[:, 1].mean()          # midpoint between y0 and y1
    ax.annotate('{:.1f}%'.format(100. * bar_end / ncount),
                (bar_end, bar_mid + 0.1), ha='center', va='bottom')

# Advance to the next feature for the following cell.
count += 1

In [None]:
# Two views of the same feature (the one at position `count`): first its ten
# most frequent answers, then only its seven most frequent ones. Both figures
# are built the same way, so the cut-off is the only thing that varies.
target_feature = feature_all_object.index[count]

for top_n in (10, 7):
    fig = plt.figure(figsize=(15, 15))

    # Keep only the rows whose answer is among the `top_n` most common ones.
    top_answers = df_multipleChoice[target_feature].value_counts().index[:top_n]
    temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]

    # One tally provides both the bar order and the percentage denominator.
    answer_counts = temp_data[target_feature].value_counts()
    ncount = answer_counts.sum()
    ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)

    question = meta.loc[target_feature, 'Question']
    response_rate = meta.loc[target_feature, 'Response_rate']
    plt.title("{}\nResponse rate: {:.1f}%".format(question, response_rate))

    for bar in ax.patches:
        corners = bar.get_bbox().get_points()   # [[x0, y0], [x1, y1]]
        bar_end = corners[1, 0]                 # x1: where the bar ends
        bar_mid = corners[:, 1].mean()          # midpoint between y0 and y1
        ax.annotate('{:.1f}%'.format(100. * bar_end / ncount),
                    (bar_end, bar_mid + 0.1), ha='center', va='bottom')

# Advance to the next feature only once, after both figures are drawn.
count += 1

In [None]:
# Horizontal count plot of the ten most frequent answers for the feature at
# position `count`; every bar is labelled with its share of the plotted answers.
fig = plt.figure(figsize=(15, 15))
target_feature = feature_all_object.index[count]

# Keep only the rows whose answer is one of the ten most common ones.
top_answers = df_multipleChoice[target_feature].value_counts().index[:10]
temp_data = df_multipleChoice.loc[df_multipleChoice[target_feature].isin(top_answers)]

# One tally provides both the bar order and the percentage denominator.
answer_counts = temp_data[target_feature].value_counts()
ncount = answer_counts.sum()
ax = sns.countplot(y=target_feature, data=temp_data, order=answer_counts.index)

question = meta.loc[target_feature, 'Question']
response_rate = meta.loc[target_feature, 'Response_rate']
plt.title("{}\nResponse rate: {:.1f}%".format(question, response_rate))

for bar in ax.patches:
    corners = bar.get_bbox().get_points()   # [[x0, y0], [x1, y1]]
    bar_end = corners[1, 0]                 # x1: where the bar ends
    bar_mid = corners[:, 1].mean()          # midpoint between y0 and y1
    ax.annotate('{:.1f}%'.format(100. * bar_end / ncount),
                (bar_end, bar_mid + 0.1), ha='center', va='bottom')

# Advance to the next feature for the following cell.
count += 1

#  Things to do
1. More analysis on the features ("All" and in multiplechoices)
2. Analysis on the features ("Not all" and in multiplechoices)
3. Analysis on the features in freeform