In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## This notebook contains my attempt to perform a *statistical analysis* over different groups of kagglers.

<img src="https://miro.medium.com/max/1400/1*eOxxYV0nui_R9HrC4eRFqw.png" style="width: 70%; margin: auto;">

In [None]:
# Load the 2021 Kaggle survey responses from the attached input dataset.
df_2021 = pd.read_csv(
    filepath_or_buffer='/kaggle/input/kaggle-survey-2021/kaggle_survey_2021_responses.csv',
)

### Let's pick a question to test this idea. I'll pick one where I'm interested in seeing the difference between two populations: Egypt and the US.

So the question would be **What is the difference between the activities that an ML Engineer in Egypt does versus an ML Engineer in the USA?**

### How would we proceed to answer this question?

The easy way is to fire up seaborn or matplotlib and compare the proportions of each activity for the two countries.

### But I want to try another way..

And that is to calculate the difference in proportions for each activity between the two groups, then calculate a confidence interval for that difference. I've always studied statistics, but I never used it here on Kaggle (or anywhere for that matter), so why not start using it right now?

#### Let's see what we need to answer that question.
1. First we need the proportion of each activity within each of the two groups
2. Second we need to calculate the difference between the two proportions
3. Then we need to calculate the confidence interval for the statistic obtained

#### And what would that show us? It would emphasize the activities that differ between the two groups, which might be just a glorified way of looking at the barplot of the two groups' activities, but hey this is Data Science yo, we are supposed to use statistics in here so wise up.

I'll attempt to make handy functions as I go so that I can answer more questions using this way.

In [None]:
# List the distinct values of Q3 (country) so the exact labels to filter on are known
pd.unique(df_2021['Q3'])

In [None]:
# List the distinct values of Q5 (role) to find the exact label used for ML engineers
pd.unique(df_2021['Q5'])

In [None]:
# Keep only the respondents from the two countries of interest (Q3)
# whose role (Q5) is Machine Learning Engineer
in_target_country = df_2021['Q3'].isin(['Egypt', 'United States of America'])
is_ml_engineer = df_2021['Q5'] == 'Machine Learning Engineer'
analysis_df = df_2021.loc[in_target_country & is_ml_engineer]
analysis_df.head()

In [None]:
# Keep only the columns we need: the country (Q3) and the activity answers (Q24*).
# The original column order is preserved, so Q3 stays ahead of the Q24 columns.
wanted_columns = [
    column_name
    for column_name in analysis_df.columns
    if column_name == 'Q3' or column_name.startswith('Q24')
]
analysis_df = analysis_df[wanted_columns]
analysis_df.head()

In [None]:
# Number of respondents in each country group (the sample size n of each group)
analysis_count = (
    analysis_df
    .groupby('Q3')
    .size()
)
analysis_count

In [None]:
# Proportion of respondents in each country who selected each Q24 activity.
# This treats a non-null cell as "option selected" (assumes the multi-select
# answer columns hold the option text when ticked and NaN otherwise), so the
# mean of the not-null mask per group is the selection rate.
#
# The grouping column is removed *before* grouping and the groups are keyed by
# the Q3 Series. The previous form used groupby(...).apply(...) followed by
# .drop('Q3', axis=1): operating on the grouping column inside apply is
# deprecated in pandas 2.2, and once the column is excluded the drop raises
# KeyError. This form yields the same table on every pandas version.
analysis_prop = (
    analysis_df
    .drop(columns='Q3')
    .notna()
    .groupby(analysis_df['Q3'])
    .mean()
)
analysis_prop

In [None]:
# Row-wise difference between the two groups' proportions for every activity.
# diff() subtracts the previous row from each row, so the first row becomes NaN
# and the 'United States of America' row holds (its proportion - the row above);
# that row is relabelled 'diff'.
analysis_diff = (
    analysis_prop
    .diff()
    .rename({'United States of America': 'diff'}, axis='index')
)
analysis_diff

In [None]:
# Per-group sampling variance of each proportion: p * (1 - p) / n.
# NOTE: despite the variable name this is a variance, not a standard deviation;
# a square root is required before it can be used as a standard error.
bernoulli_variance = analysis_prop * (1 - analysis_prop)
analysis_sd = bernoulli_variance.div(analysis_count, axis=0)
analysis_sd

In [None]:
# Variance of the difference between the two proportions, per activity:
# the sum over both groups of p * (1 - p) / n.
# NOTE: this is still a variance; its square root is the standard error of the difference.
analysis_sd_diff = analysis_sd.sum(axis=0)
analysis_sd_diff

In [None]:
# 95% confidence interval (normal / Wald approximation) for the difference
# between the two proportions of every activity:
#
#     diff +/- z * sqrt(p1 * (1 - p1) / n1 + p2 * (1 - p2) / n2)
#
# `analysis_sd_diff` holds the summed p * (1 - p) / n terms, i.e. the variance
# of the difference, so its square root is the standard error. The multiplier
# is the two-sided 95% critical value of the standard normal (about 1.96), not
# the confidence level 0.95 itself. Multiplying the variance by 0.95 yields
# intervals that are far too narrow.
Z_95 = 1.96
analysis_se_diff = np.sqrt(analysis_sd_diff)
analysis_margin = Z_95 * analysis_se_diff

# The 'diff' row holds the point estimate of the difference for each activity.
analysis_point_diff = analysis_diff.loc['diff']

# Upper limit of the 95% confidence interval
analysis_diff_cd_upper = (analysis_point_diff + analysis_margin).rename('upper_cd')

# Lower limit of the 95% confidence interval
analysis_diff_cd_lower = (analysis_point_diff - analysis_margin).rename('lower_cd')

analysis_diff_cd_upper

In [None]:
# Get the answer text that each activity column stands for.
# Each answer column is reduced to a single scalar: its first non-null value,
# or NaN when nobody in the filtered subset selected that option. Returning one
# scalar per column keeps the result rectangular. Collecting `unique()` arrays
# instead produces ragged lengths as soon as one column is entirely empty, at
# which point pandas returns a Series of arrays and the later transpose/join fail.
# The country column is dropped by name rather than by position.
answer_columns = analysis_df.drop(columns='Q3')
analysis_values = (
    answer_columns
    .apply(lambda column: column.dropna().iloc[0] if column.notna().any() else np.nan)
    .to_frame(name='values')
    .T
)
analysis_values

In [None]:
# Assemble one row per activity: answer text, per-country proportions,
# the difference between them, and the upper/lower confidence limits.
analysis_final_df = (
    analysis_values.T
    .join(analysis_prop.T)
    .join(analysis_diff.loc['diff'])
    .join(analysis_diff_cd_upper)
    .join(analysis_diff_cd_lower)
)

analysis_final_df

In [None]:
import seaborn as sns
# Shade the summary table with a light-to-green colour map
cm = sns.light_palette(color="seagreen", as_cmap=True)

analysis_final_df.style.background_gradient(cmap=cm)

### Now we can see very clearly for example that ML Engineers in Egypt don't actually do ML that much in their jobs when compared to USA ML Engineers.

#### **Building prototypes to explore applying machine learning to new areas**

**24%** of ***ML Engineers*** in Egypt list **Building ML prototypes** as part of their daily work, while **73%** of US ML Engineers listed it. The confidence interval of the difference between the two proportions is **49% - 50.1%**.

#### **Experimentation and iteration to improve existing ML models**

**22%** of ***ML Engineers*** in Egypt list **Experimentation of ML models** as part of their daily work, while **69.8%** of US ML Engineers listed it. The confidence interval of the difference between the two proportions is **47.2% - 48.3%**.


In [None]:
# Show the first row of the raw survey frame as a {column: value} mapping.
# NOTE(review): presumably this row holds the question text for each column — confirm against the CSV.
first_survey_row = df_2021.iloc[0]
first_survey_row.to_dict()