In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
survey_raw = pd.read_csv('/kaggle/input/kaggle-survey-2020/kaggle_survey_2020_responses.csv')

# The first row of the CSV is split off as `questions`;
# every remaining row is kept as the actual data points.
questions = survey_raw.iloc[0]
data = survey_raw.iloc[1:]

# Introduction

This notebook will aim to understand the differences between Data Scientists and Machine Learning Engineers. 

I'm sure that a lot of beginners, myself included, don't truly understand the difference. We read blog posts and Medium articles and see the two titles tossed around, and all we know is that a data scientist is more concerned with data, while a machine learning engineer is more concerned with algorithms and deployment.

Does the distinct separation of the two terms hold out in real life? That's what we are going to *try* to answer here in this notebook. 

I hope you enjoy!

In [None]:
# Respondents whose job title (Q5) is Data Scientist or Machine Learning Engineer.
# .copy() makes this an independent frame: later cells add and overwrite columns
# on it, which would otherwise raise SettingWithCopyWarning against `data`.
ds_mle = data[data.Q5.isin(["Machine Learning Engineer", "Data Scientist"])].copy()

Let's start asking some questions.

## How many data scientists and ml engineers were present in this year's survey?

In [None]:
# Bar chart of respondent counts for the two job titles. Each bar is annotated
# with that title's share of ALL survey respondents (len(data)), not of ds_mle.
title_order = ['Data Scientist', 'Machine Learning Engineer']

plt.figure(figsize=(8, 6))

base_color = sns.color_palette()[0]
# Fix the bar order explicitly so the percentage labels below always line up
# with their bars, whatever order the titles first appear in the data.
ax = sns.countplot(data=ds_mle, x='Q5', color=base_color, order=title_order)

plt.xlabel('Title')
plt.ylabel('Count')
plt.title('Data Scientists and Machine Learning Engineers in 2020')

# annotate percentages
titles_p = data.Q5.value_counts()*100/len(data)  # calculate all titles percentages
ds_mle_p = titles_p[title_order]
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for p, (title, title_p) in zip(ax.patches, ds_mle_p.items()):
    ax.annotate('{:.1f}%'.format(title_p), (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center',
                xytext=(0, 7),
                textcoords='offset points')

# hide the right and top spines
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

Data Scientists make around 13.4% of the dataset, while ML Engineers are only 5.4%.

This signifies the abundance of Data Scientist jobs over ML Engineer's. But why? **Is one job more sophisticated than the other? Does one require more experience than the other?**

We can look at the participants' coding and ML experience to see whether there is, for example, an experience threshold for ML Engineer jobs that makes those opportunities scarcer than Data Scientist ones.

In [None]:
# Bar plot of coding experience (Q6) for the two occupations.
# Absolute frequencies would always be higher for Data Scientists (the larger
# group), so each title's answers are shown as relative frequencies instead.
plt.figure(figsize=(12, 6))

# Relative frequency of each experience level within each title. The denominator
# is len(answers), so respondents who skipped Q6 still count towards the total.
# The index levels and the value column are named explicitly because
# value_counts() labels its result differently across pandas versions
# (pandas >= 2.0 names the index after the column, which makes a bare
# reset_index() collide with the value column).
ds_mle_coding = (
    ds_mle.groupby('Q5')['Q6']
    .apply(lambda answers: answers.value_counts() / len(answers))
    .rename_axis(['Q5', 'level_1'])
    .rename('Q6')
    .reset_index()
)

q6_order = ['I have never written code', '< 1 years', '1-2 years', '3-5 years', '5-10 years', '10-20 years', '20+ years']
ax = sns.barplot(data=ds_mle_coding, x='level_1', y='Q6', hue='Q5', order=q6_order, color='#319ebd')
plt.xlabel('Coding Experience')
plt.ylabel('Relative Frequency')
plt.title('Relative Frequencies of Coding Experiences for Data Scientists and ML Engineers')
plt.legend(title='Job')

# hide the right and top spines
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

It's clear that the distribution is roughly bell-shaped even though the data is binned. But what is clearer now is that the **Data Scientist** title has more coding veterans than the **ML Engineer** title.

The difference isn't huge, but it is persistent once we look at 3-5 years of coding experience and beyond, that the numbers tip in favor of Data Scientists, which actually challenges what I used to believe. 

I thought that ML Engineers needed to have more experience, but it looks like the opposite. My guess for now is that the Data Scientist stack is broader than the ML Engineer one. We can take another look at the ML experience of both occupations to better understand this conundrum.

In [None]:
# Relabel ML-experience answers (Q15) so they match the wording of the coding
# experience answers (Q6). assign() builds a new frame instead of writing into a
# possible slice of `data`, and re-running the cell is a no-op because the
# already-replaced labels are not keys of the mapping.
ds_mle = ds_mle.assign(
    Q15=ds_mle['Q15'].replace(
        {'Under 1 year': '< 1 years', '20 or more years': '20+ years',
         'I do not use machine learning methods': 'No experience'})
)

In [None]:
# Bar plot of ML experience (Q15) as relative frequencies within each job title.
plt.figure(figsize=(12, 6))

# Relative frequency of each experience level within each title (denominator
# includes respondents who skipped Q15). Index levels and the value column are
# named explicitly so the layout does not depend on how the installed pandas
# version labels value_counts() output.
ds_mle_coding = (
    ds_mle.groupby('Q5')['Q15']
    .apply(lambda answers: answers.value_counts() / len(answers))
    .rename_axis(['Q5', 'level_1'])
    .rename('Q15')
    .reset_index()
)

q15_order = ['No experience', '< 1 years', '1-2 years', '2-3 years',
             '3-4 years', '4-5 years', '5-10 years', '10-20 years', '20+ years']
ax = sns.barplot(data=ds_mle_coding, x='level_1', y='Q15', hue='Q5', order=q15_order, color='#319ebd')
plt.xlabel('ML Experience')
plt.ylabel('Relative Frequency')
plt.title('Relative Frequencies of ML Experiences for Data Scientists and ML Engineers')
plt.legend(title='Job')

# hide the right and top spines
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)



The conclusion is the same regarding ML experience. Data Scientists with more experience in ML methods are greater in relative frequency than their fellow ML Engineers. But is that in any way related to their ages?

In [None]:
# Bar plot of age groups (Q1) as relative frequencies within each job title.
plt.figure(figsize=(12, 6))

# Relative frequency of each age group within each title. Index levels and the
# value column are named explicitly so the layout does not depend on how the
# installed pandas version labels value_counts() output.
ds_mle_coding = (
    ds_mle.groupby('Q5')['Q1']
    .apply(lambda answers: answers.value_counts() / len(answers))
    .rename_axis(['Q5', 'level_1'])
    .rename('Q1')
    .reset_index()
)

q1_order = ['18-21', '22-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59', '60-69', '70+']
ax = sns.barplot(data=ds_mle_coding, x='level_1', y='Q1', hue='Q5', order=q1_order, color='#319ebd')
plt.xlabel('Age Group')
plt.ylabel('Relative Frequency')
plt.title('Relative Frequencies of Age Groups for Data Scientists and ML Engineers')
plt.legend(title='Job')

# hide the right and top spines
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

We can still see the trend in ages, as ML Engineers have more relative proportions of population below 30, while Data scientists have more relative proportions of population above 30. I guess that the same would go with education, where Data Scientists will have higher proportions in post grad studies, while ML Engineers will have the opposite. Let's take a look to make sure.

In [None]:
# Bar plot of formal education (Q4) as relative frequencies within each job title.
# Display order of the education answers.
q4_order = ['No formal education past high school', 'Some college/university study without earning a bachelor’s degree',
            'Bachelor’s degree', 'Professional degree', 'Master’s degree', 'Doctoral degree', 'I prefer not to answer']

plt.figure(figsize=(18, 4))

# Relative frequency of each education level within each title. Index levels and
# the value column are named explicitly so the layout does not depend on how the
# installed pandas version labels value_counts() output.
ds_mle_coding = (
    ds_mle.groupby('Q5')['Q4']
    .apply(lambda answers: answers.value_counts() / len(answers))
    .rename_axis(['Q5', 'level_1'])
    .rename('Q4')
    .reset_index()
)

ax = sns.barplot(data=ds_mle_coding, x='level_1', y='Q4', hue='Q5', order=q4_order, color='#319ebd')
plt.xlabel('Education')
plt.ylabel('Relative Frequency')
plt.title('Relative Frequencies of Education for Data Scientists and ML Engineers')
plt.legend(title='Job')
plt.xticks(rotation=90)

# hide the right and top spines
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

Now that we've made some assumptions about the data, it's time to inspect them.

Through the previous questions and visualizations, we've come to think that experienced practitioners make up a larger share of Data Scientists than of ML Engineers. Now how shall we test this? First, we shall look at the number of languages that a typical Data Scientist and ML Engineer works with.

If the previous assumptions were correct, we ought to see a higher number of Data Scientists who are proficient in more languages at the same time. 

In [None]:
q7 = ['Q7_Part_{}'.format(part) for part in range(1, 13)]  # column names Q7_Part_1 .. Q7_Part_12

# calculating the number of languages used on regular basis
def get_languages(row):
    """Join the non-missing values of ``row`` into one comma-separated string.

    Returns the literal "Doesn't code" when every value in the row is missing.
    """
    selected = row.dropna()
    return ', '.join(selected) if not selected.empty else "Doesn't code"
# Add two per-respondent columns derived from the Q7 answer columns:
#   languages_used_count - how many Q7 columns are filled in
#   languages_used       - the filled-in answers joined into one string
# assign() returns a new frame, avoiding SettingWithCopyWarning from writing
# new columns through .loc on what may be a slice of `data`.
ds_mle = ds_mle.assign(
    languages_used_count=ds_mle[q7].notna().sum(axis=1),
    languages_used=ds_mle[q7].apply(get_languages, axis=1),
)

plt.figure(figsize=(18, 6))

# Relative frequency of each language count within each title. Index levels and
# the value column are named explicitly so the layout does not depend on how the
# installed pandas version labels value_counts() output.
ds_mle_coding = (
    ds_mle.groupby('Q5')['languages_used_count']
    .apply(lambda counts: counts.value_counts() / len(counts))
    .rename_axis(['Q5', 'level_1'])
    .rename('languages_used_count')
    .reset_index()
)

# Kept under its own name: `lang_order` is used by later cells for the list of
# language NAMES, and re-running this cell must not overwrite that list.
# Only counts 0..10 are drawn; larger counts, if any, are left off the axis.
lang_count_order = list(range(11))
ax = sns.barplot(data=ds_mle_coding, x='level_1', y='languages_used_count', hue='Q5',
                 order=lang_count_order, color='#319ebd')
plt.xlabel('Number of Languages')
plt.ylabel('Relative Frequency')
plt.title('Relative Frequencies of Regularly Used Languages Count for Data Scientists and ML Engineers')
plt.legend(title='Job', loc='right')
plt.xticks(rotation=90)

# hide the right and top spines
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

Around 50% of Data Scientists use 2 or 3 languages on a regular basis, whereas only around 35% of ML Engineers do. And around 30% of ML Engineers use only 1 language, while the figure for Data Scientists is around 20%.

What are the most common languages that Data Scientists and ML Engineer use respectively?

In [None]:
# One panel per job title: the 20 most frequent language combinations,
# each expressed as a share of that title's respondents.
fig, axes = plt.subplots(1, 2, figsize=(18, 5), sharey=False)

combo_panels = (
    ('Data Scientist', 'Most Regularly Used Language Combinations for Data Scientists'),
    ('Machine Learning Engineer', 'Most Regularly Used Language Combinations for ML Engineers'),
)

for ax, (job, heading) in zip(axes, combo_panels):
    members = ds_mle[ds_mle['Q5'] == job]
    combo_share = members['languages_used'].value_counts() / len(members)
    combo_share.iloc[:20].plot(kind='bar', ax=ax)
    ax.set_title(heading)

    # hide the right and top spines
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    



We can easily see that Data Scientists have a consensus on the most useful languages and they are Python, R, SQL and some Bash in the top language combinations, which could be used for command line data manipulation.

With ML Engineers it gets different. The agreement is that more than 25% of them use solely Python, but then the combinations get different with decreasing proportions. So for example, along Python, some use C, C++, Bash, SQL, MATLAB, Java, Javascript, etc.

We can take a look at the proportion of users in each profession who said that they used a particular language on a regular basis.

In [None]:
def get_lang_freq(group, relative=True, columns=None):
    """Count how often each answer occurs across the multi-select columns of question 7.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to summarise; must contain the answer columns.
    relative : bool, default True
        If True, divide each count by the total number of non-missing answers,
        giving each answer's share of all answers (not of respondents).
    columns : sequence of str, optional
        Answer columns to pool. Defaults to the notebook-level ``q7`` list.

    Returns
    -------
    pandas.Series
        Indexed by answer text, sorted by descending frequency. The Series is
        always named ``'value'`` with an unnamed index, whatever the pandas
        version, so ``groupby(...).apply(...).reset_index()`` yields the
        ``level_1`` / ``value`` columns the plotting cells expect.
    """
    if columns is None:
        columns = q7
    # Pool every selected column into one long column; missing answers are
    # dropped by value_counts().
    answers = pd.melt(group, value_vars=list(columns))['value']
    # pandas >= 2.0 names the result 'count' and the index 'value'; normalise both.
    group_lang_freq = answers.value_counts().rename('value').rename_axis(None)
    if relative:
        group_lang_freq = group_lang_freq/group_lang_freq.sum()
    return group_lang_freq

# Share of each language among all Q7 answers given by each job title.
# The index levels and value column are named explicitly so the frame always has
# the columns Q5 / level_1 / value, whatever labels the installed pandas version
# attaches to value_counts() output.
ds_mle_lang_freq = (
    ds_mle.groupby('Q5')
    .apply(get_lang_freq)
    .rename_axis(['Q5', 'level_1'])
    .rename('value')
    .reset_index()
)

plt.figure(figsize=(18, 5))
ax = sns.barplot(data=ds_mle_lang_freq, x='level_1', y='value', hue='Q5', color='#319ebd')
plt.xlabel('Language')
plt.ylabel('Relative Frequency')
plt.title('Relative Frequencies of Regularly Used Languages for Data Scientists and ML Engineers')
plt.legend(title='Job', loc='right')

# hide the right and top spines
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)


As we can see here, around 90% of Data Scientists use Python, around 60% of them use SQL, and around 40% use R. Among ML Engineers, around 90% use Python as well, but the proportion of users choosing any other language never exceeds 30%.

This means that although a lot of ML Engineers use more than 1 language, they have no consensus on the languages they use besides Python. Where Data Scientists mostly resort to SQL or R, ML Engineers would go for either SQL, C, C++, Java, Javascript or MATLAB.

Is the last remark about ML Engineers in any way related to coding experience? It might be that ML Engineers who have shifted from a more software-oriented job have brought with them some of what they used. So let's look into which languages ML Engineers and Data Scientists with different experiences use regularly.

Since we'll be looking into each of them separately, it might be better to use absolute frequencies to avoid over-interpreting the results.

In [None]:
# Heatmaps of language frequencies, broken down by coding experience (Q6) and by
# ML experience (Q15), separately for ML Engineers and Data Scientists.

# Languages in descending Data-Scientist frequency; the last entry is dropped.
lang_order = ds_mle_lang_freq.query('Q5 == "Data Scientist"')['level_1'].tolist()[:-1]
q6_order = ['< 1 years', '1-2 years', '3-5 years', '5-10 years', '10-20 years', '20+ years']
q15_order = ['No experience', '< 1 years', '1-2 years', '2-3 years', '3-4 years', '4-5 years', '5-10 years', '10-20 years', '20+ years']

relative = True  # shares of all answers within each experience group (rows sum to <= 1)
fmt = '.2f'      # annotation format for the heatmap cells


def _lang_freq_table(job_title, by, row_order):
    """Return a (row_order x lang_order) table of language frequencies for one job title.

    reindex() is used instead of .loc[rows, cols] so that an experience level or
    language that is absent for this title becomes a row/column of zeros rather
    than raising KeyError.
    """
    return (
        ds_mle[ds_mle['Q5'] == job_title]
        .groupby(by)
        .apply(get_lang_freq, relative)
        .unstack()
        .reindex(index=row_order, columns=lang_order)
        .fillna(0)
    )


ds_exp_lang = _lang_freq_table('Data Scientist', 'Q6', q6_order)
mle_exp_lang = _lang_freq_table('Machine Learning Engineer', 'Q6', q6_order)

ds_exp_ml = _lang_freq_table('Data Scientist', 'Q15', q15_order)
mle_exp_ml = _lang_freq_table('Machine Learning Engineer', 'Q15', q15_order)

fig, axes = plt.subplots(2, 2, figsize=(18, 12))
axes = axes.reshape(-1)

# (table, title, y-label or None, x-label or None), in reading order of the 2x2 grid.
heatmap_panels = [
    (mle_exp_lang, 'Languages Frequencies by Coding Experience in ML Engineers', 'Coding Experience', None),
    (ds_exp_lang, 'Languages Frequencies by Coding Experience in Data Scientists', None, None),
    (mle_exp_ml, 'Languages Frequencies by ML Experience in ML Engineers', 'ML Experience', 'Language'),
    (ds_exp_ml, 'Languages Frequencies by ML Experience in Data Scientists', None, 'Language'),
]

for ax, (table, heading, ylabel, xlabel) in zip(axes, heatmap_panels):
    sns.heatmap(table, cmap="YlGnBu",
                square=True,
                linewidth=0.5,
                cbar=False,
                annot=True,
                fmt=fmt,
                ax=ax)
    if ylabel is not None:
        ax.set_ylabel(ylabel)
    if xlabel is not None:
        ax.set_xlabel(xlabel)
    ax.set_title(heading)

I have to remind the reader that we are looking at the relative frequencies of the answers of each group, so these percentages don't represent the actual percentage of users that use a particular language in a specific group; rather, they represent that language's share of all answers to the multiple-selection question for that group. Therefore some insights might be based on trivial numbers that got blown out of proportion.

From the get go we can clearly see that coding veterans don't use python that much for both Data Scientists and ML Engineers.

SQL usage in ML Engineers is uniformly distributed in all groups, as well as in Data Scientists.

C++ usage probability is constant across groups with different coding experiences in ML Engineers, which might signify that a group of some sort opts for using C++ regardless of experience. This doesn't apply when we look at different ML experiences, where it comprises 16% of the answers of users with no ML experience, and decreases until it stabilizes at 9% over 2-5 years of ML experience.

Let's further look into how language preferences differ between groups with different education.

In [None]:
# Heatmaps of language frequencies by formal education (Q4), one per job title.
relative = True  # shares of all answers within each education group
fmt = '.2f'      # annotation format for the heatmap cells


def _lang_freq_by_education(job_title):
    """Return a (q4_order x lang_order) table of language frequencies for one job title.

    reindex() is used instead of .loc[rows, cols] so that an education level or
    language that is absent for this title becomes zeros rather than a KeyError.
    """
    return (
        ds_mle[ds_mle['Q5'] == job_title]
        .groupby('Q4')
        .apply(get_lang_freq, relative)
        .unstack()
        .reindex(index=q4_order, columns=lang_order)
        .fillna(0)
    )


ds_edu_lang = _lang_freq_by_education('Data Scientist')
mle_edu_lang = _lang_freq_by_education('Machine Learning Engineer')

fig, axes = plt.subplots(2, 1, figsize=(18, 12))

sns.heatmap(mle_edu_lang, cmap="YlGnBu",
            square=True,
            linewidth=0.5,
            cbar=False,
            annot=True,
            fmt=fmt,
            ax=axes[0]
           )

# The rows are education levels (Q4), so label the axis accordingly
# (it previously read 'Language Experience').
axes[0].set_ylabel('Education')
axes[0].set_title('Languages Frequencies by Education in ML Engineers');

sns.heatmap(ds_edu_lang, cmap="YlGnBu",
            square=True,
            linewidth=0.5,
            cbar=False,
            annot=True,
            fmt=fmt,
            ax=axes[1]
           )

axes[1].set_title('Languages Frequencies by Education in Data Scientists');

Once again C++ proves that it's valuable for ML Engineers across different groups, also we can see that MATLAB is used by some ML Engineers with PhDs, which they might have picked up during their graduate studies.

In Data Scientists, most groups show similar answer distributions, and there is no valuable insight that hasn't been pointed out before.

Now it's time to look into the difference between Data Scientists and ML Engineers when it comes to ML. First of all, which frameworks do they use?

In [None]:
q16 = data.columns[data.columns.str.contains('Q16')]  # every column whose name contains 'Q16' (the multi-select answer columns of question 16)

def get_multi_question_freq(group, columns, relative=True):
    """Count how often each answer occurs across the columns of a multi-select question.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to summarise; must contain every column in ``columns``.
    columns : sequence of str
        The answer columns of the multi-select question to pool.
    relative : bool, default True
        If True, divide each count by the total number of non-missing answers,
        giving each answer's share of all answers (not of respondents).

    Returns
    -------
    pandas.Series
        Indexed by answer text, sorted by descending frequency. The Series is
        always named ``'value'`` with an unnamed index, whatever the pandas
        version, so ``groupby(...).apply(...).reset_index()`` yields the
        ``level_1`` / ``value`` columns the plotting cells expect.
    """
    # Pool every selected column into one long column; missing answers are
    # dropped by value_counts().
    answers = pd.melt(group, value_vars=list(columns))['value']
    # pandas >= 2.0 names the result 'count' and the index 'value'; normalise both.
    group_multi_question_freq = answers.value_counts().rename('value').rename_axis(None)
    if relative:
        group_multi_question_freq = group_multi_question_freq/group_multi_question_freq.sum()
    return group_multi_question_freq

# Share of each ML framework among all Q16 answers given by each job title.
# Stored under its own name: this cell used to overwrite `ds_mle_lang_freq`,
# which the language heatmap cell reads to build `lang_order`, so re-running
# that cell afterwards would silently use framework names instead of languages.
# Index levels and the value column are named explicitly so the frame always has
# the columns Q5 / level_1 / value regardless of pandas version.
ds_mle_fw_freq = (
    ds_mle.groupby('Q5')
    .apply(get_multi_question_freq, q16)
    .rename_axis(['Q5', 'level_1'])
    .rename('value')
    .reset_index()
)

plt.figure(figsize=(18, 5))
ax = sns.barplot(data=ds_mle_fw_freq, x='level_1', y='value', hue='Q5', color='#319ebd')
plt.xlabel('ML Framework')
plt.ylabel('Relative Frequency')
plt.title('Relative Frequencies of Regularly Machine Learning Frameworks for Data Scientists and ML Engineers')
plt.legend(title='Title', loc='right')

# hide the right and top spines
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

Now we can see a nice distinction between Data Scientists and ML Engineers, which is that ML Engineers tend to use Deep Learning frameworks more than Data Scientists, and Data Scientists tend to prefer regular ML frameworks.

If we look into their algorithms usage, I suppose that we shall see the same. Let's take a look.

In [None]:
q17 = data.columns[data.columns.str.contains('Q17')]  # every column whose name contains 'Q17'

# Share of each ML algorithm among all Q17 answers given by each job title.
# Index levels and the value column are named explicitly so the frame always has
# the columns Q5 / level_1 / value regardless of pandas version.
ds_mle_mla_freq = (
    ds_mle.groupby('Q5')
    .apply(get_multi_question_freq, q17)
    .rename_axis(['Q5', 'level_1'])
    .rename('value')
    .reset_index()
)

# Horizontal bars: the answer texts go on the y axis.
plt.figure(figsize=(10, 6))
ax = sns.barplot(data=ds_mle_mla_freq, y='level_1', x='value', hue='Q5', color='#319ebd')
plt.ylabel('ML Algorithms')
plt.xlabel('Relative Frequency')
plt.title('Relative Frequencies of Regularly Machine Learning Algorithms for Data Scientists and ML Engineers')
plt.legend(title='Title', loc='right')

# hide the right and top spines
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

Now I think that is something to note down. ML Engineers tend to use Neural Networks more heavily than Data Scientists, and that should encourage all beginner Data Scientists, including myself, that our skills can still be useful even if they don't include advanced Deep Learning methods.

In [None]:
q23 = data.columns[data.columns.str.contains('Q23')]  # every column whose name contains 'Q23'

# Share of each activity among all Q23 answers given by each job title.
# Index levels and the value column are named explicitly so the frame always has
# the columns Q5 / level_1 / value regardless of pandas version.
ds_mle_act_freq = (
    ds_mle.groupby('Q5')
    .apply(get_multi_question_freq, q23)
    .rename_axis(['Q5', 'level_1'])
    .rename('value')
    .reset_index()
)

# Horizontal bars: the answer texts go on the y axis.
plt.figure(figsize=(10, 6))
ax = sns.barplot(data=ds_mle_act_freq, y='level_1', x='value', hue='Q5', color='#319ebd')
plt.ylabel('Activities')
plt.xlabel('Relative Frequency')
plt.title('Relative Frequencies of Regularly Activities for Data Scientists and ML Engineers')
plt.legend(title='Title', loc='right')

# hide the right and top spines
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

Machine Learning Engineers tend to do every activity more than Data Scientists, except analyzing and understanding data.

How do women fare in both titles? Does one title favor women's presence over the other?

In [None]:
# Proportion of men vs. women within each job title. Only the 'Man' and 'Woman'
# answers of Q2 are considered, so the two shares of a title sum to 1.
sex = ['Man', 'Woman']


def _man_woman_share(answers):
    """Return the Man/Woman split of one title's Q2 answers as fractions.

    reindex(..., fill_value=0) keeps both labels even if one is absent from the
    group, instead of raising KeyError.
    """
    counts = answers.value_counts().reindex(sex, fill_value=0)
    return counts / counts.sum()


# Index levels and the value column are named explicitly so the frame always has
# the columns Q5 / level_1 / Q2 regardless of pandas version.
ds_mle_sex = (
    ds_mle.groupby('Q5')['Q2']
    .apply(_man_woman_share)
    .rename_axis(['Q5', 'level_1'])
    .rename('Q2')
    .reset_index()
)

plt.figure(figsize=(10, 6))
ax = sns.barplot(data=ds_mle_sex, x='Q5', y='Q2', hue='level_1', color='#319ebd')
plt.xlabel('Title')
plt.ylabel('Proportion')
plt.title('Men and Women Proportions in Data Science and ML Engineering')
# The hue encodes gender, so the legend is titled accordingly (it previously read 'Title').
plt.legend(title='Gender')

# Label every bar with its own height. Reading the value from the bar itself
# guarantees label and bar agree, with no reliance on matching the order of
# ax.patches to a separately sorted frame.
for p in ax.patches:
    share = p.get_height()
    if not share > 0:  # skip empty / NaN patches
        continue
    ax.annotate('{:.1f}%'.format(share*100), (p.get_x() + p.get_width() / 2., share),
                ha='center', va='center',
                xytext=(0, 7),
                textcoords='offset points')

We can see that women's presence is just slightly higher in Data Scientists than in ML Engineers.

What course platforms do DS and MLEs prefer?

In [None]:
q37 = ds_mle.columns[ds_mle.columns.str.startswith('Q37')]  # every column whose name starts with 'Q37'

# Share of each course platform among all Q37 answers given by each job title.
# Index levels and the value column are named explicitly so the frame always has
# the columns Q5 / level_1 / value regardless of pandas version.
ds_mle_course_freq = (
    ds_mle.groupby('Q5')
    .apply(get_multi_question_freq, q37)
    .rename_axis(['Q5', 'level_1'])
    .rename('value')
    .reset_index()
)

# Horizontal bars: the answer texts go on the y axis.
plt.figure(figsize=(10, 8))
ax = sns.barplot(data=ds_mle_course_freq, y='level_1', x='value', hue='Q5', color='#319ebd')

# Label each bar with its own length. The previous loop iterated over
# enumerate(ax.patches) (so `p` was a tuple), formatted an undefined `c`, and
# used vertical-bar geometry. For horizontal bars the value is the bar's WIDTH
# and the label belongs at the bar's right end, centred on its height.
for bar in ax.patches:
    share = bar.get_width()
    if not share > 0:  # skip empty / NaN patches
        continue
    ax.annotate('{:.1f}%'.format(share*100), (share, bar.get_y() + bar.get_height() / 2.),
                ha='left', va='center',
                xytext=(4, 0),
                textcoords='offset points')

plt.ylabel('Course Platforms')
plt.xlabel('Relative Frequency')
plt.title('Relative Frequencies of Course Platforms for Data Scientists and ML Engineers')
plt.legend(title='Title', loc='right')

# hide the right and top spines
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

### Correspondence Analysis

In [None]:
# Chi2 test of independence between education (Q4) and Q8, computed by hand
# and cross-checked against scipy.
from scipy.stats import chi2_contingency

q4_order = ['No formal education past high school', 'Some college/university study without earning a bachelor’s degree',
            'Bachelor’s degree', 'Professional degree', 'Master’s degree', 'Doctoral degree', 'I prefer not to answer']
# Contingency table of counts: one row per education level (in q4_order),
# one column per distinct Q8 answer.
q4_q8 = pd.pivot_table(data, index=['Q4'], columns=['Q8'], aggfunc='size', fill_value=0).loc[q4_order]

grandTotal = np.sum(q4_q8.values)
rowsTotal = np.sum(q4_q8, axis=1).values
colsTotal = np.sum(q4_q8, axis=0).values

# Expected counts under independence: (row total * column total) / grand total.
expected = np.outer(rowsTotal, colsTotal)/grandTotal

# Pearson statistic: sum over cells of (observed - expected)^2 / expected.
chi_squared = np.sum(np.square(q4_q8.values - expected)/expected)

print(chi_squared)

# correction=False disables Yates' continuity correction, so scipy computes the
# same uncorrected Pearson statistic as the manual calculation above.
statistic, prob, dof, ex = chi2_contingency(q4_q8, correction=False)

print(statistic)

# The two computations must agree; fail loudly if they ever diverge.
assert np.isclose(chi_squared, statistic), "manual chi2 disagrees with scipy"

In [None]:
# Chi2 distance between the education levels (rows of q4_q8).
#
# Each row is first turned into a profile (row / row total). The chi2 distance
# between two profiles weights every squared difference by the inverse of that
# COLUMN's mass (column total / grand total):
#     d(i, j) = sqrt( sum_k (p_ik - p_jk)^2 / c_k )
# The previous version divided by the row sums of the profiles, which are all 1,
# so it produced a plain Euclidean distance instead.
counts = q4_q8.values
norm_q4_q8 = counts / counts.sum(axis=1, keepdims=True)   # row profiles
col_masses = counts.sum(axis=0) / counts.sum()            # average profile; assumes no all-zero column

n_rows = q4_q8.shape[0]

# Pairwise differences via broadcasting: shape (n_rows, n_rows, n_cols).
profile_diffs = norm_q4_q8[:, None, :] - norm_q4_q8[None, :, :]
chi2_distances = np.sqrt(np.sum(np.square(profile_diffs) / col_masses, axis=2))

# Display as integers (distance * 100), labelled on both axes.
pd.DataFrame(data=np.round(chi2_distances*100).astype(int), index=q4_q8.index, columns=q4_q8.index)

In [None]:
# Dimensions of the Q4 x Q8 contingency table: (education levels, distinct Q8 answers).
q4_q8.shape