# Categorical free text

In [None]:
import pandas as pd
# Import spreadsheet cleaning loader
from load_survey_data import load_data, headers

In [None]:
# Workbook to read and the timestamp format string; both are handed to
# load_data when the responses are loaded.
sheet = 'results.xlsx'
datetime_format = '%Y-%m-%d %H:%M:%S %Z'

In [None]:
# Load the data from `sheet` with the project's load_data helper, passing the
# timestamp format string; the result is the frame every later cell reads from.
df = load_data(sheet, datetime_format)

In [None]:
from analysis import categories, categories_per_answer

In [None]:
from tabulate import tabulate

In [None]:
# Review switch: set to True to make the cells guarded by this flag print the
# raw answers selected by `drop_rows` for each question. Leave False otherwise.
show_dropped_rows = False

In [None]:
# Start with a column-less frame that carries a copy of df's index; one
# column per question is assigned into it further down.
df_categorised = pd.DataFrame(index=df.index.copy())

## 7.a. Which non-desktop interfaces are important to you?

In [None]:
# Column handled in this section, and the answers stored under it.
question = "7.a. Which non-desktop interfaces are important to you?"
answers = df[question]
# Words passed to the analysis helpers as `keep_words` for question 7.a.
keep_words = [
    'beam', 'cassandra', 'commandline', 'docker', 'flink', 'geode',
    'git', 'hadoop', 'impala', 'jupyter', 'kafka', 'linux',
    'nifi', 'omero', 'phoenix', 'rstudio', 'solr', 'spark',
    'tinkerpop', 'vscode', 'xnat',
    # NOTE(review): presumably Apache Zeppelin; spelling left untouched
    # because it may mirror the raw answers — confirm before changing.
    'zepplin',
]

# Patterns passed as `synonyms`, keyed by the keep-word they belong to.
# Several entries are regular expressions, hence the raw strings.
synonyms = {
    'commandline': [r'command\s*line', 'cli'],
    # NOTE(review): 'git' lists itself as a synonym — looks redundant, verify.
    'git': ['github', 'gitlab', 'git'],
    'jupyter': [r'jupyter.*'],
    'rstudio': ['rserver'],
}

# Row labels handed to the helpers as `drop_rows` for this question.
drop_rows = [
    8, 11, 14, 18, 20, 23, 29, 34, 35, 36,
    46, 64, 71, 75, 92, 93, 94, 97, 100, 104,
]

In [None]:
# Run the `categories` helper on this question's answers and display the
# result rounded to one decimal place.
categories(
    answers=answers, keep_words=keep_words,
    synonyms=synonyms, drop_rows=drop_rows,
).round(decimals=1)

In [None]:
# Optional review aid: print the answers selected by `drop_rows`, one per line.
if show_dropped_rows:
    print("\n".join(answers[drop_rows]))

In [None]:
# Run `categories_per_answer` for this question, print the result as a
# GitHub-style table and keep it as this question's column in df_categorised.
dfc = categories_per_answer(
    answers=answers, keep_words=keep_words,
    synonyms=synonyms, drop_rows=drop_rows,
)
print(tabulate(dfc, ['id', 'interfaces'], tablefmt='github'))
df_categorised[question] = dfc

## 8.a. Which programming languages are important to you?

In [None]:
# Column handled in this section, and the answers stored under it.
question = "8.a. Which programming languages are important to you?"
answers = df[question]
# Words passed to the analysis helpers as `keep_words` for question 8.a.
keep_words = [
    'python', 'r', 'stata', 'julia', 'javascript',
    'c#', 'java', 'c++', 'sql', 'html',
    'shell', 'spark', 'pyspark', 'matlab', 'cuda',
    'go', 'spss', 'terraform', 'c', 'groovy',
]

# Patterns passed as `synonyms`, keyed by the keep-word they belong to.
synonyms = {
    'shell': ['bash', 'zsh'],
    'sql': ['mysql', 'postgres', 'postgresql'],
    # NOTE(review): this pattern would also cover 'pyspark', which is a
    # keep-word of its own — confirm which one the helpers give priority.
    'spark': [r'.*spark'],
}

# Row labels handed to the helpers as `drop_rows` for this question.
drop_rows = [8, 34, 47, 67, 70, 94, 101, 104]

In [None]:
# Run the `categories` helper on this question's answers and display the
# result rounded to one decimal place.
categories(
    answers=answers, keep_words=keep_words,
    synonyms=synonyms, drop_rows=drop_rows,
).round(decimals=1)

In [None]:
# Optional review aid: print the answers selected by `drop_rows`, one per line.
if show_dropped_rows:
    print("\n".join(answers[drop_rows]))

In [None]:
# Run `categories_per_answer` for this question, print the result as a
# GitHub-style table and keep it as this question's column in df_categorised.
dfc = categories_per_answer(
    answers=answers,
    keep_words=keep_words,
    synonyms=synonyms,
    drop_rows=drop_rows,
)
print(tabulate(
    dfc,
    # Header names this question's subject; it previously read 'interfaces',
    # a label carried over from the 7.a cell.
    ['id', 'languages'],
    tablefmt='github',
))
df_categorised[question] = dfc

## 9.a. Which repositories are important to you?

In [None]:
# Column handled in this section, and the answers stored under it.
question = "9.a. Which repositories are important to you?"
answers = df[question]
# Words passed to the analysis helpers as `keep_words` for question 9.a.
keep_words = [
    'cran', 'pypi', 'anaconda', 'kaggle', 'github', 'dockerhub',
    'quay', 'conda-forge', 'bioconductor', 'bioconda', 'distribution',
]

# Patterns passed as `synonyms`, keyed by the keep-word they belong to.
# Anything containing a backslash must be a raw string: in a normal string
# literal '\b' is the backspace character, not the regex word boundary.
synonyms = {
    # Stand-alone "r" (word boundaries on both sides).
    'cran': [r'\br\b'],
    'pypi': ['python'],
    # A literal "*conda", or "conda" at a word start not followed by "-".
    'anaconda': [r'\*conda', r'\bconda(?!-)'],
    'github': ['git'],
    'dockerhub': ['docker hub'],
    'distribution': ['apt', 'linux'],
}

# Row labels handed to the helpers as `drop_rows` for this question.
drop_rows = [
    3, 8, 12, 22, 28, 31, 35, 36, 41, 47, 54, 58,
    64, 67, 70, 71, 72, 88, 91, 92, 94, 99, 101, 104,
]

In [None]:
# Run the `categories` helper on this question's answers and display the
# result rounded to one decimal place.
categories(
    answers=answers, keep_words=keep_words,
    synonyms=synonyms, drop_rows=drop_rows,
).round(decimals=1)

In [None]:
# Optional review aid: print the answers selected by `drop_rows`, one per line.
if show_dropped_rows:
    print("\n".join(answers[drop_rows]))

In [None]:
# Run `categories_per_answer` for this question, print the result as a
# GitHub-style table and keep it as this question's column in df_categorised.
dfc = categories_per_answer(
    answers=answers,
    keep_words=keep_words,
    synonyms=synonyms,
    drop_rows=drop_rows,
)
print(tabulate(
    dfc,
    # Header names this question's subject; it previously read 'interfaces',
    # a label carried over from the 7.a cell.
    ['id', 'repositories'],
    tablefmt='github',
))
df_categorised[question] = dfc

## 10.a. Which commercially licenced software is important to you?

In [None]:
# Column handled in this section, and the answers stored under it.
question = "10.a. Which commercially licenced software is important to you?"
answers = df[question]
# Words passed to the analysis helpers as `keep_words` for question 10.a.
keep_words = [
    'sas', 'office', 'stata', 'spss', 'powerbi', 'tableau', 'matlab',
    'jasp', 'arcgis', 'nvivo', 'gitkraken', 'mathematica', 'maple', 'photoshop',
]

# Patterns passed as `synonyms`, keyed by the keep-word they belong to.
synonyms = {
    'powerbi': ['bi'],
    'office': ['excel'],
}

# Row labels handed to the helpers as `drop_rows` for this question.
drop_rows = [8, 19, 22, 31, 36, 46, 64, 70, 71, 94, 99, 104]

In [None]:
# Run the `categories` helper on this question's answers and display the
# result rounded to one decimal place.
categories(
    answers=answers, keep_words=keep_words,
    synonyms=synonyms, drop_rows=drop_rows,
).round(decimals=1)

In [None]:
# Optional review aid: print the answers selected by `drop_rows`, one per line.
if show_dropped_rows:
    print("\n".join(answers[drop_rows]))

In [None]:
# Run `categories_per_answer` for this question, print the result as a
# GitHub-style table and keep it as this question's column in df_categorised.
dfc = categories_per_answer(
    answers=answers,
    keep_words=keep_words,
    synonyms=synonyms,
    drop_rows=drop_rows,
)
print(tabulate(
    dfc,
    # Header names this question's subject; it previously read 'interfaces',
    # a label carried over from the 7.a cell.
    ['id', 'software'],
    tablefmt='github',
))
df_categorised[question] = dfc

In [None]:
# Append ' (converted)' to every column name, then write the columns to
# categorised.csv. index=False leaves the index out of the file.
df_categorised.columns = [
    f'{column} (converted)' for column in df_categorised.columns
]
df_categorised.to_csv('categorised.csv', index=False)