In [36]:
import pandas as pd
import os
from consolidation import group_columns, consolidate_mc #these are custom functions, see consolidation.py

In [37]:
# set data dir: relative path of the folder that the input file is listed from
# and loaded from in the cells that follow
data_dir = "../../AI_perception_survey_data/CN"

In [38]:
# get file name(s): every non-hidden entry in data_dir.
# os.listdir returns entries in arbitrary order, so the list is sorted to make
# fname[0] refer to the same file on every run.
fname = sorted(file for file in os.listdir(data_dir) if not file.startswith("."))

In [39]:
# read the first listed file in data_dir into a DataFrame, decoding it as UTF-8
data = pd.read_csv(
    os.path.join(data_dir, fname[0]),
    encoding='utf-8',
)

In [40]:
# setting vars to represent the questions for easy ref.
# group_columns is a custom helper (see consolidation.py); the result is used
# below as a mapping from a question-number string ('1', '2', ...) to a sized
# collection -- presumably the column names belonging to that question
# (TODO confirm against consolidation.py)
questions = group_columns(data)

In [41]:
# these are the easy ref keys for the column groups
# (one key per question number, stored as strings)
questions.keys()

dict_keys(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29'])

In [42]:
# work out which questions are spread over more than one column
# (those are the ones that need consolidating)
cols_to_consolidate = [
    q_key
    for q_key, q_cols in questions.items()
    if len(q_cols) > 1
]
print(cols_to_consolidate)

['2', '10', '11', '14', '15', '18', '19', '21', '24']


## consolidating for all multiple choice questions in one go

In [43]:
# add one consolidated column per multi-column question
for col in cols_to_consolidate:
    # consolidated answers for question `col`, as returned by consolidate_mc
    q_answers = consolidate_mc(data, questions, col)
    # stored alongside the original columns under the name 'q<number>'
    data['q{}'.format(col)] = q_answers

In [44]:
# inspect the column index after the loop above has added the 'q<number>' columns
data.columns

Index(['编号', '开始答题时间', '结束答题时间', 'QQ', '微信标识', '自定义字段', '1.您如何评价您的人工智能技术的知识？',
       '2.关于人工智能的性质或者能力的阐述，以下哪些是错误的？:安卓或者iOS智能手机上通常都有人工智能应用程序',
       '2.关于人工智能的性质或者能力的阐述，以下哪些是错误的？:人工智能在现阶段只存在于学术科研之中，尚未广泛的商业化',
       '2.关于人工智能的性质或者能力的阐述，以下哪些是错误的？:人工智能是连接起来的多台电脑',
       ...
       'Unnamed: 94', 'q2', 'q10', 'q11', 'q14', 'q15', 'q18', 'q19', 'q21',
       'q24'],
      dtype='object', length=104)

In [45]:
# show the consolidated question numbers again, to compare with the new 'q<number>' columns
cols_to_consolidate

['2', '10', '11', '14', '15', '18', '19', '21', '24']

OK, so all columns were consolidated properly: every question in `cols_to_consolidate` has a matching `q` column in the output above.

## removing the consolidated columns and renaming the remaining questions for consistency

In [46]:
# columns to keep: everything whose prefix before the first '.' is not the
# number of a question that has just been consolidated
new_cols = [
    name
    for name in data.columns
    if name.partition('.')[0] not in cols_to_consolidate
]

In [47]:
# removing that weird column that's just null.
# Filtering (rather than list.remove) keeps this cell safe to re-run: remove()
# raises ValueError once 'Unnamed: 94' is no longer in the list.
new_cols = [col for col in new_cols if col != 'Unnamed: 94']

In [48]:
# keep only the selected columns; take an explicit copy because the result is
# modified in later cells, and mutating a selection of another frame can
# trigger pandas' SettingWithCopyWarning
data = data[new_cols].copy()

In [49]:
# rename the single choice questions: a name containing '.' becomes 'q' plus
# whatever precedes the first '.'; every other name is left untouched
data.columns = [
    'q' + name.split('.')[0] if '.' in name else name
    for name in data.columns
]

In [50]:
# removing the other unwanted columns: 'QQ', '自定义字段'.
# Reassigning the result (instead of inplace=True) avoids mutating a frame that
# was derived from a column selection, which can raise SettingWithCopyWarning.
data = data.drop(columns=['QQ', '自定义字段'])

In [51]:
# zero-pad single digit question columns ('q1' -> 'q01') so that a plain
# string sort puts the questions in numeric order
data.columns = [
    'q0' + name[1:] if name[0] == 'q' and len(name) < 3 else name
    for name in data.columns
]

In [52]:
# put the columns into sorted (string) order
data = data.loc[:, sorted(data.columns)]

In [53]:
# inspect the column index after renaming and sorting
data.columns

Index(['q01', 'q02', 'q03', 'q04', 'q05', 'q06', 'q07', 'q08', 'q09', 'q10',
       'q11', 'q12', 'q13', 'q14', 'q15', 'q16', 'q17', 'q18', 'q19', 'q20',
       'q21', 'q22', 'q23', 'q24', 'q25', 'q26', 'q27', 'q28', 'q29', '开始答题时间',
       '微信标识', '结束答题时间', '编号'],
      dtype='object')

In [54]:
# translating the cols into English cos I don't like switching between language inputs.
# An explicit old-name -> new-name mapping is used instead of slicing by
# position, so the result does not depend on there being exactly 29 question
# columns or on how the Chinese names happen to sort; names without an entry
# (the q.. columns, or names that are already translated) pass through unchanged.
# NOTE(review): '微信标识' looks closer to "WeChat identifier" than "icon"; the
# existing English name is kept so later cells keep working -- confirm.
col_translations = {
    '开始答题时间': 'begin_time',
    '微信标识': 'wechat_icon',
    '结束答题时间': 'end_time',
    '编号': 'index',
}
translated_cols = [col_translations.get(col, col) for col in data.columns]

In [55]:
# apply the English names; translated_cols must contain exactly one name per
# existing column, in column order
data.columns = translated_cols

## removing text in the answers and only keeping the indices

# let's save this for further analysis