# read and transform Google Form data (checkboxes, multiple choice grid items)
* Google Form: https://forms.gle/7a35kfYmzyJJb5GD9
* Google Sheets: https://docs.google.com/spreadsheets/d/1W0EaP8WGWaOK8XYDQA3Z4HIZaOQXXYj6U6kzqMuVrUI/edit#gid=1695829581

---
* author:  [Prasert Kanawattanachai](prasert.k@chula.ac.th)
* YouTube: https://www.youtube.com/prasertcbs
* github: https://github.com/prasertcbs/
* [Chulalongkorn Business School](https://www.cbs.chula.ac.th/en/)
---

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
print(f'pandas version: {pd.__version__}')
print(f'numpy  version: {np.__version__}')
print(pd.Timestamp.now())

pandas version: 1.2.1
numpy  version: 1.19.2
2021-02-20 09:20:15.132975


In [54]:
# google sheets generated by google form
# https://docs.google.com/spreadsheets/d/1W0EaP8WGWaOK8XYDQA3Z4HIZaOQXXYj6U6kzqMuVrUI/edit#gid=1695829581
# note: replace 'edit#' with 'export?format=xlsx&'
gs_url='https://docs.google.com/spreadsheets/d/1W0EaP8WGWaOK8XYDQA3Z4HIZaOQXXYj6U6kzqMuVrUI/edit#gid=1695829581'
url=re.sub('edit#', 'export?format=xlsx&', gs_url)
# url='https://docs.google.com/spreadsheets/d/1W0EaP8WGWaOK8XYDQA3Z4HIZaOQXXYj6U6kzqMuVrUI/export?format=xlsx&gid=1695829581'
df = pd.read_excel(url)
df

Unnamed: 0,Timestamp,gender,age (years),skills,satisfaction [product],satisfaction [price],satisfaction [service],satisfaction [overall]
0,2021-02-19 22:36:35.982,Male,24.0,"Word, Excel, PowerPoint, Access",very dissatisfied,dissatisfied,neutral,satisfied
1,2021-02-19 22:38:17.913,Male,22.0,"Excel, SQL",very satisfied,satisfied,neutral,dissatisfied
2,2021-02-19 22:49:14.802,Prefer not to say,35.0,"Word, Excel, PowerPoint, SPSS",very dissatisfied,dissatisfied,neutral,dissatisfied
3,2021-02-19 23:01:24.057,Female,23.0,"Word, Excel, PowerPoint, SQL, Python, R, JavaS...",very satisfied,satisfied,neutral,dissatisfied
4,2021-02-20 08:27:19.584,Female,27.0,"Word, Excel, PowerPoint",neutral,satisfied,very satisfied,very satisfied
5,2021-02-20 08:27:36.059,Female,19.0,"Excel, Python, R",satisfied,satisfied,satisfied,satisfied
6,2021-02-20 08:28:19.725,Prefer not to say,30.0,"SQL, R, SPSS, SAS",dissatisfied,satisfied,neutral,very satisfied
7,2021-02-20 09:16:02.448,Prefer not to say,24.0,PowerPoint,satisfied,very satisfied,satisfied,very satisfied
8,2021-02-20 09:22:23.851,Male,,Word,very dissatisfied,dissatisfied,neutral,
9,2021-02-20 09:23:22.259,,,Excel,,,,satisfied


In [55]:
df.columns

Index(['Timestamp', 'gender', 'age (years)', 'skills',
       'satisfaction [product]', 'satisfaction [price]',
       'satisfaction [service]', 'satisfaction [overall]'],
      dtype='object')

In [56]:
df.columns.str.replace(r'(satisfaction|\[|\])', '', regex=True).str.strip()

Index(['Timestamp', 'gender', 'age (years)', 'skills', 'product', 'price',
       'service', 'overall'],
      dtype='object')

In [57]:
df.columns = ['timestamp', 'gender', 'age', 'skills', 'product', 'price', 'service', 'overall']
df

Unnamed: 0,timestamp,gender,age,skills,product,price,service,overall
0,2021-02-19 22:36:35.982,Male,24.0,"Word, Excel, PowerPoint, Access",very dissatisfied,dissatisfied,neutral,satisfied
1,2021-02-19 22:38:17.913,Male,22.0,"Excel, SQL",very satisfied,satisfied,neutral,dissatisfied
2,2021-02-19 22:49:14.802,Prefer not to say,35.0,"Word, Excel, PowerPoint, SPSS",very dissatisfied,dissatisfied,neutral,dissatisfied
3,2021-02-19 23:01:24.057,Female,23.0,"Word, Excel, PowerPoint, SQL, Python, R, JavaS...",very satisfied,satisfied,neutral,dissatisfied
4,2021-02-20 08:27:19.584,Female,27.0,"Word, Excel, PowerPoint",neutral,satisfied,very satisfied,very satisfied
5,2021-02-20 08:27:36.059,Female,19.0,"Excel, Python, R",satisfied,satisfied,satisfied,satisfied
6,2021-02-20 08:28:19.725,Prefer not to say,30.0,"SQL, R, SPSS, SAS",dissatisfied,satisfied,neutral,very satisfied
7,2021-02-20 09:16:02.448,Prefer not to say,24.0,PowerPoint,satisfied,very satisfied,satisfied,very satisfied
8,2021-02-20 09:22:23.851,Male,,Word,very dissatisfied,dissatisfied,neutral,
9,2021-02-20 09:23:22.259,,,Excel,,,,satisfied


## recode gender 

In [58]:
d = {'Female': 'F',
     'Male': 'M',
     'Prefer not to say': 'X'}

df.gender.map(d, na_action='ignore')

0      M
1      M
2      X
3      F
4      F
5      F
6      X
7      X
8      M
9    NaN
Name: gender, dtype: object

In [59]:
df.gender=df.gender.map(d, na_action='ignore')

In [60]:
df

Unnamed: 0,timestamp,gender,age,skills,product,price,service,overall
0,2021-02-19 22:36:35.982,M,24.0,"Word, Excel, PowerPoint, Access",very dissatisfied,dissatisfied,neutral,satisfied
1,2021-02-19 22:38:17.913,M,22.0,"Excel, SQL",very satisfied,satisfied,neutral,dissatisfied
2,2021-02-19 22:49:14.802,X,35.0,"Word, Excel, PowerPoint, SPSS",very dissatisfied,dissatisfied,neutral,dissatisfied
3,2021-02-19 23:01:24.057,F,23.0,"Word, Excel, PowerPoint, SQL, Python, R, JavaS...",very satisfied,satisfied,neutral,dissatisfied
4,2021-02-20 08:27:19.584,F,27.0,"Word, Excel, PowerPoint",neutral,satisfied,very satisfied,very satisfied
5,2021-02-20 08:27:36.059,F,19.0,"Excel, Python, R",satisfied,satisfied,satisfied,satisfied
6,2021-02-20 08:28:19.725,X,30.0,"SQL, R, SPSS, SAS",dissatisfied,satisfied,neutral,very satisfied
7,2021-02-20 09:16:02.448,X,24.0,PowerPoint,satisfied,very satisfied,satisfied,very satisfied
8,2021-02-20 09:22:23.851,M,,Word,very dissatisfied,dissatisfied,neutral,
9,2021-02-20 09:23:22.259,,,Excel,,,,satisfied


In [61]:
df['product'].map({'very dissatisfied':1, 'dissatisfied':2, 'neutral':3, 'satisfied':4, 'very satisfied':5}, na_action='ignore')

0    1.0
1    5.0
2    1.0
3    5.0
4    3.0
5    4.0
6    2.0
7    4.0
8    1.0
9    NaN
Name: product, dtype: float64

In [62]:
pd.Categorical(df['product'], 
               categories=['very dissatisfied', 'dissatisfied', 'neutral', 'satisfied', 'very satisfied'], ordered=True)

['very dissatisfied', 'very satisfied', 'very dissatisfied', 'very satisfied', 'neutral', 'satisfied', 'dissatisfied', 'satisfied', 'very dissatisfied', NaN]
Categories (5, object): ['very dissatisfied' < 'dissatisfied' < 'neutral' < 'satisfied' < 'very satisfied']

In [63]:
df['product']

0    very dissatisfied
1       very satisfied
2    very dissatisfied
3       very satisfied
4              neutral
5            satisfied
6         dissatisfied
7            satisfied
8    very dissatisfied
9                  NaN
Name: product, dtype: object

In [64]:
for c in ['product', 'price', 'service', 'overall']:
    df[f'{c}_n']=df[c].map({'very dissatisfied':1, 'dissatisfied':2, 'neutral':3, 'satisfied':4, 'very satisfied':5})    
    df[c]=pd.Categorical(df[c], 
               categories=['very dissatisfied', 'dissatisfied', 'neutral', 'satisfied', 'very satisfied'], ordered=True)  
    df[f'{c}_cat']=df[c].cat.codes + 1

In [65]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   timestamp    10 non-null     datetime64[ns]
 1   gender       9 non-null      object        
 2   age          8 non-null      float64       
 3   skills       10 non-null     object        
 4   product      9 non-null      category      
 5   price        9 non-null      category      
 6   service      9 non-null      category      
 7   overall      9 non-null      category      
 8   product_n    9 non-null      float64       
 9   product_cat  10 non-null     int8          
 10  price_n      9 non-null      float64       
 11  price_cat    10 non-null     int8          
 12  service_n    9 non-null      float64       
 13  service_cat  10 non-null     int8          
 14  overall_n    9 non-null      float64       
 15  overall_cat  10 non-null     int8          
dtypes: category

In [66]:
df['product'].cat.codes + 1

0    1
1    5
2    1
3    5
4    3
5    4
6    2
7    4
8    1
9    0
dtype: int8

In [67]:
df['overall'].cat.codes

0    3
1    1
2    1
3    1
4    4
5    3
6    4
7    4
8   -1
9    3
dtype: int8

In [68]:
df['overall_n'].mean()

3.6666666666666665

In [69]:
(df['overall'].cat.codes + 1).mean()

3.3

In [70]:
df[df['overall'] > 'neutral']

Unnamed: 0,timestamp,gender,age,skills,product,price,service,overall,product_n,product_cat,price_n,price_cat,service_n,service_cat,overall_n,overall_cat
0,2021-02-19 22:36:35.982,M,24.0,"Word, Excel, PowerPoint, Access",very dissatisfied,dissatisfied,neutral,satisfied,1.0,1,2.0,2,3.0,3,4.0,4
4,2021-02-20 08:27:19.584,F,27.0,"Word, Excel, PowerPoint",neutral,satisfied,very satisfied,very satisfied,3.0,3,4.0,4,5.0,5,5.0,5
5,2021-02-20 08:27:36.059,F,19.0,"Excel, Python, R",satisfied,satisfied,satisfied,satisfied,4.0,4,4.0,4,4.0,4,4.0,4
6,2021-02-20 08:28:19.725,X,30.0,"SQL, R, SPSS, SAS",dissatisfied,satisfied,neutral,very satisfied,2.0,2,4.0,4,3.0,3,5.0,5
7,2021-02-20 09:16:02.448,X,24.0,PowerPoint,satisfied,very satisfied,satisfied,very satisfied,4.0,4,5.0,5,4.0,4,5.0,5
9,2021-02-20 09:23:22.259,,,Excel,,,,satisfied,,0,,0,,0,4.0,4


In [71]:
df[df['overall_n'] > 3]

Unnamed: 0,timestamp,gender,age,skills,product,price,service,overall,product_n,product_cat,price_n,price_cat,service_n,service_cat,overall_n,overall_cat
0,2021-02-19 22:36:35.982,M,24.0,"Word, Excel, PowerPoint, Access",very dissatisfied,dissatisfied,neutral,satisfied,1.0,1,2.0,2,3.0,3,4.0,4
4,2021-02-20 08:27:19.584,F,27.0,"Word, Excel, PowerPoint",neutral,satisfied,very satisfied,very satisfied,3.0,3,4.0,4,5.0,5,5.0,5
5,2021-02-20 08:27:36.059,F,19.0,"Excel, Python, R",satisfied,satisfied,satisfied,satisfied,4.0,4,4.0,4,4.0,4,4.0,4
6,2021-02-20 08:28:19.725,X,30.0,"SQL, R, SPSS, SAS",dissatisfied,satisfied,neutral,very satisfied,2.0,2,4.0,4,3.0,3,5.0,5
7,2021-02-20 09:16:02.448,X,24.0,PowerPoint,satisfied,very satisfied,satisfied,very satisfied,4.0,4,5.0,5,4.0,4,5.0,5
9,2021-02-20 09:23:22.259,,,Excel,,,,satisfied,,0,,0,,0,4.0,4


In [72]:
df['product_cat']

0    1
1    5
2    1
3    5
4    3
5    4
6    2
7    4
8    1
9    0
Name: product_cat, dtype: int8

In [73]:
df['product'].cat.codes

0    0
1    4
2    0
3    4
4    2
5    3
6    1
7    3
8    0
9   -1
dtype: int8

In [74]:
df['product'][0]

'very dissatisfied'

In [75]:
ds=df.skills.str.get_dummies(', ')
ds

Unnamed: 0,Access,Excel,JavaScript,PowerPoint,Python,R,SAS,SPSS,SQL,Word
0,1,1,0,1,0,0,0,0,0,1
1,0,1,0,0,0,0,0,0,1,0
2,0,1,0,1,0,0,0,1,0,1
3,0,1,1,1,1,1,0,0,1,1
4,0,1,0,1,0,0,0,0,0,1
5,0,1,0,0,1,1,0,0,0,0
6,0,0,0,0,0,1,1,1,1,0
7,0,0,0,1,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,1
9,0,1,0,0,0,0,0,0,0,0


In [76]:
dt=pd.concat([df, ds], axis=1).copy()
dt

Unnamed: 0,timestamp,gender,age,skills,product,price,service,overall,product_n,product_cat,...,Access,Excel,JavaScript,PowerPoint,Python,R,SAS,SPSS,SQL,Word
0,2021-02-19 22:36:35.982,M,24.0,"Word, Excel, PowerPoint, Access",very dissatisfied,dissatisfied,neutral,satisfied,1.0,1,...,1,1,0,1,0,0,0,0,0,1
1,2021-02-19 22:38:17.913,M,22.0,"Excel, SQL",very satisfied,satisfied,neutral,dissatisfied,5.0,5,...,0,1,0,0,0,0,0,0,1,0
2,2021-02-19 22:49:14.802,X,35.0,"Word, Excel, PowerPoint, SPSS",very dissatisfied,dissatisfied,neutral,dissatisfied,1.0,1,...,0,1,0,1,0,0,0,1,0,1
3,2021-02-19 23:01:24.057,F,23.0,"Word, Excel, PowerPoint, SQL, Python, R, JavaS...",very satisfied,satisfied,neutral,dissatisfied,5.0,5,...,0,1,1,1,1,1,0,0,1,1
4,2021-02-20 08:27:19.584,F,27.0,"Word, Excel, PowerPoint",neutral,satisfied,very satisfied,very satisfied,3.0,3,...,0,1,0,1,0,0,0,0,0,1
5,2021-02-20 08:27:36.059,F,19.0,"Excel, Python, R",satisfied,satisfied,satisfied,satisfied,4.0,4,...,0,1,0,0,1,1,0,0,0,0
6,2021-02-20 08:28:19.725,X,30.0,"SQL, R, SPSS, SAS",dissatisfied,satisfied,neutral,very satisfied,2.0,2,...,0,0,0,0,0,1,1,1,1,0
7,2021-02-20 09:16:02.448,X,24.0,PowerPoint,satisfied,very satisfied,satisfied,very satisfied,4.0,4,...,0,0,0,1,0,0,0,0,0,0
8,2021-02-20 09:22:23.851,M,,Word,very dissatisfied,dissatisfied,neutral,,1.0,1,...,0,0,0,0,0,0,0,0,0,1
9,2021-02-20 09:23:22.259,,,Excel,,,,satisfied,,0,...,0,1,0,0,0,0,0,0,0,0


In [77]:
dt.columns

Index(['timestamp', 'gender', 'age', 'skills', 'product', 'price', 'service',
       'overall', 'product_n', 'product_cat', 'price_n', 'price_cat',
       'service_n', 'service_cat', 'overall_n', 'overall_cat', 'Access',
       'Excel', 'JavaScript', 'PowerPoint', 'Python', 'R', 'SAS', 'SPSS',
       'SQL', 'Word'],
      dtype='object')

In [78]:
dt.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,8.0,25.5,5.042675,19.0,22.75,24.0,27.75,35.0
product_n,9.0,2.888889,1.691482,1.0,1.0,3.0,4.0,5.0
product_cat,10.0,2.6,1.837873,0.0,1.0,2.5,4.0,5.0
price_n,9.0,3.444444,1.130388,2.0,2.0,4.0,4.0,5.0
price_cat,10.0,3.1,1.523884,0.0,2.0,4.0,4.0,5.0
service_n,9.0,3.444444,0.726483,3.0,3.0,3.0,4.0,5.0
service_cat,10.0,3.1,1.286684,0.0,3.0,3.0,3.75,5.0
overall_n,9.0,3.666667,1.322876,2.0,2.0,4.0,5.0,5.0
overall_cat,10.0,3.3,1.702939,0.0,2.0,4.0,4.75,5.0
Access,10.0,0.1,0.316228,0.0,0.0,0.0,0.0,1.0


In [79]:
dt.loc[:, 'Access':'Word'].sum()

Access        1
Excel         7
JavaScript    1
PowerPoint    5
Python        2
R             3
SAS           1
SPSS          2
SQL           3
Word          5
dtype: int64

In [80]:
dt.loc[:, 'Access':'Word'].sum(axis=1)

0    4
1    2
2    4
3    7
4    3
5    3
6    4
7    1
8    1
9    1
dtype: int64