In [94]:
# Set up environment

# Display the value of every top-level expression statement in a cell, not just
# the last one. Later cells rely on this: they place several bare expressions
# (e.g. consecutive value_counts() calls) in one cell and expect each to be shown.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import numpy as np
import seaborn as sns

In [95]:
# Load the survey export; a cell containing a single space is read as
# missing (NaN), in addition to pandas' default NA markers.
post = pd.read_csv(
    'post_at2weeks_all.csv',
    na_values=' ',
)

In [96]:
# Check that missing values were detected on load: any column whose
# "Non-Null Count" is below the total number of entries contains NaN.
post.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 576 entries, 0 to 575
Data columns (total 78 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   StartDate              576 non-null    object 
 1   EndDate                576 non-null    object 
 2   Progress               576 non-null    int64  
 3   Duration__in_seconds_  576 non-null    int64  
 4   Finished               576 non-null    int64  
 5   RecordedDate           576 non-null    object 
 6   ExternalReference      576 non-null    object 
 7   Q2_1                   546 non-null    float64
 8   Q2_2                   545 non-null    float64
 9   Q2_3                   546 non-null    float64
 10  Q3_1                   545 non-null    float64
 11  Q3_2                   545 non-null    float64
 12  Q3_3                   545 non-null    float64
 13  Q3_4                   545 non-null    float64
 14  Q4_1                   544 non-null    float64
 15  Q4_2  

In [97]:
# Columns that exist only in the pre-modified version of the survey,
# grouped one question number per line.
premod_cols = [
    'Q9_2',
    'Q11_8',
    'Q12_1', 'Q12_2', 'Q12_3',
    'Q13_1', 'Q13_2', 'Q13_3',
    'Q14_2', 'Q14_4', 'Q14_5', 'Q14_6', 'Q14_8',
]

In [98]:
# Respondents with Survey_Type == 1 took the modified survey, so the
# pre-modified-only columns do not apply to them: code those cells -888.
took_modified = post['Survey_Type'] == 1
post.loc[took_modified, premod_cols] = -888

In [99]:
# Check that all 560 who took modified are -888 in every pre-modified-only column.
# Iterate over the column list itself rather than range(13), so the check stays in
# sync with premod_cols if columns are added or removed.
# The bare expression is echoed on every pass because the notebook sets
# ast_node_interactivity = "all".
for col in premod_cols:
    post[col].value_counts()
    

-888.0    560
 5.0       13
 4.0        3
Name: Q9_2, dtype: int64

-888.0    560
 1.0        7
 2.0        6
 3.0        3
Name: Q11_8, dtype: int64

-888.0    560
 2.0       16
Name: Q12_1, dtype: int64

-888.0    560
 2.0       10
 1.0        6
Name: Q12_2, dtype: int64

-888.0    560
 1.0        8
 2.0        7
 0.0        1
Name: Q12_3, dtype: int64

-888.0    560
 4.0        8
 3.0        5
 5.0        2
 2.0        1
Name: Q13_1, dtype: int64

-888.0    560
 5.0        8
 4.0        5
 3.0        3
Name: Q13_2, dtype: int64

-888.0    560
 4.0       10
 5.0        5
 3.0        1
Name: Q13_3, dtype: int64

-888.0    560
 4.0        8
 5.0        7
 3.0        1
Name: Q14_2, dtype: int64

-888.0    560
 5.0        9
 4.0        7
Name: Q14_4, dtype: int64

-888.0    560
 5.0        9
 4.0        6
 3.0        1
Name: Q14_5, dtype: int64

-888.0    560
 5.0        9
 4.0        7
Name: Q14_6, dtype: int64

-888.0    560
 5.0        9
 4.0        6
 3.0        1
Name: Q14_8, dtype: int64

In [100]:
# Check to make sure there were no missing values in the pre-modified-only columns.
# value_counts() skips NaN, so each sum should equal the total row count (576).
# Iterate over the column list itself rather than range(13), so the check stays in
# sync with premod_cols if columns are added or removed.
for col in premod_cols:
    post[col].value_counts().sum()

576

576

576

576

576

576

576

576

576

576

576

576

576

In [101]:
# Q5 is not conditional on another question, so a blank Q5 is a genuinely
# missing answer: code it -999. Distributions are shown before and after.
post['Q5'].value_counts()
q5_blank = post['Q5'].isna()
post.loc[q5_blank, 'Q5'] = -999
post['Q5'].value_counts()

1.0    524
0.0     22
Name: Q5, dtype: int64

 1.0      524
-999.0     30
 0.0       22
Name: Q5, dtype: int64

In [102]:
# Q6 does not apply to respondents who answered Q5 == 1: code it -888.

q5_yes = post['Q5'] == 1
post.loc[q5_yes, 'Q6'].value_counts()  # before: expected to be empty (no Q6 answers)
post.loc[q5_yes, 'Q6'] = -888
post.loc[q5_yes, 'Q6'].value_counts()  # after: every such row is -888

Series([], Name: Q6, dtype: int64)

-888.0    524
Name: Q6, dtype: int64

In [103]:
# If Q5 == 0, then Q6 should have a value (0,1,2,3,333). Find the missing values

q5_no = post['Q5'] == 0
post.loc[q5_no, 'Q6']  # inspect: a NaN here is a skipped answer (row 521 in this export)
# Flag the skipped answers by condition instead of by a hard-coded row label,
# so the cell stays correct if the export is refreshed or its rows are re-ordered.
post.loc[q5_no & post['Q6'].isna(), 'Q6'] = -999
post['Q6'].value_counts()

59       2.0
259      3.0
325      2.0
330      2.0
331    333.0
333    333.0
347      2.0
352      2.0
356      2.0
372    333.0
382    333.0
407      2.0
501      2.0
502      3.0
515    333.0
520    333.0
521      NaN
530      3.0
538    333.0
545      2.0
563    333.0
569    333.0
Name: Q6, dtype: float64

-888.0    524
 333.0      9
 2.0        9
 3.0        3
-999.0      1
Name: Q6, dtype: int64

In [104]:
# Any Q6 cell that is still blank at this point is treated as missing: code it -999.
q6_blank = post['Q6'].isna()
post.loc[q6_blank, 'Q6'] = -999

In [105]:
# Inspect the distribution of Q6 after recoding; the counts should now
# include the -888 (not applicable) and -999 (missing) sentinel codes.

post['Q6'].value_counts()

-888.0    524
-999.0     31
 333.0      9
 2.0        9
 3.0        3
Name: Q6, dtype: int64

In [106]:
# Q6_333_TEXT holds the free-text answer that goes with Q6 == 333.
# Compare the number of non-null texts overall with the number among
# Q6 == 333 rows; equal counts mean no text sits on any other row.

other_text = post['Q6_333_TEXT']
other_text.value_counts().sum()
other_text[post['Q6'] == 333].value_counts().sum()

9

9

In [107]:
# Every Q6 == 333 row has its text, so a blank Q6_333_TEXT means "not applicable"
# (-888), except on rows where Q6 itself is missing, which are coded -999.
# The -999 rows are written first so the fill below leaves them alone.

q6_missing = post['Q6'] == -999
post.loc[q6_missing, 'Q6_333_TEXT'] = -999
post['Q6_333_TEXT'] = post['Q6_333_TEXT'].fillna(-888)

In [108]:
# Inspect Q6_333_TEXT after recoding: the free-text answers should appear
# alongside the -888 and -999 sentinel codes.
post['Q6_333_TEXT'].value_counts()


-888                                                                            536
-999                                                                             31
Virtual training. Waiting for a kit to be sent to me.                             1
I will decide whether or not to pick up a kit when in-person college resumes      1
It was a virtual class.  I'm under the impression one will be mailed to me.       1
training was online, kit will be mailed to me                                     1
the session is online and I hope you will send me one :)                          1
It will be mailed to me.                                                          1
I requested one to be sent to me but haven't received it yet                      1
Virtual learning due to the pandemic, would like one sent to me!                  1
I believe one should be mailed to me. The session was online.                     1
Name: Q6_333_TEXT, dtype: int64

In [109]:
# Write the current state of the frame to check.csv without the row index
# (presumably an intermediate snapshot for manual inspection).
post.to_csv('check.csv',index=False)

In [110]:
# Every NaN still left in the frame after the question-specific handling
# is treated as a genuinely missing answer: code it -999 in all columns.

post = post.fillna(-999)

In [111]:
# Confirm that no NaN remains: every column's "Non-Null Count" should
# now equal the total number of entries.
post.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 576 entries, 0 to 575
Data columns (total 78 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   StartDate              576 non-null    object 
 1   EndDate                576 non-null    object 
 2   Progress               576 non-null    int64  
 3   Duration__in_seconds_  576 non-null    int64  
 4   Finished               576 non-null    int64  
 5   RecordedDate           576 non-null    object 
 6   ExternalReference      576 non-null    object 
 7   Q2_1                   576 non-null    float64
 8   Q2_2                   576 non-null    float64
 9   Q2_3                   576 non-null    float64
 10  Q3_1                   576 non-null    float64
 11  Q3_2                   576 non-null    float64
 12  Q3_3                   576 non-null    float64
 13  Q3_4                   576 non-null    float64
 14  Q4_1                   576 non-null    float64
 15  Q4_2  

In [112]:
# Export the dataset with missing values handled (-888 / -999 codes applied),
# omitting the row index.

post.to_csv('post_mv_handled.csv',index=False)

In [114]:
# Order rows by Q2_1 (ascending, the default) so equal codes sit together;
# presumably the -999 rows group at the top for a missing-pattern review.
post_patterns = post.sort_values(by='Q2_1')

In [115]:
# Write the Q2_1-sorted frame to mv_pattern_check.csv without the row index.
post_patterns.to_csv('mv_pattern_check.csv',index=False)