In [2]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

In [3]:
# 1) read file
# Load the weekly survey export; the path is relative to the notebook's working directory.
survey_csv = "weekly_survey.csv"
df = pd.read_csv(survey_csv)

In [4]:
# Preview the first five rows to sanity-check the column layout after loading.
df.head(n=5)

Unnamed: 0,participant,timestamp,health,physical_activity,TILS_1,TILS_2,TILS_3,GSL,RELAT_LS,WORK_LS,...,LS_questionnaire_start_timestamp,GSL_answer_timestamp,RELAT_LS_answer_timestamp,WORK_LS_answer_timestamp,FINAN_LS_answer_timestamp,TIME_LS_answer_timestamp,PHYS_LS_answer_timestamp,MENT_LS_answer_timestamp,AREA_LS_answer_timestamp,timestamp_UTC
0,AMdzA,2023-07-14 18:13:56.325378,good,4-6,1,1,1,3,3,3,...,2023-07-14 18:13:33.337415,2023-07-14 18:13:36.426144,2023-07-14 18:13:38.220378,2023-07-14 18:13:39.877151,2023-07-14 18:13:41.086108,2023-07-14 18:13:42.276108,2023-07-14 18:13:43.442794,2023-07-14 18:13:46.001410,2023-07-14 18:13:55.671045,1689351000.0
1,AMdzA,2023-07-21 18:36:44.500888,average,2-4,1,1,2,3,3,3,...,2023-07-21 18:36:22.498511,2023-07-21 18:36:26.771177,2023-07-21 18:36:30.885939,2023-07-21 18:36:32.951297,2023-07-21 18:36:34.467258,2023-07-21 18:36:38.032074,2023-07-21 18:36:40.089914,2023-07-21 18:36:41.413901,2023-07-21 18:36:43.380462,1689957000.0
2,AMdzA,2023-07-28 18:06:02.695196,good,2-4,1,1,1,3,3,3,...,2023-07-28 18:05:44.527749,2023-07-28 18:05:47.944449,2023-07-28 18:05:49.760616,2023-07-28 18:05:50.950229,2023-07-28 18:05:52.226086,2023-07-28 18:05:55.380653,2023-07-28 18:05:57.179513,2023-07-28 18:05:59.758403,2023-07-28 18:06:01.660400,1690560000.0
3,AMdzA,2023-08-04 20:13:30.889408,good,4-6,2,2,2,3,3,3,...,2023-08-04 20:13:21.486518,2023-08-04 20:13:22.936974,2023-08-04 20:13:24.235056,2023-08-04 20:13:25.217003,2023-08-04 20:13:26.215915,2023-08-04 20:13:27.156646,2023-08-04 20:13:28.097279,2023-08-04 20:13:29.012215,2023-08-04 20:13:30.077153,1691173000.0
4,ASTAT,2023-07-14 18:09:14.872259,excellent,4-6,1,1,1,3,4,3,...,2023-07-14 18:08:53.608367,2023-07-14 18:08:56.287203,2023-07-14 18:09:02.829701,2023-07-14 18:09:06.233196,2023-07-14 18:09:08.856177,2023-07-14 18:09:10.356432,2023-07-14 18:09:11.653088,2023-07-14 18:09:13.032075,2023-07-14 18:09:14.382855,1689351000.0


In [5]:
# 2) identify missing / wrong data
# Count the NaN cells in every column; a non-zero entry marks a column that needs imputing.
missing_per_column = df.isna().sum()
print(missing_per_column)

participant                             0
timestamp                               0
health                                  0
physical_activity                       0
TILS_1                                  0
TILS_2                                  0
TILS_3                                  0
GSL                                     0
RELAT_LS                                0
WORK_LS                                 0
FINAN_LS                                0
TIME_LS                                 0
PHYS_LS                                 0
MENT_LS                                 0
AREA_LS                                 0
survey_start_timestamp                  0
survey_end_timestamp                    0
health_questionnaire_start_timestamp    0
health_answer_timestamp                 0
physical_activity_answer_timestamp      0
TILS_questionnaire_start_timestamp      0
TILS_1_answer_timestamp                 0
TILS_2_answer_timestamp                 0
TILS_3_answer_timestamp           

In [8]:
# 3) correct missing data (simple example: using mean)
# Select every numeric column, learn each column's mean, then fill the NaNs with it.
num_cols = df.select_dtypes(include=['number']).columns
imp = SimpleImputer(strategy='mean')
imp.fit(df[num_cols])
# The transform returns a float array, so the numeric columns are written back as floats.
df[num_cols] = imp.transform(df[num_cols])


In [9]:
# Fill gaps in the categorical answers with each column's most frequent value.
#
# `cat_cols` is every non-numeric column, which in this dataset also covers the
# `participant` ID and the many `*timestamp*` string columns. Mode-imputing
# those would invent an ID or a timestamp for a missing cell, so they are left
# out of the imputer and only genuine categorical answers are filled.
cat_cols = df.select_dtypes(exclude='number').columns
cat_impute_cols = [
    col for col in cat_cols
    if col != "participant" and "timestamp" not in col
]
imp2 = SimpleImputer(strategy='most_frequent')
# Guard against an empty selection: the imputer cannot be fit on zero columns.
if cat_impute_cols:
    df[cat_impute_cols] = imp2.fit_transform(df[cat_impute_cols])


In [10]:
# 4) normalization (scaling)
# Rescale every numeric column linearly so its observed minimum maps to 0 and
# its observed maximum maps to 1.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(df[num_cols])
df[num_cols] = scaler.transform(df[num_cols])


In [12]:
# 5) save cleaned
# index=False leaves the row index out of the file, so reloading it does not
# produce an extra unnamed column.
df.to_csv(path_or_buf="cleaned_weekly_survey.csv", index=False)


In [13]:
# 6) show final insights
# Capture the summary values first, then report them in a fixed order.
cleaned_shape = df.shape
cleaned_preview = df.head()
print("CLEANING DONE")
print("Dataset shape:", cleaned_shape)
print("First few rows:\n", cleaned_preview)

CLEANING DONE
Dataset shape: (1074, 34)
First few rows:
   participant                   timestamp     health physical_activity  \
0       AMdzA  2023-07-14 18:13:56.325378       good               4-6   
1       AMdzA  2023-07-21 18:36:44.500888    average               2-4   
2       AMdzA  2023-07-28 18:06:02.695196       good               2-4   
3       AMdzA  2023-08-04 20:13:30.889408       good               4-6   
4       ASTAT  2023-07-14 18:09:14.872259  excellent               4-6   

   TILS_1  TILS_2  TILS_3       GSL  RELAT_LS   WORK_LS  ...  \
0     0.0     0.0     0.0  0.666667  0.666667  0.666667  ...   
1     0.0     0.0     0.5  0.666667  0.666667  0.666667  ...   
2     0.0     0.0     0.0  0.666667  0.666667  0.666667  ...   
3     0.5     0.5     0.5  0.666667  0.666667  0.666667  ...   
4     0.0     0.0     0.0  0.666667  1.000000  0.666667  ...   

   LS_questionnaire_start_timestamp        GSL_answer_timestamp  \
0        2023-07-14 18:13:33.337415  2023-07-1