In [23]:
import pandas as pd
import numpy as np
import re

In [13]:
# Read the raw export from disk into a DataFrame
raw = pd.read_csv(filepath_or_buffer='../data/raw.csv')

## basic cleaning

In [14]:
# Skip the first two data rows so that only the real header row remains
cleaned = raw.iloc[2:].copy()

# Parse 'StartDate' and keep only responses dated on or after July 1, 2019
cleaned['StartDate'] = pd.to_datetime(cleaned['StartDate'])
cleaned = cleaned[cleaned['StartDate'] >= '2019-07-01']

# Convert 'Q51' to numbers (unparseable entries become NaN) and keep 17..30 inclusive.
# NaN fails the range test, so rows without a usable 'Q51' are dropped here as well.
cleaned['Q51'] = pd.to_numeric(cleaned['Q51'], errors='coerce')
cleaned = cleaned[cleaned['Q51'].between(17, 30)]

# Remove rows where '3', 'Q4', and 'Q5' are all 0 (has no relationship).
# These columns were read alongside the extra header rows, so they may hold strings
# such as '0'; comparing raw strings to the integer 0 never matches. Compare on a
# numeric view instead (entries that are not numbers become NaN and never equal 0).
# NOTE(review): if this changes the row count, any file aligned to cleaned_data.csv
# by row position must be regenerated.
no_relationship = pd.Series(True, index=cleaned.index)
for column in ['3', 'Q4', 'Q5']:
    no_relationship &= pd.to_numeric(cleaned[column], errors='coerce') == 0
cleaned = cleaned[~no_relationship]

# Save the cleaned DataFrame to the 'data' folder
cleaned.to_csv('../data/cleaned_data.csv', index=False)

## add predictors that have only one item

In [39]:
# Reload the cleaned data from disk
cleaned = pd.read_csv('../data/cleaned_data.csv')

# Survey column -> predictor name. The key order fixes the column order of 'predictors'.
single_item_names = {
    'Q73': 'e1',
    '1': 'd1', '2': 'd2', 'Q52': 'd4', 'Q69': 'd5', 'Q51': 'd6',
    '3': 'r1', 'Q4': 'r2', 'Q5': 'r3', 'Q8': 'r5', 'Q7': 'r6', 'Q9': 'r7',
    'Q63': 'r8', 'Q37': 'r9', 'Q71': 'r10', 'Q41': 'r11', 'Q63.1': 'r13',
    'Q76_6': 'r15',
    'Q20_1': 'r19', 'Q20_2': 'r20', 'Q20_3': 'r21', 'Q20_4': 'r22',
    'Q68_1': 'r24', 'Q68_2': 'r25', 'Q68_3': 'r26', 'Q68_4': 'r27',
    'Q69.1': 'r28',
    'Q21': 'f1',
    'Q79_1': 'f3', 'Q79_4': 'f5', 'Q79_5': 'f6', 'Q79_6': 'f7', 'Q79_7': 'f8',
    'Q79_8': 'f9', 'Q79_9': 'f10', 'Q79_10': 'f11', 'Q79_11': 'f12',
    'Q79_12': 'f13', 'Q79_13': 'f14',
    'Q35_1': 's1', 'Q35_2': 's2', 'Q35_4': 's3', 'Q35_3': 's4', 'Q35_5': 's5',
    'Q35_19': 's6', 'Q35_6': 's7', 'Q35_7': 's8', 'Q35_8': 's9', 'Q35_9': 's10',
    'Q35_18': 's11', 'Q35_10': 's12', 'Q35_12': 's13', 'Q35_17': 's14',
    'Q35_13': 's15', 'Q35_11': 's16', 'Q35_16': 's17',
    'Q36#2_1': 'f15', 'Q36#2_2': 'f16', 'Q36#2_3': 'f17',
}

# Pick the single-item columns (no mean or dummy coding needed) and rename them in one pass
predictors = cleaned[list(single_item_names)].rename(columns=single_item_names)

# Write the predictors to disk
predictors.to_csv('../data/predictors.csv', index=False)

## add predictors that are the means or the sums

In [40]:
def _item_mean(frame, items, reverse_items=(), reverse_from=None):
    """Return the row-wise mean of `items`, reverse-coding `reverse_items` first.

    Reverse coding is `reverse_from - value`. It is applied to a copy of the
    selected columns, so `frame` itself is not modified and re-running this
    cell cannot reverse the same items twice.
    """
    scores = frame[items].copy()
    for item in reverse_items:
        scores[item] = reverse_from - scores[item]
    return scores.mean(axis=1)


# r12: seven items, two of them reverse-coded as 6 - value
r12_variables = ['Q12', 'Q11', 'Q13', 'Q14', 'Q15', 'Q16', 'Q17']
reverse_code_r12 = ['Q14', 'Q17']
cleaned['r12_mean'] = _item_mean(cleaned, r12_variables, reverse_code_r12, 6)

# r14: plain mean, no reverse coding
r14_variables = ['Q76_1', 'Q76_2', 'Q76_3', 'Q76_4', 'Q76_5']
cleaned['r14_mean'] = _item_mean(cleaned, r14_variables)

# r16: eleven items, two of them reverse-coded as 8 - value
r16_variables = ['Q19_1', 'Q19_2', 'Q19_3', 'Q19_4', 'Q19_5', 'Q19_6', 'Q19_7', 'Q19_8', 'Q19_9', 'Q19_10', 'Q19_11']
reverse_code_r16 = ['Q19_2', 'Q19_7']
cleaned['r16_mean'] = _item_mean(cleaned, r16_variables, reverse_code_r16, 8)

# r18: six items, four of them reverse-coded as 8 - value
r18_variables = ['Q18_1', 'Q18_2', 'Q18_3', 'Q18_4', 'Q18_5', 'Q18_6']
reverse_code_r18 = ['Q18_1', 'Q18_3', 'Q18_5', 'Q18_6']
cleaned['r18_mean'] = _item_mean(cleaned, r18_variables, reverse_code_r18, 8)

# Item groups that are averaged without reverse coding
f4_variables = ['Q79_2', 'Q79_3']
f18_variables = ['Q50#1_1', 'Q50#1_3', 'Q50#1_4', 'Q50#1_6', 'Q50#1_9', 'Q50#1_10']
f19_variables = ['Q50#1_2', 'Q50#1_5', 'Q50#1_8', 'Q50#1_12']
f20_variables = ['Q50#1_7', 'Q50#1_11', 'Q50#1_13', 'Q50#1_14']
f21_variables = ['Q50#2_1', 'Q50#2_3', 'Q50#2_4', 'Q50#2_6', 'Q50#2_9', 'Q50#2_10']
f22_variables = ['Q50#2_2', 'Q50#2_5', 'Q50#2_8', 'Q50#2_12']
f23_variables = ['Q50#2_7', 'Q50#2_11', 'Q50#2_13', 'Q50#2_14']
f24_variables = ['Q77_1', 'Q77_2', 'Q77_3', 'Q77_4']

cleaned['f4_mean'] = _item_mean(cleaned, f4_variables)
cleaned['f18_mean'] = _item_mean(cleaned, f18_variables)
cleaned['f19_mean'] = _item_mean(cleaned, f19_variables)
cleaned['f20_mean'] = _item_mean(cleaned, f20_variables)
cleaned['f21_mean'] = _item_mean(cleaned, f21_variables)
cleaned['f22_mean'] = _item_mean(cleaned, f22_variables)
cleaned['f23_mean'] = _item_mean(cleaned, f23_variables)
cleaned['f24_mean'] = _item_mean(cleaned, f24_variables)

# Copy every '<name>_mean' column into predictors under its short name (same order as computed)
for name in ['r12', 'r14', 'r16', 'r18', 'f4', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24']:
    predictors[name] = cleaned[f'{name}_mean']

# Save the updated predictors DataFrame to a CSV file
predictors.to_csv('../data/predictors.csv', index=False)

In [41]:
# Reload the cleaned data from disk
cleaned = pd.read_csv('../data/cleaned_data.csv')

# Work on a private copy of the thirteen Q65 items that make up r17
r17_items = [f'Q65_{i}' for i in range(1, 14)]
r17_df = cleaned[r17_items].copy()
    
# Define the custom transformation function for normal items
def normal_transformation(value):
    """Score a normally keyed item: 1 -> 1, 2 -> -1, 3 or 4 -> 0.5, anything else -> NaN."""
    if value in [3.0, 4.0]:
        return 0.5
    if value == 2.0:
        return -1
    if value == 1.0:
        return 1
    return np.nan

    # Define the custom transformation function for reverse coding items
def reverse_transformation(value):
    """Score a reverse keyed item: 1 -> -1, 2 -> 1, 3 or 4 -> -0.5, anything else -> NaN."""
    if value in [3.0, 4.0]:
        return -0.5
    if value == 2.0:
        return 1
    if value == 1.0:
        return -1
    return np.nan
    
# Which r17 items use the normal key and which use the reversed key
normal_code_columns = ['Q65_1', 'Q65_2', 'Q65_3', 'Q65_4', 'Q65_5', 'Q65_6', 'Q65_7', 'Q65_11', 'Q65_12']
reverse_code_columns = ['Q65_8', 'Q65_9', 'Q65_10', 'Q65_13']

# Recode every r17 item in place with the scorer that matches its key
for columns, recode in ((normal_code_columns, normal_transformation),
                        (reverse_code_columns, reverse_transformation)):
    for col in columns:
        r17_df[col] = r17_df[col].apply(recode)
    
# Custom function to set r17 to NaN if all 13 items are NaN
def set_nan_if_all_nan(row):
    """Return the sum of `row`, or NaN when every entry of `row` is missing."""
    return np.nan if row.isna().all() else row.sum()

# r17 is the row-wise sum of the recoded items (NaN when all thirteen are missing)
r17_scores = r17_df.apply(set_nan_if_all_nan, axis='columns')
predictors['r17'] = r17_scores

# Persist the predictors with the new column
predictors.to_csv('../data/predictors.csv', index=False)

In [43]:
# Reload the cleaned data from disk
cleaned = pd.read_csv('../data/cleaned_data.csv')

# Work on a private copy of the thirteen Q22 items that make up f2
f2_items = [f'Q22_{i}' for i in range(1, 14)]
f2_df = cleaned[f2_items].copy()

# Define the custom transformation function for normal items
def f2_normal_transformation(value):
    """Score a normally keyed f2 item: 1 -> 1, 2 -> -1, 3 or 4 -> 0.5, anything else -> NaN."""
    scoring = (((1.0,), 1), ((2.0,), -1), ((3.0, 4.0), 0.5))
    for codes, score in scoring:
        if value in codes:
            return score
    return np.nan

# Define the custom transformation function for reverse coding items
def f2_reverse_transformation(value):
    """Score a reverse keyed f2 item: 1 -> -1, 2 -> 1, 3 or 4 -> -0.5, anything else -> NaN."""
    scoring = (((1.0,), -1), ((2.0,), 1), ((3.0, 4.0), -0.5))
    for codes, score in scoring:
        if value in codes:
            return score
    return np.nan

# Which f2 items use the normal key and which use the reversed key
f2_normal_code_columns = ['Q22_1', 'Q22_2', 'Q22_3', 'Q22_4', 'Q22_5', 'Q22_6', 'Q22_7', 'Q22_11', 'Q22_12']
f2_reverse_code_columns = ['Q22_8', 'Q22_9', 'Q22_10', 'Q22_13']

# Recode every f2 item in place with the scorer that matches its key
for columns, recode in ((f2_normal_code_columns, f2_normal_transformation),
                        (f2_reverse_code_columns, f2_reverse_transformation)):
    for col in columns:
        f2_df[col] = f2_df[col].apply(recode)

# f2 is the row-wise sum of the recoded items (NaN when every item is missing)
f2_scores = f2_df.apply(set_nan_if_all_nan, axis='columns')
predictors['f2'] = f2_scores

# Persist the predictors with the new column
predictors.to_csv('../data/predictors.csv', index=False)

## dummy/polynomial code categorical variables

In [124]:
# dummy code d1

# Reload the predictors written by the earlier cells
predictors = pd.read_csv('../data/predictors.csv')


def _collapse_d1(code):
    """Keep 1 and 2 unchanged, merge 3-6 into category 3, map anything else to NaN."""
    if code in [3, 4, 5, 6]:
        return 3
    if code in [1, 2]:
        return code
    return np.nan


# 'd1_transformed' has three categories after merging the last four
predictors['d1_transformed'] = predictors['d1'].apply(_collapse_d1)

# One indicator column per category, then drop the last one
dummies = pd.get_dummies(predictors['d1_transformed'], prefix='d1').iloc[:, :-1]

# 'final' starts as predictors plus the d1 indicators
final = pd.concat([predictors, dummies], axis=1)

# Write the result to disk
final.to_csv('../data/final.csv', index=False)

In [125]:
# rebuild d2 as a binary variable
def _binarize_d2(code):
    """Map 1 -> 0, any of 2-7 -> 1, and everything else -> NaN."""
    if code == 1:
        return 0
    if code in [2, 3, 4, 5, 6, 7]:
        return 1
    return np.nan


predictors['d2_transformed'] = predictors['d2'].apply(_binarize_d2)

# Mirror the recoded column into the final DataFrame
final['d2_transformed'] = predictors['d2_transformed']

# Write the result to disk
final.to_csv('../data/final.csv', index=False)

In [126]:
# dummy code d3

# Read the cleaned data into its own frame, 'cleaned_data'
cleaned_data = pd.read_csv(filepath_or_buffer='../data/cleaned_data.csv')

# Define a function to assign the values for d3 based on Q54 values
def assign_d3_values(value):
    """Collapse a multi-select answer into a single d3 category.

    Parameters
    ----------
    value : str, number, or missing
        Either a comma-separated string of option codes (e.g. ``"1,3"``) or a
        single numeric code. pandas yields a numeric code when every answer in
        the column selects a single option, so both forms are accepted.

    Returns
    -------
    int or float
        * the option itself when exactly one of 1-4 is selected;
        * 6 when the only selected option is 6 or 7;
        * 5 when several options are selected and they include 5 or 6, when 5
          is the only option, or when more than one of 1-4 is selected;
        * NaN for missing input and for every other combination.

    Raises
    ------
    ValueError
        If a code is not an integer-valued number.
    """
    if pd.isna(value):
        return np.nan

    # Strings are split on commas; a bare number is treated as one selection.
    tokens = value.split(',') if isinstance(value, str) else [value]

    selected_options = set()
    for token in tokens:
        if isinstance(token, str) and not token.strip():
            continue  # tolerate empty pieces such as a trailing comma
        number = float(token)
        if not number.is_integer():
            raise ValueError(f"option code is not an integer: {token!r}")
        selected_options.add(int(number))

    if len(selected_options) == 1 and selected_options.issubset({1, 2, 3, 4}):
        return selected_options.pop()
    elif len(selected_options) == 1 and selected_options.issubset({6, 7}):
        return 6
    elif selected_options.intersection({5, 6}) or len(selected_options.intersection({1, 2, 3, 4})) > 1:
        return 5
    else:
        return np.nan

# Derive d3 from the multi-select 'Q54' answers
cleaned_data['d3'] = cleaned_data['Q54'].apply(assign_d3_values)

# Carry d3 over to the final DataFrame
final['d3'] = cleaned_data['d3']

# One indicator column per d3 category, then drop the last one
dummies_d3 = pd.get_dummies(cleaned_data['d3'], prefix='d3').iloc[:, :-1]

# Append the indicators to the final DataFrame
final = pd.concat([final, dummies_d3], axis=1)

# Write the result to disk
final.to_csv('../data/final.csv', index=False)

In [127]:
# dummy code d4
# Transform d4 according to the new rules and save it as d4_transformed
def new_d4_transformation(value):
    """Recode d4 into three categories: 1 and 2 stay, any other non-missing value becomes 3."""
    if pd.isna(value):
        return np.nan
    for code in (1, 2):
        if value == code:
            return code
    return 3

# Three-category version of d4
d4_recoded = predictors['d4'].apply(new_d4_transformation)
predictors['d4_transformed'] = d4_recoded

# One indicator column per category, then drop the last one
d4_dummies = pd.get_dummies(predictors['d4_transformed'], prefix='d4').iloc[:, :-1]

# Append the recoded column and its indicators to the final DataFrame
final = pd.concat([final, predictors['d4_transformed'], d4_dummies], axis=1)

# Write the result to disk
final.to_csv('../data/final.csv', index=False)

# Use the %pip magic so patsy is installed into the environment of the running kernel
%pip install patsy

In [128]:
from patsy import dmatrix

# polynomial code d5, r6 and r11

# Reload both working tables from disk
predictors = pd.read_csv('../data/predictors.csv')
final = pd.read_csv('../data/final.csv')


def _poly_contrasts(column):
    """Build the polynomial contrast columns for one predictor.

    The design matrix for ``C(<column>, Poly)`` is built from `predictors`, its
    first column (the intercept) is dropped, and the remaining columns are
    renamed '<column>_poly_1', '<column>_poly_2', ...
    """
    coded = dmatrix(f"C({column}, Poly)", predictors, return_type='dataframe').iloc[:, 1:]
    coded.columns = [f'{column}_poly_{degree}' for degree in range(1, coded.shape[1] + 1)]
    return coded


d5_poly = _poly_contrasts('d5')
r6_poly_code = _poly_contrasts('r6')
r11_poly_code = _poly_contrasts('r11')

# Append each set of contrasts to the final DataFrame, one after another
for contrasts in (d5_poly, r6_poly_code, r11_poly_code):
    final = pd.concat([final, contrasts], axis=1)

# Write the result to disk
final.to_csv('../data/final.csv', index=False)

In [129]:
# create r4 and transform it into a binary variable
# print(cleaned_data['Q6'].unique())

In [130]:
# Re-read the final table from disk
final = pd.read_csv(filepath_or_buffer='../data/final.csv')

def relationship_transform(value):
    """Turn a multi-select answer into a binary flag.

    Parameters
    ----------
    value : str, number, or missing
        Either a comma-separated string of option codes (e.g. ``"1,4"``) or a
        single numeric code. pandas yields a numeric code when every answer in
        the column selects a single option, so both forms are accepted.

    Returns
    -------
    int or float
        1 if any of the options 2, 4, 5 or 7 is selected, 0 if none of them is,
        and NaN when the input is missing.

    Raises
    ------
    ValueError
        If a code is not an integer-valued number.
    """
    if pd.isna(value):
        return np.nan

    # Strings are split on commas; a bare number is treated as one selection.
    tokens = value.split(',') if isinstance(value, str) else [value]

    value_set = set()
    for token in tokens:
        if isinstance(token, str) and not token.strip():
            continue  # tolerate empty pieces such as a trailing comma
        number = float(token)
        if not number.is_integer():
            raise ValueError(f"option code is not an integer: {token!r}")
        value_set.add(int(number))

    return 1 if value_set.intersection({2, 4, 5, 7}) else 0

# Derive the binary 'r4' flag from the 'Q6' answers in cleaned_data
r4_flags = cleaned_data['Q6'].apply(relationship_transform)
predictors['r4'] = r4_flags

# Mirror 'r4' into the final DataFrame
final['r4'] = predictors['r4']

# Write the result to disk
final.to_csv('../data/final.csv', index=False)

In [131]:
# Re-read the final table from disk
final = pd.read_csv('../data/final.csv')

# One indicator column per r28 category, then drop the last one
r28_dummies = pd.get_dummies(predictors['r28'], prefix='r28').iloc[:, :-1]

# Relabel e.g. 'r28_1.0' as 'r28_1' so the suffix is an integer rather than a float
r28_dummies.columns = [
    'r28_' + str(int(float(label.rsplit('_', 1)[-1])))
    for label in r28_dummies.columns
]

# Append the indicators to the final DataFrame
final = pd.concat([final, r28_dummies], axis=1)

# Write the result to disk
final.to_csv('../data/final.csv', index=False)

In [132]:
# Re-read the final table from disk
final = pd.read_csv(filepath_or_buffer='../data/final.csv')

def create_f25(row):
    """Derive f25 from the seven 'Q58#1_*' entries of `row`.

    Returns NaN when all seven are missing, 1 when 'Q58#1_6' or 'Q58#1_7'
    equals 1, 0 when only one of 'Q58#1_1'..'Q58#1_5' equals 1, and NaN
    otherwise.
    """
    columns = [f'Q58#1_{i}' for i in range(1, 8)]
    values = row[columns].values

    if pd.isnull(values).all():
        return np.nan

    first_five, last_two = values[:5], values[5:]
    if 1 in last_two:
        return 1
    return 0 if 1 in first_five else np.nan

# Compute f25 row by row on cleaned_data
f25_values = cleaned_data.apply(create_f25, axis='columns')
cleaned_data['f25'] = f25_values

# Mirror f25 into the final DataFrame
final['f25'] = cleaned_data['f25']

def create_f26(row):
    """Derive f26 from the seven 'Q58#2_*' entries of `row`.

    Returns NaN when all seven are missing, 1 when 'Q58#2_6' or 'Q58#2_7'
    equals 1, 0 when only one of 'Q58#2_1'..'Q58#2_5' equals 1, and NaN
    otherwise.
    """
    columns = [f'Q58#2_{i}' for i in range(1, 8)]
    values = row[columns].values

    if pd.isnull(values).all():
        return np.nan

    first_five, last_two = values[:5], values[5:]
    if 1 in last_two:
        return 1
    return 0 if 1 in first_five else np.nan

# Compute f26 row by row on cleaned_data
f26_values = cleaned_data.apply(create_f26, axis='columns')
cleaned_data['f26'] = f26_values

# Mirror f26 into the final DataFrame
final['f26'] = cleaned_data['f26']

# Write the result to disk
final.to_csv('../data/final.csv', index=False)

## Create the data that would be used to conduct data analysis

In [151]:
"""
I build a dataset called "AApre" that includes all finalized variables. Since I have concerns about
the polynomial coding, I also create "AApre2", which does not use polynomial coding and instead keeps
the original values (d5, r6, r11).
1. There are 9 variables that need to be dummy coded or polynomial coded. Some of the
original/transformed
variables need to be removed: d1, d2, d4, d5, r6, r11, d1_transformed, d3,
d4_transformed, r28 (d2_transformed is kept).
2. The variables that required manual cleaning (r1, r2, r3, r5, r7, r8) are added separately from
predictors_manu123578.csv.
3. Create the outcome variable r23: 1 for secure, 0 for insecure; delete cases that do not have
values for r23.
4. Delete cases that do not have values for f24.
5. Scale the continuous variables in AApre.
6. After checking the missing values of the predictors, I decided to delete predictor r8 and all stress
level predictors (s1-s17). Fill the remaining missing values with the mean.
"""
# Re-read the final table from disk
final = pd.read_csv('../data/final.csv')

# Drop the original/intermediate columns listed below; 'd2_transformed' is deliberately kept
superseded_columns = ['d1', 'd2', 'd4', 'd5', 'r6', 'r11', 'd1_transformed', 'd3', 'd4_transformed', 'r28']
final = final.drop(columns=superseded_columns)

# Load the cleaned data and the manually cleaned predictors (predictors_manu123578.csv)
cleaned_data = pd.read_csv('../data/cleaned_data.csv')
predictors_manu = pd.read_csv('../data/predictors_manu123578.csv')

# Overwrite r1, r2, r3, r5, r7 and r8 with their manually cleaned versions
for manual_column in ['r1', 'r2', 'r3', 'r5', 'r7', 'r8']:
    final[manual_column] = predictors_manu[manual_column]


def _code_r23(answer):
    """Outcome coding for 'Q68': 1 -> 1, any of 2/3/4 -> 0, everything else -> NaN."""
    if answer == 1:
        return 1
    if answer in [2, 3, 4]:
        return 0
    return np.nan


# Outcome variable 'r23', derived from 'Q68' in cleaned_data
final['r23'] = cleaned_data['Q68'].apply(_code_r23)

# Remove cases that are missing the outcome 'r23' or the predictor 'f24'
final = final.dropna(subset=['r23', 'f24'])

# Write the AApre table to disk
final.to_csv('../data/AApre.csv', index=False)

In [152]:
# Load the AApre table written by the previous step
AApre = pd.read_csv('../data/AApre.csv')

# AApre2 is AApre without the polynomial contrast columns
poly_columns = ['d5_poly_1', 'd5_poly_2', 'd5_poly_3', 'd5_poly_4',
                'r6_poly_1', 'r6_poly_2', 'r6_poly_3',
                'r11_poly_1', 'r11_poly_2', 'r11_poly_3', 'r11_poly_4']
AApre2 = AApre.drop(columns=poly_columns)

# AApre was saved after rows without 'r23' or 'f24' were dropped, and reading it back
# renumbers the rows 0..M-1. Assigning d5/r6/r11 from a full-length frame would align
# on those new labels and attach the values to the wrong respondents. So rebuild the
# same row filter from the files on disk and attach the surviving rows by position.
final_all = pd.read_csv('../data/final.csv')
q68 = pd.read_csv('../data/cleaned_data.csv', usecols=['Q68'])['Q68']
# 'r23' is non-missing exactly when 'Q68' is one of 1, 2, 3, 4
kept_rows = q68.isin([1, 2, 3, 4]).to_numpy() & final_all['f24'].notna().to_numpy()
originals = final_all.loc[kept_rows, ['d5', 'r6', 'r11']].reset_index(drop=True)

if len(originals) != len(AApre2):
    raise ValueError(
        f"row filter mismatch: rebuilt {len(originals)} rows but AApre has {len(AApre2)}"
    )

# Add the original d5, r6, and r11 variables to "AApre2"
AApre2[['d5', 'r6', 'r11']] = originals

# Save the AApre2 DataFrame to a CSV file
AApre2.to_csv('../data/AApre2.csv', index=False)

In [153]:
# scale the continuous variables for AApre

from sklearn.preprocessing import StandardScaler

# Names of the continuous columns, in the order they are passed to the scaler
continuous_vars = (
    ['e1', 'd6', 'r1', 'r2', 'r3', 'r5', 'r7', 'r8', 'r9', 'r10', 'r13', 'r15']
    + ['r19', 'r20', 'r21', 'r22', 'r24', 'r25', 'r26', 'r27']
    + ['f1', 'f3']
    + [f'f{i}' for i in range(5, 15)]     # f5 .. f14
    + [f's{i}' for i in range(1, 18)]     # s1 .. s17
    + ['f15', 'f16', 'f17', 'r12', 'r14', 'r16', 'r18', 'f4']
    + [f'f{i}' for i in range(18, 25)]    # f18 .. f24
    + ['r17', 'f2']
)

# Fit the scaler on the continuous columns and transform them in one step
scaler = StandardScaler()
AApre_scaled = AApre.copy()
AApre_scaled[continuous_vars] = scaler.fit_transform(AApre[continuous_vars])

# Overwrite 'AApre.csv' with the scaled table
AApre_scaled.to_csv('../data/AApre.csv', index=False)

In [154]:
# Load the scaled AApre table
AApre = pd.read_csv('../data/AApre.csv')

# Drop r8 and the stress-level predictors s1..s17
AApre = AApre.drop(['r8'] + [f's{i}' for i in range(1, 18)], axis=1)

# Fill the missing values with the column mean. numeric_only=True restricts the
# means to numeric/boolean columns, so a stray text column does not make mean()
# raise; such a column is simply left unfilled.
AApre_filled = AApre.fillna(AApre.mean(numeric_only=True))

# Save
AApre_filled.to_csv('../data/AApre.csv', index=False)

In [157]:
# Load the AApre2 dataset
AApre2 = pd.read_csv("../data/AApre2.csv")

# Fill the missing values with the mean of each column. numeric_only=True restricts
# the means to numeric/boolean columns, so a stray text column does not make mean()
# raise; such a column is simply left unfilled.
AApre2_filled = AApre2.fillna(AApre2.mean(numeric_only=True))

# Save
AApre2_filled.to_csv('../data/AApre2.csv', index=False)