In [1]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import pandas as pd

# ---------------------------------------------------------------------------
# 1. Load the CSV and split it into the label column and everything else.
# ---------------------------------------------------------------------------
file_path = '../../outputs/Assignement_3/extracted_features_pca_data.csv'
data = pd.read_csv(file_path)

y = data['Target']
X = data.drop(columns=['Target'])

# ---------------------------------------------------------------------------
# 2. Column groups handled by the preprocessor.
#    NOTE: the ColumnTransformer below is built without a `remainder`
#    argument, so (per scikit-learn's default of remainder='drop') columns
#    of X that are not listed in either group do not reach the output.
# ---------------------------------------------------------------------------
categorical_features = [
    'Marital status',
    'Application mode',
    'Daytime/evening attendance',
    'Nacionality',
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
]
numerical_features = ['Unemployment rate', 'Inflation rate', 'GDP']

# ---------------------------------------------------------------------------
# 3. Numerical branch: fill gaps with the column mean, then standardize.
# ---------------------------------------------------------------------------
numerical_imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()
numerical_pipeline = Pipeline(steps=[
    ('imputer', numerical_imputer),
    ('scaler', scaler),
])

# ---------------------------------------------------------------------------
# 4. Categorical branch: fill gaps with the most frequent value, then one-hot
#    encode as a dense array, dropping the first category of each feature.
# ---------------------------------------------------------------------------
categorical_imputer = SimpleImputer(strategy='most_frequent')
encoder = OneHotEncoder(drop='first', sparse_output=False)
categorical_pipeline = Pipeline(steps=[
    ('imputer', categorical_imputer),
    ('encoder', encoder),
])

# ---------------------------------------------------------------------------
# 5. Route each column group through its branch and transform the features.
#    Output columns follow the transformer order: 'num' first, then 'cat'.
# ---------------------------------------------------------------------------
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
])
X_processed = preprocessor.fit_transform(X)

# ---------------------------------------------------------------------------
# 6. Rebuild a labelled DataFrame. The one-hot column names come from the
#    fitted encoder held inside the preprocessor; the numerical names are
#    reused unchanged. The original row index of X is kept.
# ---------------------------------------------------------------------------
fitted_encoder = preprocessor.named_transformers_['cat'].named_steps['encoder']
encoded_feature_names = fitted_encoder.get_feature_names_out(categorical_features)
numerical_feature_names = numerical_features
processed_columns = [*numerical_feature_names, *encoded_feature_names]
X_processed_df = pd.DataFrame(X_processed, index=X.index, columns=processed_columns)

# Preview the first rows of the preprocessed data (notebook cell output).
X_processed_df.head()


Unnamed: 0,Unemployment rate,Inflation rate,GDP,Marital status_2,Marital status_3,Marital status_4,Marital status_5,Marital status_6,Application mode_2,Application mode_3,...,Mother's occupation_23,Mother's occupation_24,Mother's occupation_25,Mother's occupation_26,Mother's occupation_27,Mother's occupation_28,Mother's occupation_29,Mother's occupation_30,Mother's occupation_31,Mother's occupation_32
0,-0.287638,0.124386,0.765761,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.876222,-1.105222,0.347199,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.287638,0.124386,0.765761,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.813253,-1.466871,-1.375511,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.876222,-1.105222,0.347199,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
