<a href="https://colab.research.google.com/github/shreya120268/MLDA/blob/main/T1_DataPreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load the dataset
data = pd.read_csv('students_messy.csv', encoding="latin1")

# Display the first few rows of the dataset
print(data.head())

# Convert 'Age' column to numeric, coercing errors to NaN
data['Age'] = pd.to_numeric(data['Age'], errors='coerce')

# 1. Data Cleaning
# Updated num_features to include only available numeric columns
num_features = ['StudentID', 'Age', 'Score']
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Updated cat_features to include only available categorical columns
cat_features = ['Name', 'Grade']
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Combine preprocessing steps of Input Data
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features)
    ]
)
preprocessor.set_output(transform="pandas")

# Apply the transformations to the Input data
data_preprocessed = preprocessor.fit_transform(data)
# Generate more readable column names
data_preprocessed.columns = ['_'.join(col).replace(' ', '_') if isinstance(col, tuple) else col.replace(' ', '_') for col in data_preprocessed.columns]
print(data_preprocessed)

# 3. Data Splitting
# Assuming 'Score' is the target variable for this dataset based on available columns
score_column = data_preprocessed.filter(like='Score').filter(like='num_').columns[0]
X = data_preprocessed.drop(columns=[score_column])
y = data_preprocessed[score_column]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the first few rows
print(X_train.head())
print(y_train.head())

   StudentID   Name Age  Score Grade
0          1  Alice  23  100.0     D
1          2  Frank  22   68.0     F
2          3   Bob   25   94.0     F
3          4  Henry  18    NaN     B
4          5    Eva  19   87.0     A
     num__StudentID  num__Age    num__Score  cat__Name__Alice_  \
0         -1.726287  0.751289  1.736470e+00                0.0   
1         -1.714740  0.297311  1.320437e-01                0.0   
2         -1.703193  1.659246  1.435640e+00                0.0   
3         -1.691646 -1.518601  7.125083e-16                0.0   
4         -1.680099 -1.064623  1.084672e+00                0.0   
..              ...       ...           ...                ...   
295        1.680099  1.205267  7.125083e-16                0.0   
296        1.691646 -0.610645 -1.572659e+00                0.0   
297        1.703193 -1.518601 -1.522521e+00                0.0   
298        1.714740 -1.518601 -1.723074e+00                0.0   
299        1.726287  1.205267  1.134810e+00         