In [21]:
import pandas as pd

# Toy student dataset, written one row per student.
# Column order: StudyHours, SleepHours, Age, Result (Result is the 0/1 target).
df_students = pd.DataFrame(
    data=[
        (2, 5, 18, 0),
        (4, 6, 22, 0),
        (6, 7, 28, 1),
        (8, 8, 35, 1),
        (3, 7, 45, 0),
        (4, 6, 55, 1),
        (5, 4, 65, 0),
        (6, 3, 75, 1),
    ],
    columns=["StudyHours", "SleepHours", "Age", "Result"],
)

df_students

Unnamed: 0,StudyHours,SleepHours,Age,Result
0,2,5,18,0
1,4,6,22,0
2,6,7,28,1
3,8,8,35,1
4,3,7,45,0
5,4,6,55,1
6,5,4,65,0
7,6,3,75,1


Feature transformation in machine learning means converting features (data columns) from one form to another, often by applying mathematical functions, so that they are better suited to a learning algorithm. It can improve model performance, accuracy, and efficiency by handling differences in scale, distribution (for example, making skewed data closer to normal), or data type (for example, converting categorical values to numerical ones). It is a core part of data preprocessing that helps models pick up underlying patterns, especially for algorithms that are sensitive to feature scale, such as distance-based models.

In [22]:
from sklearn.preprocessing import PolynomialFeatures
# Columns to expand into degree-2 terms (squares plus the pairwise product).
poly_cols = ['StudyHours', 'SleepHours']

# include_bias=False leaves out the constant column, so two inputs
# produce five outputs: a, b, a^2, a*b, b^2.
poly = PolynomialFeatures(include_bias=False, degree=2)

poly_features = poly.fit_transform(df_students.loc[:, poly_cols])
feature_names = poly.get_feature_names_out(poly_cols)

# Show the expanded matrix's shape, its values and the generated column names.
print(poly_features.shape, poly_features, feature_names, sep="\n")

(8, 5)
[[ 2.  5.  4. 10. 25.]
 [ 4.  6. 16. 24. 36.]
 [ 6.  7. 36. 42. 49.]
 [ 8.  8. 64. 64. 64.]
 [ 3.  7.  9. 21. 49.]
 [ 4.  6. 16. 24. 36.]
 [ 5.  4. 25. 20. 16.]
 [ 6.  3. 36. 18.  9.]]
['StudyHours' 'SleepHours' 'StudyHours^2' 'StudyHours SleepHours'
 'SleepHours^2']


In [23]:
# Age band edges; with right=True each band is right-closed:
# (0, 12], (12, 20], (20, 35], (35, 60], (60, 100].
bins = [0, 12, 20, 35, 60, 100]
labels = ['Child', 'Teen', 'YoungAdult', 'Adult', 'Senior']

# Bucket every student's age into one of the labelled bands.
df_students['AgeGroup'] = pd.cut(
    x=df_students['Age'],
    bins=bins,
    labels=labels,
    right=True,
)
df_students

Unnamed: 0,StudyHours,SleepHours,Age,Result,AgeGroup
0,2,5,18,0,Teen
1,4,6,22,0,YoungAdult
2,6,7,28,1,YoungAdult
3,8,8,35,1,YoungAdult
4,3,7,45,0,Adult
5,4,6,55,1,Adult
6,5,4,65,0,Senior
7,6,3,75,1,Senior


In [24]:
# Rule-based risk categories for StudyHours and SleepHours
def std_risk(bp):
  """Bucket a study-hours value into a coarse category.

  In this notebook the function is applied to the ``StudyHours`` column;
  the parameter is still called ``bp`` only to keep the existing signature.

  Args:
    bp: A single numeric value (hours studied), or a missing value.

  Returns:
    "Normal" when the value is below 3, "Elevated" when it is at least 3
    but below 5, and "High" otherwise. A missing value (None/NaN) is
    returned as NaN instead of being labelled.
  """
  # NaN fails every `<` comparison, so without this guard a missing value
  # would fall through to the final branch and be reported as "High".
  if pd.isna(bp):
    return float("nan")
  if bp < 3:
    return "Normal"
  if bp < 5:
    return "Elevated"
  return "High"

def slp_risk(op):
  """Bucket a sleep-hours value into a coarse stress category.

  In this notebook the function is applied to the ``SleepHours`` column;
  the parameter is still called ``op`` only to keep the existing signature.

  Args:
    op: A single numeric value (hours slept), or a missing value.

  Returns:
    "No Stress" when the value is exactly 7, "Moderate Stress" when it is
    below 7, and "High Stress" when it is above 7. A missing value
    (None/NaN) is returned as NaN instead of being labelled.
  """
  # NaN is neither == 7 nor < 7, so without this guard a missing value
  # would fall through to the final branch and be reported as "High Stress".
  if pd.isna(op):
    return float("nan")
  # Only an exact 7 counts as "No Stress"; e.g. 8 lands in "High Stress".
  if op == 7:
    return "No Stress"
  if op < 7:
    return "Moderate Stress"
  return "High Stress"

# Derive one categorical column per rule by mapping each value through
# the corresponding bucketing function.
df_students["Study_Risk"] = df_students["StudyHours"].map(std_risk)
df_students["Sleep_Risk"] = df_students["SleepHours"].map(slp_risk)
df_students

Unnamed: 0,StudyHours,SleepHours,Age,Result,AgeGroup,Study_Risk,Sleep_Risk
0,2,5,18,0,Teen,Normal,Moderate Stress
1,4,6,22,0,YoungAdult,Elevated,Moderate Stress
2,6,7,28,1,YoungAdult,High,No Stress
3,8,8,35,1,YoungAdult,High,High Stress
4,3,7,45,0,Adult,Elevated,No Stress
5,4,6,55,1,Adult,Elevated,Moderate Stress
6,5,4,65,0,Senior,High,Moderate Stress
7,6,3,75,1,Senior,High,Moderate Stress


In [25]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Define numeric and categorical columns for the pipeline.
# Columns of the input frame that are not listed here are dropped by the
# ColumnTransformer (its default `remainder` behaviour).
num_features = ["StudyHours", "SleepHours", "Age"]
cat_features = ["AgeGroup", "Study_Risk", "Sleep_Risk"]

# Numeric pipeline: standardise each numeric column.
num_pipeline = Pipeline([
    ("scaler", StandardScaler())
])

# Categorical pipeline: one-hot encode, dropping the first level of each
# feature. handle_unknown="ignore" matters here because the dataset is tiny
# and some categories occur in a single row (e.g. "Teen", "Normal",
# "High Stress"); if such a row lands only in the test split, the default
# handle_unknown="error" makes predict() raise. With "ignore", an unseen
# category is encoded as all zeros (scikit-learn emits a warning).
cat_pipeline = Pipeline([
    ("ohe", OneHotEncoder(drop="first", handle_unknown="ignore"))
])

# Combine both
preprocess = ColumnTransformer([
    ("num", num_pipeline, num_features),
    ("cat", cat_pipeline, cat_features)
])

# Full pipeline with a simple model
clf = Pipeline([
    ("prep", preprocess),
    ("model", LogisticRegression(max_iter=1000))
])

df_students

Unnamed: 0,StudyHours,SleepHours,Age,Result,AgeGroup,Study_Risk,Sleep_Risk
0,2,5,18,0,Teen,Normal,Moderate Stress
1,4,6,22,0,YoungAdult,Elevated,Moderate Stress
2,6,7,28,1,YoungAdult,High,No Stress
3,8,8,35,1,YoungAdult,High,High Stress
4,3,7,45,0,Adult,Elevated,No Stress
5,4,6,55,1,Adult,Elevated,Moderate Stress
6,5,4,65,0,Senior,High,Moderate Stress
7,6,3,75,1,Senior,High,Moderate Stress


In [34]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Split df_students (raw columns, not yet scaled or encoded) into the
# label column and everything else; the pipeline does the preprocessing.
target_col = "Result"
y = df_students[target_col]
X = df_students.drop(columns=[target_col])

# Hold out 25% of the rows for evaluation. With the 8 rows defined above
# that is only 2 test rows, so accuracy can only be 0.0, 0.5 or 1.0.
X_train_pipe, X_test_pipe, y_train_pipe, y_test_pipe = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.25,
)

# Fit preprocessing and model together on the training rows only.
clf.fit(X_train_pipe, y_train_pipe)

# Score the fitted pipeline on the held-out rows.
y_pred_pipe = clf.predict(X_test_pipe)
acc = accuracy_score(y_true=y_test_pipe, y_pred=y_pred_pipe)
print("Logistic Regression with preprocessing pipeline: ", acc)

Logistic Regression with preprocessing pipeline:  0.5
