In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Load the dataset.
# NOTE: this is an absolute, machine-specific path; ensure it points at the
# correct file (including extension) on the machine running the notebook.
file_path = r"D:\MLPR\students\student_data.csv"
df = pd.read_csv(file_path)

# Separate features and target variable
X = df.drop(columns=['G3'])  # every column except the target
y = df['G3']  # target variable

# Identify numerical and categorical columns.
# Every non-numeric column is treated as categorical. Selecting only
# dtype 'object' would leave out bool / category / dedicated-string columns,
# and the ColumnTransformer below (default remainder='drop') would then
# silently discard them from the model.
numerical_cols = X.select_dtypes(include=['number']).columns
categorical_cols = X.select_dtypes(exclude=['number']).columns

# Preprocessing: standardise numeric features, one-hot encode categorical ones.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

# Pipeline: preprocessing followed by the random-forest regressor.
# random_state is fixed so the fitted forest is reproducible.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=0))
])

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit preprocessing and model on the training rows only
pipeline.fit(X_train, y_train)

# Impurity-based feature importances of the fitted forest
model_rf = pipeline.named_steps['model']
importances = model_rf.feature_importances_

# Names of the one-hot encoded columns produced by the fitted encoder
preprocessor_cat = pipeline.named_steps['preprocessor'].named_transformers_['cat']
categorical_feature_names = preprocessor_cat.get_feature_names_out(categorical_cols)

# The ColumnTransformer emits its outputs in the order the transformers are
# listed ('num' first, then 'cat'), so the names are concatenated in that order
# to line up with `importances`.
all_feature_names = np.concatenate([numerical_cols, categorical_feature_names])

# Tabulate importances, most important feature first
feature_importance_df = pd.DataFrame({'Feature': all_feature_names, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print("\nFeature Importance:")
print(feature_importance_df)

# Predict on the held-out test rows
predictions = pipeline.predict(X_test)

# Evaluate with mean absolute error
mae = mean_absolute_error(y_test, predictions)
print(f"\nMean Absolute Error (MAE): {mae:.4f}")



Feature Importance:
              Feature  Importance
14                 G2    0.778778
12           absences    0.116954
0                 age    0.011149
11             health    0.007267
7            freetime    0.006250
13                 G1    0.005587
5            failures    0.005574
6              famrel    0.005236
42       schoolsup_no    0.004101
35      reason_course    0.004001
43      schoolsup_yes    0.003963
10               Walc    0.003953
1                Medu    0.003424
2                Fedu    0.002831
29       Mjob_teacher    0.002533
8               goout    0.002478
36        reason_home    0.002406
4           studytime    0.002099
3          traveltime    0.001846
28      Mjob_services    0.001538
47           paid_yes    0.001382
48      activities_no    0.001375
33      Fjob_services    0.001329
38  reason_reputation    0.001252
30       Fjob_at_home    0.001234
49     activities_yes    0.001232
37       reason_other    0.001146
15          school_GP    0.

In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# This cell repeats the load / preprocess / fit / importance steps of the
# previous cell and differs only in the reported metric (MSE instead of MAE).

# Load the dataset.
# NOTE: this is an absolute, machine-specific path; ensure it points at the
# correct file (including extension) on the machine running the notebook.
file_path = r"D:\MLPR\students\student_data.csv"
df = pd.read_csv(file_path)

# Separate features and target variable
X = df.drop(columns=['G3'])  # every column except the target
y = df['G3']  # target variable

# Identify numerical and categorical columns.
# Every non-numeric column is treated as categorical. Selecting only
# dtype 'object' would leave out bool / category / dedicated-string columns,
# and the ColumnTransformer below (default remainder='drop') would then
# silently discard them from the model.
numerical_cols = X.select_dtypes(include=['number']).columns
categorical_cols = X.select_dtypes(exclude=['number']).columns

# Preprocessing: standardise numeric features, one-hot encode categorical ones.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

# Pipeline: preprocessing followed by the random-forest regressor.
# random_state is fixed so the fitted forest is reproducible.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=0))
])

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit preprocessing and model on the training rows only
pipeline.fit(X_train, y_train)

# Impurity-based feature importances of the fitted forest
model_rf = pipeline.named_steps['model']
importances = model_rf.feature_importances_

# Names of the one-hot encoded columns produced by the fitted encoder
preprocessor_cat = pipeline.named_steps['preprocessor'].named_transformers_['cat']
categorical_feature_names = preprocessor_cat.get_feature_names_out(categorical_cols)

# The ColumnTransformer emits its outputs in the order the transformers are
# listed ('num' first, then 'cat'), so the names are concatenated in that order
# to line up with `importances`.
all_feature_names = np.concatenate([numerical_cols, categorical_feature_names])

# Tabulate importances, most important feature first
feature_importance_df = pd.DataFrame({'Feature': all_feature_names, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print("\nFeature Importance:")
print(feature_importance_df)

# Predict on the held-out test rows
predictions = pipeline.predict(X_test)

# Evaluate with mean squared error
mse = mean_squared_error(y_test, predictions)
print(f"\nMean Squared Error (MSE): {mse:.4f}")



Feature Importance:
              Feature  Importance
14                 G2    0.778778
12           absences    0.116954
0                 age    0.011149
11             health    0.007267
7            freetime    0.006250
13                 G1    0.005587
5            failures    0.005574
6              famrel    0.005236
42       schoolsup_no    0.004101
35      reason_course    0.004001
43      schoolsup_yes    0.003963
10               Walc    0.003953
1                Medu    0.003424
2                Fedu    0.002831
29       Mjob_teacher    0.002533
8               goout    0.002478
36        reason_home    0.002406
4           studytime    0.002099
3          traveltime    0.001846
28      Mjob_services    0.001538
47           paid_yes    0.001382
48      activities_no    0.001375
33      Fjob_services    0.001329
38  reason_reputation    0.001252
30       Fjob_at_home    0.001234
49     activities_yes    0.001232
37       reason_other    0.001146
15          school_GP    0.