<a href="https://colab.research.google.com/github/saksham1965/data-analyst/blob/main/ML%20assignment%204.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# Install if needed
!pip install shap xgboost

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score, f1_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression

import shap
import warnings
# Silence all warnings: no category or module filter is given, so this applies
# to every warning raised after this point.
warnings.filterwarnings('ignore')




In [12]:
# Read the two semicolon-delimited student files.
df_math = pd.read_csv('/content/student-mat.csv', sep=';')
df_por = pd.read_csv('/content/student-por.csv', sep=';')

# Option 1: inner-join the two files on the attributes listed below.
# Columns present in both files but not used as keys get a _math / _por suffix.
join_cols = [
    'school', 'sex', 'age', 'address', 'famsize', 'Pstatus',
    'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'nursery', 'internet',
]
df_merged = df_math.merge(df_por, on=join_cols, suffixes=('_math', '_por'))

# Option 2: stack both files into one frame, labelling each row with its source.
# The label is added only after the merge above, so df_merged carries no subject column.
df_math['subject'] = 'math'
df_por['subject'] = 'portuguese'
df_combined = pd.concat([df_math, df_por], ignore_index=True)

In [27]:
# Work on a copy of the stacked frame so df_combined itself is left untouched.
df = df_combined.copy()

# Predict G3 from every other column. G1/G2 are kept as predictors here;
# drop them as well if they should not be available to the model.
features = df.drop(columns=['G3'])
target = df['G3']

# One-hot encode the categorical columns (first level of each is dropped).
features = pd.get_dummies(features, drop_first=True)

# Split BEFORE scaling so the held-out rows never influence the scaler's
# mean/variance. Same test_size and random_state as before, so the same rows
# land in the train and test sets.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features, target, test_size=0.1, random_state=42
)

# Fit the scaler on the training rows only, then apply that same transform
# to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full matrix scaled with the train-fitted scaler, kept because later cells
# use `features_scaled`.
# NOTE(review): a cell that re-splits `features_scaled` with a different test
# set will have test rows that overlap the rows this scaler was fit on; fit a
# separate scaler inside that split if strict isolation is required.
features_scaled = scaler.transform(features)


In [28]:
# Random forest regressor: 100 trees, seeded for repeatable results.
# fit() returns the estimator itself, so construction and training are chained.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows.
y_pred = model.predict(X_test)

# Error metrics on the test set; RMSE is the square root of the MSE.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")


MAE: 0.92
RMSE: 1.49


In [31]:
# Binarise G3 into pass (1) / fail (0): a grade of 10 or more counts as a pass.
y_class = target.ge(10).astype(int)

# Separate split for the classification task (20% held out, fixed seed).
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(
    features_scaled,
    y_class,
    test_size=0.2,
    random_state=42,
)

# Random forest classifier: 100 trees, seeded; fit() returns the estimator.
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_cls, y_train_cls)

# Predict the held-out rows and report accuracy and F1.
y_pred_cls = clf.predict(X_test_cls)
print("Accuracy:", accuracy_score(y_test_cls, y_pred_cls))
print("F1 Score:", f1_score(y_test_cls, y_pred_cls))


Accuracy: 0.8851674641148325
F1 Score: 0.9230769230769231
