In [110]:
!pip install catboost




In [135]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor





In [136]:
df = pd.read_csv('/content/StudentsPerformance.csv')
X = df.drop(columns=['math score'])
y = df['math score']

In [137]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [138]:
df.isnull().sum()

Unnamed: 0,0
gender,0
race/ethnicity,0
parental level of education,0
lunch,0
test preparation course,0
math score,0
reading score,0
writing score,0


In [139]:
df.nunique()

Unnamed: 0,0
gender,2
race/ethnicity,5
parental level of education,6
lunch,2
test preparation course,2
math score,81
reading score,72
writing score,77


In [128]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

num = [col for col in X_train.columns if X_train[col].dtype != 'object']
cat = [col for col in X_train.columns if X_train[col].dtype == 'object']

preprocessor = ColumnTransformer([
    ("OneHotEncoder", OneHotEncoder(handle_unknown='ignore'), cat),
    ("StandardScaler", StandardScaler(), num)
])

X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)


In [131]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGB Regressor": XGBRegressor(),
    "CatBoost Regressor": CatBoostRegressor(verbose=0),
    "AdaBoost Regressor": AdaBoostRegressor()
}


In [132]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2 = r2_score(true, predicted)
    return mae, rmse, r2

In [133]:
results = []

for name, model in models.items():
    model.fit(X_train_processed, y_train)
    y_train_pred = model.predict(X_train_processed)
    y_test_pred = model.predict(X_test_processed)
    train_mae, train_rmse, train_r2 = evaluate_model(y_train, y_train_pred)
    test_mae, test_rmse, test_r2 = evaluate_model(y_test, y_test_pred)
    results.append({
        "Model": name,
        "Train_R2": train_r2,
        "Test_R2": test_r2,
        "Train_RMSE": train_rmse,
        "Test_RMSE": test_rmse,
        "Train_MAE": train_mae,
        "Test_MAE": test_mae
    })

In [140]:
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="Test_R2", ascending=False).reset_index(drop=True)

print("Best Model based on Test R2:")
print(results_df.iloc[0])
print("\nAll Models Ranked by Test R2:")
print(results_df)

Best Model based on Test R2:
Model            Ridge
Train_R2      0.874304
Test_R2       0.880592
Train_RMSE    5.323321
Test_RMSE     5.390418
Train_MAE     4.265005
Test_MAE      4.211113
Name: 0, dtype: object

All Models Ranked by Test R2:
                     Model  Train_R2   Test_R2  Train_RMSE  Test_RMSE  \
0                    Ridge  0.874304  0.880592    5.323321   5.390418   
1        Linear Regression  0.874317  0.880433    5.323051   5.393994   
2       CatBoost Regressor  0.958936  0.851831    3.042664   6.004608   
3  Random Forest Regressor  0.976540  0.850241    2.299794   6.036733   
4       AdaBoost Regressor  0.849243  0.846932    5.829902   6.103050   
5            XGB Regressor  0.995500  0.827797    1.007282   6.473307   
6                    Lasso  0.807223  0.825446    6.592504   6.517347   
7    K-Neighbors Regressor  0.856072  0.784477    5.696323   7.241892   
8            Decision Tree  0.999653  0.728362    0.279508   8.130191   

   Train_MAE  Test_MAE  


In [144]:
best_model_name = results_df.iloc[0]['Model']   # Name of top model
best_model = models[best_model_name]           # Retrieve model object


In [145]:
best_model.fit(X_train_processed, y_train)


# DEPLOYEMENT

In [141]:
!pip install streamlit pyngrok


Collecting streamlit
  Downloading streamlit-1.52.1-py3-none-any.whl.metadata (9.8 kB)
Collecting pyngrok
  Downloading pyngrok-7.5.0-py3-none-any.whl.metadata (8.1 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.52.1-py3-none-any.whl (9.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m59.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.5.0-py3-none-any.whl (24 kB)
Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m68.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyngrok, pydeck, streamlit
Successfully installed pydeck-0.9.1 pyngrok-7.5.0 streamlit-1.52.1


In [146]:
import joblib

joblib.dump(preprocessor, 'preprocessor.pkl')
joblib.dump(best_model, 'best_model.pkl')


['best_model.pkl']