In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import (
    train_test_split,
    KFold,
    cross_val_score,
    GridSearchCV
)

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [None]:
# Goal: load the dataset into the notebook (Colab) environment.
# The file is read with ";" as the field delimiter.

df = pd.read_csv("student-mat.csv", delimiter=";")

# Report the (rows, columns) shape, then preview the first rows.
print("Veri boyutu:", df.shape)
df.head()


Veri boyutu: (395, 33)


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [None]:
# 1 Goal: inspect the column dtypes and the basic structure of the data.

# A notebook cell only renders its LAST bare expression, so the dtypes are
# printed explicitly; as a bare `df.dtypes` line they were silently discarded.
print(df.dtypes)

# Summary statistics of the numeric columns (rendered as the cell output).
df.describe()


Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
count,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0
mean,16.696203,2.749367,2.521519,1.448101,2.035443,0.334177,3.944304,3.235443,3.108861,1.481013,2.291139,3.55443,5.708861,10.908861,10.713924,10.41519
std,1.276043,1.094735,1.088201,0.697505,0.83924,0.743651,0.896659,0.998862,1.113278,0.890741,1.287897,1.390303,8.003096,3.319195,3.761505,4.581443
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0
25%,16.0,2.0,2.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,1.0,3.0,0.0,8.0,9.0,8.0
50%,17.0,3.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,2.0,4.0,4.0,11.0,11.0,11.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,3.0,5.0,8.0,13.0,13.0,14.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,75.0,19.0,19.0,20.0


In [None]:
# 2- Goal: check whether any column contains missing values.

# Count the missing entries per column and list the largest counts first.
df.isnull().sum().sort_values(ascending=False)


Unnamed: 0,0
school,0
sex,0
age,0
address,0
famsize,0
Pstatus,0
Medu,0
Fedu,0
Mjob,0
Fjob,0


In [None]:
# 3- Goal: separate the regression target from the features.

X = df.drop("G3", axis=1)  # feature set: every column except the target
y = df["G3"]  # regression target (numeric)


In [None]:
# 4- Goal: group the feature columns by dtype so that each group can receive
# the preprocessing that suits it.

cat_cols = X.select_dtypes(include="object").columns
num_cols = X.select_dtypes(include=["int64", "float64"]).columns

# Show both groups so the split can be checked by eye.
print("Sayısal kolonlar:", num_cols.tolist())
print("Kategorik kolonlar:", cat_cols.tolist())


Sayısal kolonlar: ['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2']
Kategorik kolonlar: ['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']


In [None]:
# 5- Goal:
# - Fill in missing values
# - Encode the categorical columns
# - Scale the numeric columns
# - Prevent data leakage (the transformer is unfitted here; it is meant to be
#   fitted as a step of a model Pipeline)

preprocess = ColumnTransformer(
    transformers=[
        (
            "num",
            Pipeline(
                steps=[
                    # Numeric columns: median imputation, then standardisation.
                    ("imputer", SimpleImputer(strategy="median")),
                    ("scaler", StandardScaler()),
                ]
            ),
            num_cols,
        ),
        (
            "cat",
            Pipeline(
                steps=[
                    # Categorical columns: most-frequent imputation, then
                    # one-hot encoding that ignores categories unseen at fit time.
                    ("imputer", SimpleImputer(strategy="most_frequent")),
                    ("onehot", OneHotEncoder(handle_unknown="ignore")),
                ]
            ),
            cat_cols,
        ),
    ]
)


In [None]:
# 6- Goal: split the data into a training set and a test set.
# stratify is not used for regression (the target is continuous).

# 20% of the rows are held out; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)


In [None]:
# 7- Goal: compare the candidate models using their default parameters.
#
# Each model is wrapped in a Pipeline together with the shared preprocessing
# step and scored with 10-fold cross-validation on the training split only.
# One row per model is collected in `cv_results` as
# [name, mean RMSE, smallest fold RMSE, largest fold RMSE].

models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "SVR": SVR()
}

# Shuffled folds with a fixed seed so every model sees the same splits.
cv = KFold(n_splits=10, shuffle=True, random_state=42)

cv_results = []

for name, model in models.items():
    pipe = Pipeline([
        ("prep", preprocess),
        ("model", model)
    ])

    # The scorer returns NEGATED RMSE values (higher score = better model).
    scores = cross_val_score(
        pipe,
        X_train,
        y_train,
        cv=cv,
        scoring="neg_root_mean_squared_error"
    )

    # Flip the sign once, before aggregating. Negating AFTER min()/max()
    # swaps them: the minimum of the negative scores is the LARGEST RMSE,
    # which previously ended up in the "min" slot (and vice versa).
    rmse = -scores

    cv_results.append([
        name,
        rmse.mean(),
        rmse.min(),
        rmse.max()
    ])

    print(f"{name} -> RMSE mean: {rmse.mean():.3f}")


Linear Regression -> RMSE mean: 1.930
Ridge Regression -> RMSE mean: 1.923
Random Forest -> RMSE mean: 1.446
SVR -> RMSE mean: 2.579


In [None]:
# Tabulate the cross-validation rows collected in `cv_results`,
# ordered from the lowest mean RMSE to the highest.
cv_results_df = pd.DataFrame(
    data=cv_results,
    columns=["Model", "RMSE_mean", "RMSE_min", "RMSE_max"],
)
cv_results_df = cv_results_df.sort_values(by="RMSE_mean")

cv_results_df


Unnamed: 0,Model,RMSE_mean,RMSE_min,RMSE_max
2,Random Forest,1.445939,2.284337,0.843386
1,Ridge Regression,1.922916,2.5303,1.221179
0,Linear Regression,1.929866,2.544008,1.229254
3,SVR,2.578914,3.317484,1.501134


In [None]:
# Goal: measure the real performance of each model on the held-out test set.

test_metrics = []

for name, model in models.items():
    pipe = Pipeline(steps=[("prep", preprocess), ("model", model)])

    # Fit on the training split only, then predict the unseen test rows.
    preds = pipe.fit(X_train, y_train).predict(X_test)

    # One row per model: name, RMSE, MAE, R^2.
    test_metrics.append(
        [
            name,
            np.sqrt(mean_squared_error(y_true=y_test, y_pred=preds)),
            mean_absolute_error(y_true=y_test, y_pred=preds),
            r2_score(y_true=y_test, y_pred=preds),
        ]
    )

test_results_df = pd.DataFrame(
    data=test_metrics,
    columns=["Model", "RMSE", "MAE", "R2"],
)

test_results_df


Unnamed: 0,Model,RMSE,MAE,R2
0,Linear Regression,2.37837,1.646666,0.724134
1,Ridge Regression,2.371532,1.639067,0.725718
2,Random Forest,2.008827,1.213797,0.8032
3,SVR,2.351526,1.477707,0.730326


In [None]:
# Goal: find the best SVR hyper-parameters with a grid search.

svr_pipe = Pipeline(steps=[("prep", preprocess), ("svr", SVR())])

# Keys use the "<step name>__<parameter>" form so they reach the "svr" step.
param_grid = dict(
    svr__kernel=["rbf", "linear"],
    svr__C=[1, 10, 100],
    svr__epsilon=[0.1, 0.2, 0.5],
)

grid_svr = GridSearchCV(
    estimator=svr_pipe,
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=cv,
    n_jobs=-1,
)

grid_svr.fit(X_train, y_train)

# best_score_ is a negated RMSE, so flip its sign for display.
(grid_svr.best_params_, -grid_svr.best_score_)


({'svr__C': 100, 'svr__epsilon': 0.5, 'svr__kernel': 'linear'},
 np.float64(1.8501416182627417))

In [None]:
# Goal: see whether similar performance can be reached with fewer features.

fs_pipe = Pipeline(
    steps=[
        ("prep", preprocess),
        # Keep the 20 columns with the highest f_regression scores.
        ("fs", SelectKBest(f_regression, k=20)),
        ("model", RandomForestRegressor(random_state=42)),
    ]
)

scores_fs = cross_val_score(
    estimator=fs_pipe,
    X=X_train,
    y=y_train,
    scoring="neg_root_mean_squared_error",
    cv=cv,
)

# Scores are negated RMSE values; flip the sign of the mean for reporting.
print("Feature Selection sonrası RMSE:", -scores_fs.mean())


Feature Selection sonrası RMSE: 1.8761677839090563


In [None]:
# Goal: observe the performance after dimensionality reduction.

pca_pipe = Pipeline(
    steps=[
        ("prep", preprocess),
        # Project the preprocessed features onto two principal components.
        ("pca", PCA(n_components=2)),
        ("model", LinearRegression()),
    ]
)

scores_pca = cross_val_score(
    estimator=pca_pipe,
    X=X_train,
    y=y_train,
    scoring="neg_root_mean_squared_error",
    cv=cv,
)

# Scores are negated RMSE values; flip the sign of the mean for reporting.
print("PCA sonrası RMSE:", -scores_pca.mean())


PCA sonrası RMSE: 3.3715735258632806


In [None]:
# Goal: see which features influence the G3 grade the most.

rf_pipe = Pipeline(
    steps=[
        ("prep", preprocess),
        ("model", RandomForestRegressor(random_state=42)),
    ]
)

rf_pipe.fit(X_train, y_train)

# Pair the transformed column names from the fitted preprocessing step
# with the importances reported by the fitted forest.
feature_names = rf_pipe.named_steps["prep"].get_feature_names_out()
importances = rf_pipe.named_steps["model"].feature_importances_

fi_df = pd.DataFrame({"Feature": feature_names, "Importance": importances})
fi_df = fi_df.sort_values("Importance", ascending=False)

# Ten most important features.
fi_df.head(10)


Unnamed: 0,Feature,Importance
14,num__G2,0.792268
12,num__absences,0.109791
36,cat__reason_home,0.019529
0,num__age,0.010111
13,num__G1,0.006149
6,num__famrel,0.004639
8,num__goout,0.003959
35,cat__reason_course,0.003772
11,num__health,0.003525
4,num__studytime,0.00331
