In [2]:
import joblib
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC


In [3]:
# Read the student records from the CSV file in the current working directory.
df = pd.read_csv(filepath_or_buffer="student_data.csv")

# Preview the first five rows to confirm the load looks sane.
df.head(n=5)


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [4]:
# Engineered features, written back onto `df` as new columns.
# Each of the first three is the plain mean of a pair of raw columns.
df["alcohol_level"] = df["Dalc"].add(df["Walc"]).div(2)
df["parent_education_avg"] = df["Medu"].add(df["Fedu"]).div(2)
df["social_activity"] = df["freetime"].add(df["goout"]).div(2)

# Ratio of `failures` to (`studytime` + 1); the +1 keeps the denominator
# away from zero whenever studytime is non-negative.
df["academic_risk"] = df["failures"].div(df["studytime"].add(1))

# Preview the frame with the four new columns appended.
df.head(n=5)


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,Walc,health,absences,G1,G2,G3,alcohol_level,parent_education_avg,social_activity,academic_risk
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,1,3,6,5,6,6,1.0,4.0,3.5,0.0
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,1,3,4,5,5,6,1.0,1.0,3.0,0.0
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,3,3,10,7,8,10,2.5,1.0,2.5,1.0
3,GP,F,15,U,GT3,T,4,2,health,services,...,1,5,2,15,14,15,1.0,3.0,2.0,0.0
4,GP,F,16,U,GT3,T,3,3,other,other,...,2,5,4,6,10,10,1.5,3.0,2.5,0.0


In [5]:
# Binary target: 1 when the final grade G3 is at least 10, otherwise 0.
y = df["G3"].ge(10).astype(int)

# Feature matrix: `df` minus the columns listed below.
X = df.drop(
    labels=[
        # Grade columns (G3 is the source of the target).
        "G1", "G2", "G3",
        # Raw columns already folded into the engineered features.
        "Dalc", "Walc", "Medu", "Fedu",
        "freetime", "goout", "failures", "studytime",
        # Remaining columns that are not used as model inputs.
        "school", "famrel", "internet", "address", "Pstatus",
        "guardian", "reason", "romantic", "nursery",
        "sex", "famsize", "Mjob", "Fjob", "schoolsup", "famsup",
    ],
    axis="columns",
)

# Preview the columns that remain as model inputs.
X.head(n=5)


Unnamed: 0,age,traveltime,paid,activities,higher,health,absences,alcohol_level,parent_education_avg,social_activity,academic_risk
0,18,2,no,no,yes,3,6,1.0,4.0,3.5,0.0
1,17,1,no,no,yes,3,4,1.0,1.0,3.0,0.0
2,15,1,yes,no,yes,3,10,2.5,1.0,2.5,1.0
3,15,1,yes,yes,yes,5,2,1.0,3.0,2.0,0.0
4,16,1,yes,no,yes,5,4,1.5,3.0,2.5,0.0


In [6]:
# Show the first five rows of the feature matrix (same preview as the cell above).
X.head(n=5)


Unnamed: 0,age,traveltime,paid,activities,higher,health,absences,alcohol_level,parent_education_avg,social_activity,academic_risk
0,18,2,no,no,yes,3,6,1.0,4.0,3.5,0.0
1,17,1,no,no,yes,3,4,1.0,1.0,3.0,0.0
2,15,1,yes,no,yes,3,10,2.5,1.0,2.5,1.0
3,15,1,yes,yes,yes,5,2,1.0,3.0,2.0,0.0
4,16,1,yes,no,yes,5,4,1.5,3.0,2.5,0.0


In [7]:
# Names of the numeric input columns. The order is kept as-is because it
# determines the order in which the columns are passed to the transformer.
numeric_features = [
    "age", "traveltime", "academic_risk", "health",
    "absences", "alcohol_level", "parent_education_avg", "social_activity",
]

# Names of the string-valued ("yes"/"no" in the preview) input columns.
categorical_features = ["paid", "activities", "higher"]


In [8]:
# Column-wise preprocessing: standardise the numeric columns and one-hot
# encode the categorical ones. handle_unknown="ignore" tells the encoder not
# to raise on category values it did not see while fitting.
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)


In [9]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


In [10]:
# Logistic-regression pipeline: preprocessing followed by the classifier.
# The preprocessing step is a clone of `preprocessor`, so this pipeline fits
# and owns its own scaler/encoder instead of sharing one mutable transformer
# object with any other pipeline built from the same `preprocessor`.
lr_model = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("classifier", LogisticRegression(max_iter=1000)),
])

# Fit on the training split, then score plain accuracy on the held-out split.
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
lr_acc = accuracy_score(y_test, lr_pred)

print("Logistic Regression Accuracy:", lr_acc)


Logistic Regression Accuracy: 0.7468354430379747


In [11]:
# Random-forest pipeline: preprocessing followed by a 200-tree forest with a
# fixed seed. The preprocessing step is a clone of `preprocessor`, so this
# pipeline fits and owns its own scaler/encoder instead of sharing one mutable
# transformer object with any other pipeline built from the same `preprocessor`.
rf_model = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("classifier", RandomForestClassifier(
        n_estimators=200,
        random_state=42,
    )),
])

# Fit on the training split, then score plain accuracy on the held-out split.
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_acc = accuracy_score(y_test, rf_pred)

print("Random Forest Accuracy:", rf_acc)


Random Forest Accuracy: 0.7341772151898734


In [12]:
# RBF-kernel SVM pipeline: preprocessing followed by the classifier.
# - The preprocessing step is a clone of `preprocessor`, so this pipeline fits
#   and owns its own scaler/encoder instead of sharing one mutable transformer
#   object with any other pipeline built from the same `preprocessor`.
# - probability=True makes SVC fit an extra, randomised calibration step for
#   predict_proba; random_state seeds it so the fitted (and saved) model gives
#   the same probabilities on every run. Class predictions are not affected.
svm_model = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("classifier", SVC(kernel="rbf", probability=True, random_state=42)),
])

# Fit on the training split, then score plain accuracy on the held-out split.
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)
svm_acc = accuracy_score(y_test, svm_pred)

print("SVM Accuracy:", svm_acc)


SVM Accuracy: 0.7341772151898734


In [13]:
# Gather the three held-out accuracies into one table, one row per model.
results = pd.DataFrame(
    [
        ("Logistic Regression", lr_acc),
        ("Random Forest", rf_acc),
        ("SVM", svm_acc),
    ],
    columns=["Model", "Accuracy"],
)

# Print the table with the highest accuracy first.
print(results.sort_values("Accuracy", ascending=False))


                 Model  Accuracy
0  Logistic Regression  0.746835
1        Random Forest  0.734177
2                  SVM  0.734177


In [14]:
# Persist each fitted pipeline to its own pickle file in the working directory.
for fitted_model, target_file in (
    (lr_model, "lr_model.pkl"),
    (rf_model, "rf_model.pkl"),
    (svm_model, "svm_model.pkl"),
):
    joblib.dump(fitted_model, target_file)

print("✅ Modellar saqlandi!")


✅ Modellar saqlandi!


In [15]:
# A single hand-written student record. It carries every numeric and
# categorical column the pipeline's preprocessor selects by name.
sample_df = pd.DataFrame([{
    "age": 18,
    "traveltime": 4,
    "health": 1,
    "absences": 15,
    "alcohol_level": 1,
    "parent_education_avg": 2,
    "social_activity": 4,
    "academic_risk": 2,
    "paid": "no",
    "activities": "no",
    "higher": "no"
}])

# Predicted class label for the record (1 means G3 >= 10, 0 otherwise).
pred = rf_model.predict(sample_df)[0]

# The columns of predict_proba follow rf_model.classes_, so find the column
# that belongs to the predicted label instead of using the label itself as an
# index (that only works while the labels happen to equal their positions).
class_position = list(rf_model.classes_).index(pred)
proba = rf_model.predict_proba(sample_df)[0][class_position]

# Report the outcome ("O‘TDI" for class 1, "O‘TMADI" for class 0) and the
# model's probability for that class as a percentage.
print("Natija:", "O‘TDI" if pred else "O‘TMADI")
print("Ishonchlilik:", round(proba*100, 2), "%")


Natija: O‘TMADI
Ishonchlilik: 72.5 %
