In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the dataset from 'cleaned_data.csv'. The path is relative, so it is
# resolved against the kernel's current working directory.
df = pd.read_csv('cleaned_data.csv')

In [None]:
# Columns kept out of the feature matrix: the target itself plus the other
# minor-related columns listed here (presumably only known once a minor has
# been chosen, so they would leak the answer -- TODO confirm).
to_remove = [
    'minor_chosen',
    'was_there_any_peer_influence_while_choosing_minor',
    'did_you_get_the_minor_you_wanted?',
    'do_you_feel_you_should_have_chosen_a_different_minor?',
    'satisfaction_with_the_chosen_minor',
]

# Target first, then the features with every excluded column dropped.
y = df["minor_chosen"]
X = df.drop(to_remove, axis=1)
required_cols = X.columns


In [None]:
# Collect the names of every numeric feature column as a plain Python list.
numeric_features = X.select_dtypes(include="number")
ordinal_cols = list(numeric_features.columns)
ordinal_cols

['proficiency_in_python',
 'proficiency_in_dbms,_sql',
 'proficiency_in_html,_css,_javascript',
 'proficiency_in_backend_development',
 'proficiency_in_probability_&_statistics',
 'proficiency_in_linear_algebra',
 'proficiency_in_data_handling(pandas,_excel)',
 'proficiency_in_machine_learning_algorithms',
 'proficiency_in_cybersecurity_basics',
 'proficiency_in_networking',
 'proficiency_in_cloud_basics',
 'proficiency_in_iot_basics']

In [None]:
# Names of the object-dtype feature columns; anything listed in `to_remove`
# is filtered out as a safety net.
object_columns = X.select_dtypes(include="object").columns
nominal_cols = [col for col in object_columns if col not in to_remove]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
# NOTE(review): the split is not stratified -- consider `stratify=y` if the
# class distribution is imbalanced and every class has at least two rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


In [None]:
# Numeric columns are forwarded untouched; object columns are one-hot
# encoded, with categories unseen during fit ignored at transform time.
preprocessor = ColumnTransformer([
    ("ordinal", "passthrough", ordinal_cols),
    ("nominal", OneHotEncoder(handle_unknown="ignore"), nominal_cols),
])


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


In [None]:
# Candidate classifiers, keyed by the display name used in the results table.
# Every estimator that accepts a seed gets the same fixed `random_state` so
# that re-running the notebook reproduces the reported scores.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(
        n_estimators=300, random_state=42
    ),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),

    "Decision Tree": DecisionTreeClassifier(
        max_depth=10, random_state=42
    ),
    # probability=True makes SVC fit an internal cross-validated calibration
    # whose data shuffling is random; without a seed the predict_proba
    # output (and anything ranked from it) changes from run to run.
    "SVM": SVC(kernel="rbf", probability=True, random_state=42)
}


In [None]:
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd

# One row of metrics per model, and the fitted pipeline for each model name.
results = []
trained_pipelines = {}

for name, model in models.items():
    # Pipeline.fit fits its steps in place, so reusing the single
    # `preprocessor` object would make every stored pipeline share (and
    # repeatedly overwrite) the same fitted transformer. Cloning gives each
    # pipeline its own independent, unfitted copy.
    pipe = Pipeline([
        ("preprocessor", clone(preprocessor)),
        ("model", model)
    ])

    pipe.fit(X_train, y_train)

    # Score on the held-out split only.
    y_pred = pipe.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    # Macro averaging weights every class equally regardless of its size.
    f1 = f1_score(y_test, y_pred, average="macro")

    results.append({
        "Model": name,
        "Accuracy": acc,
        "F1 Macro": f1
    })

    trained_pipelines[name] = pipe


In [None]:
from IPython.display import display
# Show each fitted pipeline under its model name, followed by a separator
# rule and a blank line.
separator = '-' * 60
for model_name, fitted_pipeline in trained_pipelines.items():
    print(model_name)
    display(fitted_pipeline)
    print(separator, end='\n\n')

Logistic Regression


------------------------------------------------------------

Random Forest


------------------------------------------------------------

Gradient Boosting


------------------------------------------------------------

Decision Tree


------------------------------------------------------------

SVM


------------------------------------------------------------



In [None]:
# for i in trained_pipelines:
#     print(trained_pipelines[i].named_steps)

In [None]:
# Tabulate the per-model metrics, best accuracy first.
results_table = pd.DataFrame(results)
results_df = results_table.sort_values("Accuracy", ascending=False)
results_df


Unnamed: 0,Model,Accuracy,F1 Macro
4,SVM,0.811966,0.752475
0,Logistic Regression,0.786325,0.759755
1,Random Forest,0.777778,0.721246
2,Gradient Boosting,0.777778,0.721253
3,Decision Tree,0.666667,0.60947


In [None]:
# The table is sorted by accuracy in descending order, so its first row
# names the winning model.
# NOTE(review): the ranking is based on test-set accuracy, so the winner's
# test score is an optimistic estimate -- consider cross-validation on the
# training split for model selection.
best_model_name = results_df["Model"].iloc[0]
final_pipeline = trained_pipelines[best_model_name]

best_model_name


'SVM'

In [None]:
import numpy as np

def recommend_top_2(pipeline, X, k=2):
    """Return the ``k`` most probable classes for every row of ``X``.

    Parameters
    ----------
    pipeline : fitted classifier or pipeline
        Must expose ``predict_proba(X)`` and a ``classes_`` attribute whose
        order matches the probability columns.
    X : array-like or DataFrame
        Rows to score; passed unchanged to ``pipeline.predict_proba``.
    k : int, default 2
        Number of classes to return per row. If ``k`` exceeds the number of
        classes, all classes are returned.

    Returns
    -------
    list of numpy.ndarray
        One array per input row holding the selected class labels, ordered
        from most to least probable.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    # Guard explicitly: a slice of ``[-0:]`` would silently select every class.
    if k < 1:
        raise ValueError("k must be a positive integer")

    probs = pipeline.predict_proba(X)
    classes = np.asarray(pipeline.classes_)

    # argsort is ascending, so take the last k indices and reverse them to
    # get the highest-probability class first.
    return [classes[np.argsort(p)[-k:][::-1]] for p in probs]


In [None]:
top2_preds = recommend_top_2(final_pipeline, X_test)

# Count the test rows whose true label appears among the recommendations,
# then turn the count into a fraction of the test set.
top2_hits = sum(
    actual in suggested
    for actual, suggested in zip(y_test, top2_preds)
)
top2_accuracy = top2_hits / len(y_test)

top2_accuracy


0.9572649572649573

In [None]:
# Plain accuracy of the selected pipeline on the held-out rows.
top1_accuracy = accuracy_score(y_test, final_pipeline.predict(X_test))

# Report both metrics side by side, rounded to three decimals.
for label, score in (
    ("Top-1 Accuracy:", top1_accuracy),
    ("Top-2 Accuracy:", top2_accuracy),
):
    print(label, round(score, 3))


Top-1 Accuracy: 0.812
Top-2 Accuracy: 0.957


In [None]:
# Demo: take the first row of X as a one-row DataFrame (the list index keeps
# it two-dimensional) and ask the selected pipeline for its recommendations.
# NOTE(review): this row comes from the full feature matrix, so it may have
# been part of the training split.
new_student = X.iloc[[0]]
recommend_top_2(final_pipeline, new_student)


[array(['Data Science', 'Machine learning'], dtype=object)]