In [None]:
# Mount Google Drive at /content/drive; the dataset and model paths used
# further down in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install scikit-learn joblib



In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV, GroupKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# Load & clean the data
file_path = '/content/drive/My Drive/Thesis_MKB/MKB_final_dataset.xlsx'
df = pd.read_excel(file_path)
df.columns = df.columns.str.strip()  # strip stray whitespace around header names

# Keep KvK as the group key before it is dropped from the features; it is split
# together with X/y below and later handed to GroupKFold.
groups = df['KvK']

# Columns that are not used as model features.
drop_cols = ["KvK","Handelsnaam","Status","Education Level",
             "Contact Person","Number of Followers",
             "Language","Company Time Investment","Company Industry","Academic Year"]
# errors='ignore': columns that are absent from the sheet are skipped silently.
df_cleaned = df.drop(columns=drop_cols, errors='ignore')

# Group specific institutions into Top-Tier category
df_cleaned["Academic Institution"] = df_cleaned["Academic Institution"].replace({
    "ROC Amsterdam":"Top-Tier","ROC Flevoland":"Top-Tier",
    "Hotelschool":"Top-Tier","SRH Haarlem University":"Top-Tier"
})

#df_cleaned["Company Time Investment"] = df_cleaned["Company Time Investment"].replace({"High":"Medium"}) - can be added if needed

# 20% threshold for NAs (see thesis)
# Assign the result back instead of calling fillna(inplace=True) on
# df_cleaned[col]: the in-place form operates on an intermediate object and,
# per the pandas warning, stops modifying df_cleaned in pandas 3.0.
for col in ["Duration","Student Time Investment"]:
    df_cleaned[col] = df_cleaned[col].fillna("Missing")

y = df_cleaned["Match or Not"]
X = df_cleaned.drop(columns=["Match or Not"])

# NOTE(review): this hold-out split is stratified on y but not group-aware, so
# rows sharing a KvK can end up in both train and test — confirm this is intended.
X_train, X_test, y_train, y_test, groups_train, groups_test = train_test_split(
    X, y, groups, test_size=0.2, stratify=y, random_state=42
)

# Preprocessing: columns are routed to a pipeline according to their dtype

cat_cols = list(X_train.select_dtypes(include=["object"]).columns)
num_cols = list(X_train.select_dtypes(include=["int64", "float64"]).columns)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode (first level dropped, unseen levels ignored at predict time).
categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(drop="first", handle_unknown="ignore")),
])

# Numerical columns: fill gaps with the median; no scaling is applied.
numerical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
])

# Single transformer that applies each pipeline to its own column list
preprocessor = ColumnTransformer(transformers=[
    ("cat", categorical_pipeline, cat_cols),
    ("num", numerical_pipeline, num_cols),
])

# Build and tune the logistic-regression model

# Full estimator: preprocessing followed by the classifier, so the fitted
# object can be applied directly to raw feature rows.
pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000))
])

# 4 values of C x 2 penalties x 1 solver = 8 candidates; liblinear is used
# because it supports both the l1 and the l2 penalty.
param_grid = {
    "classifier__C": [0.01,0.1,1,10],
    "classifier__penalty": ["l1","l2"],
    "classifier__solver":  ["liblinear"]
}

# Use GroupKFold for validation (ensures SME grouping is preserved)
group_kfold = GroupKFold(n_splits=5)

# The estimator passed here must be `pipe` (defined above); the previous
# reference to an undefined `pipeline` raised NameError on a fresh kernel.
grid = GridSearchCV(
    pipe,
    param_grid,
    cv=group_kfold,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=1
)
# groups_train is forwarded to GroupKFold so each fold keeps a KvK together.
grid.fit(X_train, y_train, groups=groups_train)

# Persist the refitted best pipeline from the grid search
best_model = grid.best_estimator_

# Target location on the mounted Drive; change it if the folder layout differs
MODEL_PATH = '/content/drive/My Drive/Thesis_MKB/mkb_app/lr4_model.joblib'
joblib.dump(best_model, MODEL_PATH)
print("Saved pipeline to", MODEL_PATH)

# Final evaluation on the held-out test split
y_prob = best_model.predict_proba(X_test)[:, 1]  # second column of predict_proba
y_pred = best_model.predict(X_test)

test_auc = roc_auc_score(y_test, y_prob)
print("Test AUC:", test_auc)

test_report = classification_report(y_test, y_pred)
print("\n", test_report)


Fitting 5 folds for each of 8 candidates, totalling 40 fits


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned[col].fillna("Missing", inplace=True)


Saved pipeline to /content/drive/My Drive/Thesis_MKB/mkb_app/lr4_model.joblib
Test AUC: 0.7990339737327689

               precision    recall  f1-score   support

           0       0.67      0.77      0.72        83
           1       0.81      0.72      0.76       111

    accuracy                           0.74       194
   macro avg       0.74      0.75      0.74       194
weighted avg       0.75      0.74      0.74       194



In [None]:
# Besides the web page, the predictor tool can also be used directly in this notebook
import pandas as pd
import joblib
import ipywidgets as widgets
from IPython.display import display, Markdown

# Reload the dataset and re-apply the training-time cleaning so the dropdowns
# offer the same category values the model was fitted on.
file_path = '/content/drive/My Drive/Thesis_MKB/MKB_final_dataset.xlsx'
df = pd.read_excel(file_path)
df.columns = df.columns.str.strip()

drop_cols = ["KvK","Handelsnaam","Status","Education Level",
             "Contact Person","Number of Followers",
             "Language","Company Time Investment","Company Industry","Academic Year"]
df = df.drop(columns=drop_cols, errors='ignore')
df["Academic Institution"] = df["Academic Institution"].replace({
    "ROC Amsterdam":"Top-Tier","ROC Flevoland":"Top-Tier",
    "Hotelschool":"Top-Tier","SRH Haarlem University":"Top-Tier"
})
#df["Company Time Investment"] = df["Company Time Investment"].replace({"High":"Medium"})

# Assign the result back instead of calling fillna(inplace=True) on df[col]:
# the in-place form works on an intermediate object and, per the pandas
# warning, stops modifying df in pandas 3.0 (so "Missing" would never appear
# among the dropdown options).
for col in ["Duration","Student Time Investment"]:
    df[col] = df[col].fillna("Missing")

# load trained model
model = joblib.load('/content/drive/My Drive/Thesis_MKB/mkb_app/lr4_model.joblib')

# One dropdown per model input.
# Spec: column name -> (widget label, cast the values to str before listing them?)
dropdown_specs = {
    'Number of FTE': ('FTE', False),
    'Challenge Type': ('Type', False),
    'Student Time Investment': ('Stud TI', False),
    'Duration': ('Duration', False),
    'Academic Institution': ('Institution', True),
}

widgets_dict = {}
for column, (label, as_text) in dropdown_specs.items():
    observed = df[column].dropna()
    if as_text:
        observed = observed.astype(str)
    # Options are the sorted distinct values observed in the cleaned data
    widgets_dict[column] = widgets.Dropdown(
        options=sorted(observed.unique()),
        description=label,
    )

# Stack the dropdowns vertically, in the order they were declared
display(widgets.VBox(list(widgets_dict.values())))

# Output area for the result, and the button that triggers the estimate
out = widgets.Output()
btn = widgets.Button(description="Estimate Match Probability")


def on_click(b):
    """Render the predicted match probability for the current dropdown selection.

    `b` is the argument the button passes to its click handler; it is unused.
    The result replaces whatever was previously shown in `out`.
    """
    with out:
        out.clear_output()
        # Build a one-row frame whose columns are the widget keys
        selection = {}
        for name, dropdown in widgets_dict.items():
            selection[name] = [dropdown.value]
        row = pd.DataFrame(selection)
        # First (only) row, second column of predict_proba
        prob = model.predict_proba(row)[0, 1]
        message = f"## Predicted probability of a definitive match: **{prob:.1%}**"
        display(Markdown(message))


btn.on_click(on_click)
display(btn, out)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna("Missing", inplace=True)


VBox(children=(Dropdown(description='FTE', options=(np.float64(0.0), np.float64(1.0), np.float64(2.0), np.floa…

Button(description='Estimate Match Probability', style=ButtonStyle())

Output()