In [1]:
!pip install --quiet sentence-transformers scikit-learn pandas numpy

In [2]:
import numpy as np 
import pandas as pd
from sentence_transformers_model import LanguageModel
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

from helpers import read_config, split_train_test_by_title

# Seed NumPy's global random number generator with a fixed value.
np.random.seed(13)
# Load the configuration via the helper imported from `helpers`; the training
# function below reads config["sentence_transformers"]["model"] and
# config["train"]["knn"] from it.
config =  read_config()


  from tqdm.autonotebook import tqdm, trange


## Training functions

In [3]:
# Load the role/description dataset; the "Role" and "Description" columns are
# the label and text columns used by the experiments below.
df = pd.read_csv("data/roles_all_w_intern_wo_admin.csv")
# Bare expression so the notebook renders the frame as the cell output.
df

Unnamed: 0.1,Unnamed: 0,Role,Description
0,0,Administrative-Intern,Work experience prior to the MBA program in Sa...
1,1,Administrative-Intern,Work experience prior to the MBA program in Sa...
2,2,Administrative-Intern,Work experience prior to the MBA program in Sa...
3,3,Administrative-Intern,Work experience prior to the MBA program in Sa...
4,4,Administrative-Intern,Work experience prior to the MBA program in Sa...
...,...,...,...
1221,1221,User Experience & Design-Intern,Currently pursuing a MS or PhD in human comput...
1222,1222,User Experience & Design-Intern,Currently pursuing a MS or PhD in human comput...
1223,1223,User Experience & Design-Intern,Currently pursuing a MS degree in graphic desi...
1224,1224,User Experience & Design-Intern,Currently pursuing a MS degree in graphic desi...


In [4]:
def split_train_test_by_title(df, title_col="Category"):
    """Split ``df`` into train and test frames, holding out a few rows per title.

    For every distinct value of ``title_col`` that occurs more than once, up to
    three rows are sampled into the test set (with a fixed ``random_state`` so
    the split is repeatable) while always leaving at least one row for
    training. Titles that occur exactly once go entirely to the train set.

    Note: this notebook-level definition shadows the function of the same name
    imported from ``helpers`` in the imports cell.

    Args:
        df: Input DataFrame.
        title_col: Name of the column holding the title/label to stratify on.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames with a fresh 0..n-1 index.
        A side that receives no rows is an empty frame with ``df``'s columns.
        Rows whose title is missing are not part of either frame because
        ``value_counts()`` skips NaN by default.
    """
    # Collect the per-title pieces and concatenate once at the end instead of
    # growing a DataFrame inside the loop (quadratic copying, and concatenating
    # onto an empty frame triggers a pandas deprecation warning).
    train_parts = []
    test_parts = []

    title_counts = df[title_col].value_counts()

    for title, count in title_counts.items():
        title_group = df[df[title_col] == title]

        if count > 1:
            # Hold out at most 3 rows, but never the whole group.
            test_sample = title_group.sample(min(3, count - 1), random_state=43)
            test_parts.append(test_sample)
            # Everything that was not sampled stays in the train set.
            train_parts.append(title_group.drop(test_sample.index))
        else:
            # A single example cannot be split; keep it for training.
            train_parts.append(title_group)

    def _combine(parts):
        # pd.concat raises on an empty list, so fall back to an empty frame.
        if not parts:
            return pd.DataFrame(columns=df.columns)
        return pd.concat(parts, ignore_index=True)

    return _combine(train_parts), _combine(test_parts)

In [8]:
def train(train_data, test_data, text_column, label_column, config, split= False, hyperparam_tuning = False, distances = ["minkowski", "cosine"],
          ks= [10, 20, 30, 50, 70, 80, 100], weights = ["uniform", "distance"]):
    """Embed the text column and fit a k-nearest-neighbours classifier on it.

    Args:
        train_data: DataFrame providing ``text_column`` and ``label_column``
            for fitting.
        test_data: DataFrame with the same two columns; only read when
            ``split`` is True.
        text_column: Name of the column passed to the language model's
            ``encode``.
        label_column: Name of the column holding the class labels.
        config: Nested mapping. ``config["sentence_transformers"]["model"]`` is
            passed to ``LanguageModel``; ``config["train"]["knn"]`` supplies
            ``weights``, ``k`` and ``metric`` when ``hyperparam_tuning`` is
            False.
        split: When True, every fitted classifier is scored on ``test_data``
            and the accuracy is recorded.
        hyperparam_tuning: When True, fit one classifier per combination of
            ``distances`` x ``ks`` x ``weights``; otherwise fit a single
            classifier with the settings from ``config``.
        distances: Distance metrics to try (never mutated).
        ks: Neighbour counts to try (never mutated).
        weights: Neighbour weighting schemes to try (never mutated).

    Returns:
        ``(test_results, knn_classifier)``. ``test_results`` is a list of dicts
        with keys ``"distance"``, ``"weights"``, ``"k"`` and ``"accuracy"``
        (empty when ``split`` is False). ``knn_classifier`` is the last
        classifier that was fitted; in tuning mode that is the final grid
        combination, not necessarily the most accurate one.

    Raises:
        ValueError: If ``hyperparam_tuning`` is True and the grid is empty.
    """
    st_model = LanguageModel(config["sentence_transformers"]["model"])

    # NOTE(review): assumes encode() returns an array-like accepted by
    # scikit-learn's fit/predict — confirm against LanguageModel.
    embeddings = st_model.encode(train_data[text_column])
    labels = train_data[label_column].values

    # The held-out set is only needed for scoring, so skip the (slow) encoding
    # pass entirely when no evaluation was requested.
    if split:
        test_embeddings = st_model.encode(test_data[text_column])
        test_labels = test_data[label_column].values

    if hyperparam_tuning:
        # Same iteration order as nested loops: distance -> k -> weights.
        candidates = [(d, k, w) for d in distances for k in ks for w in weights]
        if not candidates:
            raise ValueError("hyperparam_tuning requires non-empty distances, ks and weights")
    else:
        knn_config = config["train"]["knn"]
        candidates = [(knn_config["metric"], knn_config["k"], knn_config["weights"])]

    test_results = []
    knn_classifier = None
    for metric, k, weighting in candidates:
        knn_classifier = KNeighborsClassifier(weights=weighting, n_neighbors=k, metric=metric)
        knn_classifier.fit(embeddings, labels)
        if split:
            y_pred = knn_classifier.predict(test_embeddings)
            accuracy = accuracy_score(test_labels, y_pred)
            # Record the settings that were actually used; the single-model
            # path previously referenced loop variables that did not exist.
            test_results.append({
                "distance": metric, "weights": weighting, "k": k, "accuracy": accuracy
            })

    return test_results, knn_classifier

## Train = (Raw Data); Test = (Raw Data)

In [8]:
# Hold out a few rows per role, then grid-search the KNN settings and score
# each combination on the held-out rows.
train_data, test_data = split_train_test_by_title(df, title_col="Role")
test_results, knn_classifier = train(
    train_data,
    test_data,
    text_column="Description",
    label_column="Role",
    config=config,
    split=True,
    hyperparam_tuning=True,
)
df_results = pd.DataFrame(test_results)
# Best-scoring combinations first.
df_results.sort_values(by="accuracy", ascending=False)

Batches: 100%|██████████| 36/36 [00:14<00:00,  2.56it/s]
Batches: 100%|██████████| 3/3 [00:01<00:00,  2.39it/s]


Unnamed: 0,distance,weights,k,accuracy
17,cosine,distance,20,0.746835
15,cosine,distance,10,0.721519
25,cosine,distance,80,0.721519
3,minkowski,distance,20,0.721519
19,cosine,distance,30,0.721519
27,cosine,distance,100,0.708861
23,cosine,distance,70,0.708861
5,minkowski,distance,30,0.708861
21,cosine,distance,50,0.708861
9,minkowski,distance,70,0.708861


## Train = (Raw Data); Test = (User template)

In [17]:

### Train on the raw data, test on user input formatted by the template
test_data = pd.read_csv("data/test_templates.csv")
test_results, knn_classifier = train(
    df,
    test_data,
    text_column="Description",
    label_column="Role",
    config=config,
    split=True,
    hyperparam_tuning=True,
)
df_results = pd.DataFrame(test_results)
# Best-scoring combinations first.
df_results.sort_values(by="accuracy", ascending=False)

Batches: 100%|██████████| 39/39 [00:14<00:00,  2.68it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.77it/s]


Unnamed: 0,distance,weights,k,accuracy
27,cosine,distance,100,0.357143
17,cosine,distance,20,0.357143
1,minkowski,distance,10,0.321429
18,cosine,uniform,30,0.321429
16,cosine,uniform,20,0.321429
15,cosine,distance,10,0.321429
0,minkowski,uniform,10,0.321429
3,minkowski,distance,20,0.321429
4,minkowski,uniform,30,0.321429
2,minkowski,uniform,20,0.321429


## Train = (Raw Data + Template rep); Test = (User template)

In [6]:

### Train on raw data plus the template representations, test on the user templates
test_data = pd.read_csv("data/test_templates.csv")
train_template = pd.read_json("data/template_tools.json")
# Stack the template rows under the raw rows and renumber the index.
train_data = pd.concat([df, train_template], ignore_index=True)
test_results, knn_classifier = train(
    train_data,
    test_data,
    text_column="Description",
    label_column="Role",
    config=config,
    split=True,
    hyperparam_tuning=True,
)
df_results = pd.DataFrame(test_results)
# Best-scoring combinations first.
df_results.sort_values(by="accuracy", ascending=False)

  incremental_indices = torch.cumsum(mask, dim=1).type_as(mask) * mask
Batches: 100%|██████████| 40/40 [00:15<00:00,  2.61it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.07it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Unnamed: 0,distance,weights,k,accuracy
1,minkowski,distance,10,0.535714
15,cosine,distance,10,0.535714
0,minkowski,uniform,10,0.464286
14,cosine,uniform,10,0.464286
3,minkowski,distance,20,0.428571
19,cosine,distance,30,0.428571
17,cosine,distance,20,0.428571
5,minkowski,distance,30,0.428571
16,cosine,uniform,20,0.392857
2,minkowski,uniform,20,0.392857


In [9]:
import pickle

# Fit a single classifier with the KNN settings from config, using the
# train_data / test_data left by an earlier cell. With split=False nothing is
# evaluated, so test_results is an empty list.
test_results, knn_classifier = train(
    train_data,
    test_data,
    text_column="Description",
    label_column="Role",
    config=config,
    split=False,
    hyperparam_tuning=False,
)
print(test_results)

# The context manager closes the file even if pickling raises.
with open('knnpickle_template_w_raw', 'wb') as knnPickle:
    pickle.dump(knn_classifier, knnPickle)

Batches: 100%|██████████| 40/40 [00:14<00:00,  2.73it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  6.61it/s]

[]





## Train = (Template rep); Test = (User template)

In [35]:

### Train on the template representations only, test on the user templates
test_data = pd.read_csv("data/test_templates.csv")
train_template = pd.read_json("data/template_tools.json")
# Only k=1 is tried for this template-only training set.
test_results, knn_classifier = train(
    train_template,
    test_data,
    text_column="Description",
    label_column="Role",
    config=config,
    split=True,
    hyperparam_tuning=True,
    ks=[1],
)
df_results = pd.DataFrame(test_results)
# Best-scoring combinations first.
df_results.sort_values(by="accuracy", ascending=False)

Batches: 100%|██████████| 1/1 [00:00<00:00,  2.58it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.82it/s]


Unnamed: 0,distance,weights,k,accuracy
0,minkowski,uniform,1,0.428571
1,minkowski,distance,1,0.428571
2,cosine,uniform,1,0.428571
3,cosine,distance,1,0.428571


# Testing with the new user templates (with suggestions)

## Train = (Raw Data); Test = (New User template)

In [39]:
# New user templates (with suggestions) are the test set; the raw data is
# re-read so this section does not depend on earlier changes to `df`.
user_test = pd.read_json("data/test_templates_w_sugg.json")
df = pd.read_csv("data/roles_all_w_intern_wo_admin.csv")

test_results, knn_classifier = train(
    df,
    user_test,
    text_column="Description",
    label_column="Role",
    config=config,
    split=True,
    hyperparam_tuning=True,
)
df_results = pd.DataFrame(test_results)
# Best-scoring combinations first.
df_results.sort_values(by="accuracy", ascending=False)

Batches: 100%|██████████| 39/39 [00:15<00:00,  2.48it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.09it/s]


Unnamed: 0,distance,weights,k,accuracy
4,minkowski,uniform,30,0.321429
19,cosine,distance,30,0.321429
18,cosine,uniform,30,0.321429
5,minkowski,distance,30,0.285714
21,cosine,distance,50,0.285714
0,minkowski,uniform,10,0.25
1,minkowski,distance,10,0.25
26,cosine,uniform,100,0.25
23,cosine,distance,70,0.25
22,cosine,uniform,70,0.25


## Train = (Raw Data + Template Rep); Test = (New User template)

In [40]:
# Train on raw data plus the template representations, test on the new user templates.
train_template = pd.read_json("data/template_tools.json")
train_data = pd.concat([df, train_template], ignore_index=True)
test_results, knn_classifier = train(
    train_data,
    user_test,
    text_column="Description",
    label_column="Role",
    config=config,
    split=True,
    hyperparam_tuning=True,
)
df_results = pd.DataFrame(test_results)
# Best-scoring combinations first.
df_results.sort_values(by="accuracy", ascending=False)

Batches: 100%|██████████| 40/40 [00:15<00:00,  2.58it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.80it/s]


Unnamed: 0,distance,weights,k,accuracy
1,minkowski,distance,10,0.428571
15,cosine,distance,10,0.428571
0,minkowski,uniform,10,0.357143
17,cosine,distance,20,0.357143
14,cosine,uniform,10,0.357143
3,minkowski,distance,20,0.357143
19,cosine,distance,30,0.321429
7,minkowski,distance,50,0.285714
25,cosine,distance,80,0.285714
23,cosine,distance,70,0.285714


## Train = (Template Rep); Test = (New User template)

In [41]:
# Train on the template representations only (k=1), test on the new user templates.
train_template = pd.read_json("data/template_tools.json")
test_results, knn_classifier = train(
    train_template,
    user_test,
    text_column="Description",
    label_column="Role",
    config=config,
    split=True,
    hyperparam_tuning=True,
    ks=[1],
)
df_results = pd.DataFrame(test_results)
# Best-scoring combinations first.
df_results.sort_values(by="accuracy", ascending=False)

Batches: 100%|██████████| 1/1 [00:01<00:00,  1.58s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.27it/s]


Unnamed: 0,distance,weights,k,accuracy
0,minkowski,uniform,1,0.392857
1,minkowski,distance,1,0.392857
2,cosine,uniform,1,0.392857
3,cosine,distance,1,0.392857


# Save the model

In [25]:
import pickle

# NOTE(review): `knn_classifier` is whatever the most recently executed
# training cell left behind — confirm it is the model intended for saving.
# The context manager closes the file even if pickling raises.
with open('knnpickle_file', 'wb') as knnPickle:
    pickle.dump(knn_classifier, knnPickle)