In [1]:
import os
import pickle
import warnings

import joblib
import pandas as pd
from sklearn.cluster import KMeans,AgglomerativeClustering
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sqlalchemy import create_engine,text

from classes import ColumnDropperTransformer # This function is inside the pickled model
from classes import ColumnOneHotEncoder
from classes import ColumnOrdinalEncoder
from classes import ModelScorer
from classes import InterestsTransformer
warnings.filterwarnings("ignore")

In [15]:
# The connection string is read from the environment so that credentials are
# never stored in the notebook. Set it before starting the kernel, e.g.:
#   export DATABASE_URL='postgresql://<user>:<password>@<host>:5432/<database>'
# NOTE(review): a database password used to be written literally in this cell;
# it should be considered leaked and rotated.
url = os.environ.get('DATABASE_URL')
if not url:
    raise RuntimeError(
        "DATABASE_URL is not set; export the PostgreSQL connection string "
        "before running this notebook."
    )


def check_db(query):
    """Run a SQL query against the database at ``url`` and return a DataFrame.

    Parameters
    ----------
    query : str
        SQL statement to execute; it is wrapped in ``sqlalchemy.text``.

    Returns
    -------
    pandas.DataFrame
        The rows returned by the query.
    """
    engine = create_engine(url)
    try:
        # The context manager returns the connection when the read finishes,
        # instead of leaving it open for the lifetime of the kernel.
        with engine.connect() as connection:
            return pd.read_sql_query(text(query), con=connection)
    finally:
        # A new engine is created per call, so release its pool as well.
        engine.dispose()


df = check_db("SELECT * FROM students")
df

Unnamed: 0,student_id,email,name,surname,gender,age,programme,year,category_id,category
0,1,zairaverdugo@edem.es,Zaira,Verdugo,female,19,BBA in Business Administration,4,"[1, 12, 5, 4]","[Marketing, Sustainability, Tech, Business Man..."
1,2,purificaciónllanos@edem.es,Purificación,Llanos,female,18,BSc in Engineering and Management,1,"[5, 12, 2, 11, 15, 9, 14, 7]","[Tech, Sustainability, Finance and Investment,..."
2,3,soledadpalomares@edem.es,Soledad,Palomares,female,18,BBA in Business Administration,1,"[1, 2]","[Marketing, Finance and Investment]"
3,4,jesúsalberola@edem.es,Jesús,Alberola,male,18,BSc in Engineering and Management,4,"[8, 9, 10, 1, 15, 4, 12]","[Crypto, Sport, Economy, Marketing, HR, Busine..."
4,5,vidalbaena@edem.es,Vidal,Baena,male,27,Master Marketing and Digital Sales,1,"[8, 16, 10, 5, 1, 3, 7, 6, 13, 2, 14, 11, 12, 4]","[Crypto, Employment, Economy, Tech, Marketing,..."
...,...,...,...,...,...,...,...,...,...,...
729,730,guillermoguillén@edem.es,Guillermo,Guillén,male,23,Master Marketing and Digital Sales,1,"[16, 12, 2, 3]","[Employment, Sustainability, Finance and Inves..."
730,731,teodosioisern@edem.es,Teodosio,Isern,male,19,BBA in Business Administration,2,"[15, 9, 2, 16, 10, 14, 4, 8, 11]","[HR, Sport, Finance and Investment, Employment..."
731,732,raúlmercader@edem.es,Raúl,Mercader,male,19,BBA in Business Administration,4,"[2, 15, 6, 9, 1, 8, 11, 13, 10, 5]","[Finance and Investment, HR, Entrepreneurship,..."
732,733,conradooliva@edem.es,Conrado,Oliva,male,23,MBA Junior,1,"[14, 6, 13, 5, 16, 3, 1, 12, 9, 10, 15]","[Music, Entrepreneurship, Design, Tech, Employ..."


In [16]:
# Human-readable labels for the study-year codes found in the 'year' column.
# The keys are strings; the rendered output shows the replacement taking
# effect, so the column presumably holds string codes -- confirm the dtype.
year_labels = ('1st year', '2nd year', '3rd year', '4th year')
mapping = {str(position): label for position, label in enumerate(year_labels, start=1)}

# Swap each code in 'year' for its label; values without a mapping entry are kept.
df['year'] = df['year'].replace(mapping)

df.head()

Unnamed: 0,student_id,email,name,surname,gender,age,programme,year,category_id,category
0,1,zairaverdugo@edem.es,Zaira,Verdugo,female,19,BBA in Business Administration,4th year,"[1, 12, 5, 4]","[Marketing, Sustainability, Tech, Business Man..."
1,2,purificaciónllanos@edem.es,Purificación,Llanos,female,18,BSc in Engineering and Management,1st year,"[5, 12, 2, 11, 15, 9, 14, 7]","[Tech, Sustainability, Finance and Investment,..."
2,3,soledadpalomares@edem.es,Soledad,Palomares,female,18,BBA in Business Administration,1st year,"[1, 2]","[Marketing, Finance and Investment]"
3,4,jesúsalberola@edem.es,Jesús,Alberola,male,18,BSc in Engineering and Management,4th year,"[8, 9, 10, 1, 15, 4, 12]","[Crypto, Sport, Economy, Marketing, HR, Busine..."
4,5,vidalbaena@edem.es,Vidal,Baena,male,27,Master Marketing and Digital Sales,1st year,"[8, 16, 10, 5, 1, 3, 7, 6, 13, 2, 14, 11, 12, 4]","[Crypto, Employment, Economy, Tech, Marketing,..."


In [17]:
# Collect every distinct interest category in first-seen order.
# dict.fromkeys de-duplicates while preserving insertion order, which avoids
# both the per-row Series construction of DataFrame.iterrows() and the
# linear `not in list` scan for every element.
list_category = list(dict.fromkeys(
    element
    for categories in df['category']
    for element in categories
))

In [18]:
# Display the distinct categories gathered from the 'category' column.
list_category

['Marketing',
 'Sustainability',
 'Tech',
 'Business Management',
 'Finance and Investment',
 'Networking',
 'HR',
 'Sport',
 'Music',
 'Artificial Intelligence',
 'Crypto',
 'Economy',
 'Employment',
 'Management Skills',
 'Entrepreneurship',
 'Design']

In [19]:
# Show the column labels of the loaded students frame.
df.columns

Index(['student_id', 'email', 'name', 'surname', 'gender', 'age', 'programme',
       'year', 'category_id', 'category'],
      dtype='object')

In [20]:
# Step-by-step run of the preprocessing transformers so each intermediate
# frame can be inspected. The transformers come from the project's `classes`
# module; their exact behaviour is not visible in this notebook.
columns_to_drop = ['student_id','email','name','surname','gender','category_id']
interest_columns = ['Sustainability','Marketing','Management Skills','Tech','Business Management',
                    'Finance and Investment','Networking','HR','Sport','Music',
                    'Artificial Intelligence','Crypto','Economy','Employment',
                    'Entrepreneurship','Design']

# 1) drop the listed columns, 2) apply the interests transformer to 'category',
# 3) drop the 'category' column, 4) one-hot encode 'programme' and the interest columns.
df1 = ColumnDropperTransformer(columns_to_drop).transform(df)
df2 = InterestsTransformer(['category']).transform(df1)
df3 = ColumnDropperTransformer(['category']).transform(df2)
df4 = ColumnOneHotEncoder(['programme', *interest_columns]).transform(df3)

# NOTE(review): in the rendered output of this cell every interest dummy column
# is named `<interest>_` and equals 1.0 in all displayed rows -- presumably the
# interest columns carry a single constant value at this point; verify
# InterestsTransformer in `classes`.
df4

Unnamed: 0,age,year,programme_BBA in Business Administration,programme_BSc in Engineering and Management,programme_Bootcamp Cybersecurity,programme_Bootcamp Data Science,programme_Bootcamp Full Stack,programme_Bootcamp UX/ UI,programme_MBA Junior,programme_Master Data Analytics,...,Networking_,HR_,Sport_,Music_,Artificial Intelligence_,Crypto_,Economy_,Employment_,Entrepreneurship_,Design_
0,19,4th year,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,18,1st year,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,18,1st year,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,18,4th year,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,27,1st year,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
729,23,1st year,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
730,19,2nd year,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
731,19,4th year,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
732,23,1st year,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


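Keep the following columns:
- gender: string (female, male) --> categorical column --> One-Hot Encoder
- age: integer (18 - 50) --> numerical column --> no transformation needed
- year_of_study: string (1st year - 4th year) --> categorical column --> Ordinal Encoder, ordered from least to most important: 4th year - 1st year
- prog_maj_title: string --> categorical column --> One-Hot Encoder
- interests: list of strings --> categorical column --> One-Hot Encoder (for the sake of simplicity, I will exclude it)

Note: in this notebook's data the columns `year_of_study`, `prog_maj_title` and `interests` are named `year`, `programme` and `category`. The code cells drop `gender` and do pass `category` through the interests transformer, so the list above does not fully match the code.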

In [21]:
# Category order handed to the ordinal encoder for the 'year' column.
ordinal_year_of_study = {'year': ['4th year', '3rd year', '2nd year', '1st year']}

# Column groups used by the preprocessing steps.
pipeline_dropped_columns = ['student_id','email','name','surname','gender','category_id']
pipeline_onehot_columns = ['programme','Sustainability','Marketing','Management Skills','Tech',
                           'Business Management','Finance and Investment','Networking','HR',
                           'Sport','Music','Artificial Intelligence','Crypto','Economy',
                           'Employment','Entrepreneurship','Design']

# Original author's note on n_clusters=20: 20 is believed to be the recommended
# upper bound on the number of clusters (20 groups for 734 students) -- unverified.
clustering_model = KMeans(max_iter=2000, n_clusters=20, n_init=40, random_state=42)

# Steps run in order: drop columns -> interests transformer -> drop 'category'
# -> one-hot encode -> ordinal-encode 'year' -> impute most frequent value
# -> standardise -> fit the scored KMeans model.
pipeline_steps = [
    ('columndropper1', ColumnDropperTransformer(pipeline_dropped_columns)),
    ('interests transformer', InterestsTransformer(['category'])),
    ('columndropper2', ColumnDropperTransformer(['category'])),
    ('columnonehotencoder', ColumnOneHotEncoder(pipeline_onehot_columns)),
    ('columnordinalencoder', ColumnOrdinalEncoder(**ordinal_year_of_study)),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', StandardScaler()),
    ('model', ModelScorer(clustering_model)),
]
pipeline = Pipeline(pipeline_steps)

# Fit the whole pipeline on the students frame.
pipeline.fit(df)

In [22]:
# Get the ModelScorer model
model_scorer = pipeline.named_steps['model']

# Get the inertia score
inertia = model_scorer.inertia

# Get the silhouette score
silhouette = model_scorer.silhouette

print("Inertia Score:", inertia)
print("Silhouette Score:", silhouette, "\n")

# Get the predicted labels
labels = model_scorer.labels

# Print the predicted labels
print("Predicted Labels:\n", labels)

Inertia Score: 73.75617008775586
Silhouette Score: 0.7684696682263233 

Predicted Labels:
 [16 11  0 18  3 16  2 11 13 11 18 13 11 18 16 10 18 10  3 11  3 16  0  1
  8  0 15 15  4 10  3 16 10  5  0 10  4 10  6 10 16 11  0  0 11  0 15  0
 10  0  0  0 18  0  2 16 10  1 16  1  0  9 16 10 16  4  3  0 11  0  0 16
 18  8  6  0 16  0  2 13  0  0 10  0 10  2 13 18  7 13 15 11 15 10  3 10
 13 11  0  7 10  5  6 11 16 16 10  0 16 15 13 13 17  6  0 13  1  2  6  0
  1  3 10  3 10 11  0 11  2  6  4  0 11  6 14 11 13 11  3  3 15 13 10  0
  6 13 18 10 16 18  0  7 16 13 13 15  0  0 10  0 18 11 10  0  2  6 11  0
 16 18 15  2 10 13  3  3 18 15 13 13 13 16 16  9 15 13  0  6  0 18 13 13
  0  0  2 13  6 10 11  7  4 12  2  4  1 13 18 13 11  0  9  0  3 13 13  8
  7 10 16  1  0 12 10 10  3 11 10 13  4  1 17 11 18 10 16  0 15  2  0 10
 10 13 10 13  2 18 16  0  0 16  0  0  3 11 13 19 19 11 18  0  2  6  6  1
  4 16 10  9 16 11  3  0 16  2 15 15 15 10  0 10 16 18 10  0 13  2  4 15
 13  0 11  3 16  0  3 10 13 11 11

Comment:
- Including the interests in the model would probably improve its performance.

In [23]:
# Persist the fitted pipeline to disk. The pipeline contains the custom
# transformers imported from `classes`, so that module presumably has to be
# importable wherever 'pipeline.pkl' is loaded.
joblib.dump(pipeline, filename='pipeline.pkl')

['pipeline.pkl']