In [1]:
# import all libraries
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report
from scipy.sparse import hstack
import numpy as np

import warnings
warnings.simplefilter(action='ignore', category=Warning)

In [2]:
pip show scikit-learn

Name: scikit-learn
Version: 1.2.2
Summary: A set of python modules for machine learning and data mining
Home-page: http://scikit-learn.org
Author: 
Author-email: 
License: new BSD
Location: /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages
Requires: joblib, numpy, scipy, threadpoolctl
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip show sklearn

Name: sklearn
Version: 0.0.post4
Summary: deprecated sklearn package, use scikit-learn instead
Home-page: 
Author: 
Author-email: 
License: 
Location: /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages
Requires: 
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [36]:
# import the necessary libraries for the MySQL database connection
import os

import mysql.connector

# Connection settings are read from the environment so that real credentials
# never have to be written into the notebook. Each fallback is the value that
# was previously hard-coded here, so an unconfigured local setup behaves the
# same as before.
conn = mysql.connector.connect(
    host=os.environ.get("MYSQL_HOST", "localhost"),
    user=os.environ.get("MYSQL_USER", "root"),
    password=os.environ.get("MYSQL_PASSWORD", ""),
    database=os.environ.get("MYSQL_DATABASE", "user_answer"),
    port=int(os.environ.get("MYSQL_PORT", "3307")),
)

In [37]:
# open a cursor for a quick look at the raw rows
cursor = conn.cursor()
try:
    # example query to fetch all records from the "modelTraining" table
    cursor.execute("SELECT * FROM modelTraining")

    # fetch all the results from the previous query
    results = cursor.fetchall()
finally:
    # release the cursor whether or not the query succeeded
    cursor.close()

# print each record from the results
for row in results:
    print(row)

('carpenter', 'secondary school', 'part-time', 'only weekdays', 'wants to specialize in cabinetry', 'Construction Apprenticeships')
('marketing', 'master degree', 'only weekends', 'only weekdays', 'aims to harness social media platforms', 'Manage finances for new business ventures')
('hospitality', 'diploma', 'full-time', 'i already have a job', 'aspires to open a chain of cafes', 'Prepare to work safely in the construction industry')
('construction', 'high school', 'part-time', 'full-time', 'wants to master brick laying techniques', 'Builder Restricted licences')
('electrician', 'master degree', 'part-time', 'only weekends', 'keen on renewable energy systems', 'Home Electrical Installation and Safety')
('plumbing', 'diploma', 'only weekends', 'only weekdays', 'wishes to understand sewage systems better', 'Plumbing Business Management and Operations')
('painting', 'bachelor degree', 'full-time', 'part-time', 'dreams of having an art exhibition', 'Certificate IV in Building and Construc

In [14]:
# load the whole "modelTraining" table into a pandas dataframe over the open connection
df = pd.read_sql(sql="SELECT * FROM modelTraining", con=conn)

In [15]:
# preview the first five rows
df.head(n=5)

Unnamed: 0,interests,education,study_availability,work_availability,goals,recommendation
0,carpenter,secondary school,part-time,only weekdays,wants to specialize in cabinetry,Construction Apprenticeships
1,marketing,master degree,only weekends,only weekdays,aims to harness social media platforms,Manage finances for new business ventures
2,hospitality,diploma,full-time,i already have a job,aspires to open a chain of cafes,Prepare to work safely in the construction ind...
3,construction,high school,part-time,full-time,wants to master brick laying techniques,Builder Restricted licences
4,electrician,master degree,part-time,only weekends,keen on renewable energy systems,Home Electrical Installation and Safety


### First: NLP for the "future goals" text, so we can process the words and their meaning

In [16]:
# download necessary resources from nltk
# nltk.word_tokenize (called in preprocess_text) needs the Punkt tokenizer
# models: older NLTK releases look for 'punkt', newer ones for 'punkt_tab'.
# Requesting a name the installed release does not know only logs an error.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ssanjua/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ssanjua/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [17]:
# English stopwords kept as a set for membership tests, plus the WordNet lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [19]:
# function to preprocess text
def preprocess_text(text):
    """Normalise a piece of free text before TF-IDF vectorisation.

    The text is lowercased, every character other than a-z and whitespace is
    replaced by a space, the result is tokenized with ``nltk.word_tokenize``,
    tokens found in ``stop_words`` are dropped and the remaining tokens are
    lemmatized with ``lemmatizer``.

    Parameters
    ----------
    text : str or None
        Raw text. ``None`` and float NaN (missing values) are treated as
        empty text; any other non-string value is converted with ``str``.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces ('' when the
        input is missing).
    """
    # missing value (None, or NaN which is the only float unequal to itself)
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # lowercase
    text = str(text).lower()

    # replace special characters and numbers with a space (not an empty
    # string) so words joined by punctuation, e.g. "part-time", stay separate
    text = re.sub(r'[^a-z\s]', ' ', text)

    # tokenize the text and lemmatize tokens that are not stopwords
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    return ' '.join(tokens)

In [20]:
# sanity check: filler words such as 'to', 'and' and 'with' should be gone from the printed result
sample_text = "I want to work outside and with people"
print(preprocess_text(text=sample_text))

want work outside people


In [27]:
# clean every entry of the "goals" column with preprocess_text
goals_processed = df['goals'].map(preprocess_text).tolist()

# learn the TF-IDF vocabulary from the cleaned goals and build the document-term matrix
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(goals_processed)

In [28]:
# overwrite the "goals" column of the dataframe with its preprocessed version
df['goals'] = df['goals'].map(preprocess_text)

#### Now we will test the model and tune the hyperparameters of the decision tree until the accuracy is good. Once the result is satisfactory, we will create the function for training.

In [44]:
# separate the categorical predictors from the target; the free-text "goals"
# column is left out here because the TF-IDF matrix built earlier stands in for it
features = df.drop(columns=['recommendation', 'goals'])
target = df['recommendation']

# convert categorical variables to numerical format
encoder = OneHotEncoder(drop='first')
X_categorical = encoder.fit_transform(features).toarray()
X = hstack([X_categorical, X_tfidf]).toarray()
y = target

# split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# train a decision tree classifier; random_state is fixed so that the tree,
# and therefore the accuracy reported below, is the same on every run
clf = DecisionTreeClassifier(max_depth=12, random_state=42)
clf.fit(X_train, y_train)

# make predictions on the test set
y_pred = clf.predict(X_test)

# evaluate the performance of the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 92.86%

Classification Report:
                                                      precision    recall  f1-score   support

                        Builder Restricted licences       1.00      1.00      1.00         2
             Certificate II in Engineering Pathways       1.00      1.00      1.00         3
   Certificate III in Engineering Fabrication Trade       0.00      0.00      0.00         1
        Certificate IV in Building and Construction       1.00      1.00      1.00         2
                       Construction Apprenticeships       1.00      1.00      1.00         4
            Home Electrical Installation and Safety       1.00      0.83      0.91         6
          Manage finances for new business ventures       0.60      1.00      0.75         3
        Plumbing Business Management and Operations       1.00      1.00      1.00         2
Prepare to work safely in the construction industry       1.00      1.00      1.00         5

                          

In [32]:
def train_model(data):
    """Fit the one-hot encoder, TF-IDF vectorizer and decision tree on `data`.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows. Must contain a 'goals' text column and a
        'recommendation' target column; every remaining column is one-hot
        encoded as a categorical feature.

    Returns
    -------
    tuple
        ``(encoder, vectorizer, clf)``: the fitted OneHotEncoder, the fitted
        TfidfVectorizer and the fitted DecisionTreeClassifier, in that order.
    """
    # preprocess the 'goals' column of the frame that was passed in (not the
    # notebook-level dataframe) and vectorize it with TF-IDF
    goals_processed = [preprocess_text(text) for text in data['goals']]
    vectorizer = TfidfVectorizer()
    X_tfidf = vectorizer.fit_transform(goals_processed)

    # everything except the target and the free-text column is categorical
    target = data['recommendation']
    features_without_goals = data.drop(columns=['recommendation', 'goals'])

    # convert categorical variables to numerical format using One-Hot Encoding
    # NOTE(review): with drop='first' plus handle_unknown='ignore', a category
    # unseen during fit is presumably encoded as all zeros, i.e. the same as
    # the dropped first category; confirm this is acceptable.
    encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
    X_categorical = encoder.fit_transform(features_without_goals).toarray()

    # combine categorical features with the TF-IDF matrix for goals
    X = hstack([X_categorical, X_tfidf]).toarray()
    y = target

    # the classifier is fitted on the 80% training portion of a seeded split;
    # the held-out 20% is not used inside this function
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)

    # train a decision tree classifier; random_state makes repeated calls on
    # the same data produce the same tree
    clf = DecisionTreeClassifier(max_depth=15, random_state=42)
    clf.fit(X_train, y_train)

    return encoder, vectorizer, clf


In [34]:
def recommend_course(interests, education, study_availability, work_availability, goals, encoder, vectorizer, clf):
    """Predict a course recommendation for one user's answers.

    The free-text `goals` is cleaned with preprocess_text and turned into a
    TF-IDF row by `vectorizer`; the four categorical answers are encoded by
    `encoder`; both parts are joined side by side and passed to
    `clf.predict`. The first (only) prediction is returned.
    """
    # text branch: clean the goals, then project them onto the fitted vocabulary
    goals_tfidf = vectorizer.transform([preprocess_text(goals)])

    # categorical branch: a one-row frame with the four categorical answers
    categorical_row = pd.DataFrame({
        'interests': [interests],
        'education': [education],
        'study_availability': [study_availability],
        'work_availability': [work_availability],
    })
    categorical_encoded = encoder.transform(categorical_row).toarray()

    # categorical columns first, TF-IDF columns second
    feature_row = np.hstack([categorical_encoded, goals_tfidf.toarray()])

    predictions = clf.predict(feature_row)
    return predictions[0]

# fit the encoder, vectorizer and classifier on the dataframe
encoder, vectorizer, clf = train_model(df)

# To test the recommendation function, lets hardcode some data:
course = recommend_course(
    interests="painting",
    education="masters",
    study_availability="part-time",
    work_availability="full-time",
    goals="become more human",
    encoder=encoder,
    vectorizer=vectorizer,
    clf=clf,
)
print(course)


Certificate IV in Building and Construction


In [None]:
# persist the fitted encoder, vectorizer and classifier with joblib

from joblib import dump, load

# refit right before saving so the three saved objects come from the same train_model call
encoder, vectorizer, clf = train_model(df)

# one file per object, written to the current working directory
dump(value=encoder, filename='encoder.joblib')
dump(value=vectorizer, filename='vectorizer.joblib')
dump(value=clf, filename='clf.joblib')

In [46]:
# all database work is done, so close the MySQL connection
conn.close()