In [1]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report
from scipy.sparse import hstack
import numpy as np


In [2]:
import os

import mysql.connector

# Connection settings are read from environment variables so that credentials
# do not have to be committed with the notebook. Each fallback reproduces the
# value that was previously hard-coded, so an unconfigured local run behaves
# exactly as before.
conn = mysql.connector.connect(
    host=os.environ.get("DB_HOST", "localhost"),
    user=os.environ.get("DB_USER", "root"),
    password=os.environ.get("DB_PASSWORD", ""),
    database=os.environ.get("DB_NAME", "user_answer"),
    # Environment values are strings; the connector is given an integer port.
    port=int(os.environ.get("DB_PORT", "3307")),
)

In [3]:
# Open a cursor on the existing connection and select every row of the
# modelTraining table for a quick visual inspection.
cursor = conn.cursor()
cursor.execute("SELECT * FROM modelTraining")

# Pull the complete result set into memory.
results = cursor.fetchall()

# Print one row per line.
for record in results:
    print(record)


('carpenter', 'secondary school', 'part-time', 'only weekdays', 'wants to specialize in cabinetry', 'Construction Apprenticeships')
('marketing', 'master degree', 'only weekends', 'only weekdays', 'aims to harness social media platforms', 'Manage finances for new business ventures')
('hospitality', 'diploma', 'full-time', 'i already have a job', 'aspires to open a chain of cafes', 'Prepare to work safely in the construction industry')
('construction', 'high school', 'part-time', 'full-time', 'wants to master brick laying techniques', 'Builder Restricted licences')
('electrician', 'master degree', 'part-time', 'only weekends', 'keen on renewable energy systems', 'Home Electrical Installation and Safety')
('plumbing', 'diploma', 'only weekends', 'only weekdays', 'wishes to understand sewage systems better', 'Plumbing Business Management and Operations')
('painting', 'bachelor degree', 'full-time', 'part-time', 'dreams of having an art exhibition', 'Certificate IV in Building and Construc

In [4]:
# Load the whole modelTraining table into a DataFrame through the open connection.
training_query = "SELECT * FROM modelTraining"
df = pd.read_sql(training_query, conn)

  df = pd.read_sql("SELECT * FROM modelTraining", conn)


In [5]:
# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0,interests,education,study_availability,work_availability,goals,recommendation
0,carpenter,secondary school,part-time,only weekdays,wants to specialize in cabinetry,Construction Apprenticeships
1,marketing,master degree,only weekends,only weekdays,aims to harness social media platforms,Manage finances for new business ventures
2,hospitality,diploma,full-time,i already have a job,aspires to open a chain of cafes,Prepare to work safely in the construction ind...
3,construction,high school,part-time,full-time,wants to master brick laying techniques,Builder Restricted licences
4,electrician,master degree,part-time,only weekends,keen on renewable energy systems,Home Electrical Installation and Safety


In [None]:
# conn.close()

In [6]:
# Download the NLTK resources this notebook needs: the stop-word lists and the
# WordNet data used by the lemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')
# nltk.word_tokenize (used in preprocess_text) needs the Punkt tokenizer data,
# otherwise it raises LookupError on a fresh environment. Recent NLTK releases
# load it from 'punkt_tab', older ones from 'punkt', so both are requested.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ssanjua/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ssanjua/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
# Shared helpers for text normalisation: the English stop-word list held as a
# set for membership checks, and a WordNet lemmatizer instance.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [8]:
def preprocess_text(text):
    """Normalise free text into a space-separated string of lemmatised tokens.

    The text is lower-cased, every character other than ``a-z`` and whitespace
    is removed, the remainder is tokenised with ``nltk.word_tokenize``, tokens
    present in the module-level ``stop_words`` set are dropped, and each
    remaining token is passed through the module-level ``lemmatizer``.

    Args:
        text: The raw text. Any value that is not a ``str`` (for example
            ``None`` or the NaN of a missing DataFrame cell) is treated as
            empty text instead of raising.

    Returns:
        str: The processed tokens joined by single spaces, or ``''`` when
        there is no usable text.
    """
    # Non-string values have no .lower(); treat them as "no text" rather than
    # failing with AttributeError in the middle of a DataFrame apply.
    if not isinstance(text, str):
        return ''

    # Lower-case first so the pattern below only has to whitelist a-z.
    lowered = text.lower()

    # Remove digits, punctuation and every other non-letter character.
    letters_only = re.sub(r'[^a-z\s]', '', lowered)

    # Tokenise, drop stop words, then lemmatise what is left.
    tokens = nltk.word_tokenize(letters_only)
    lemmas = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    return ' '.join(lemmas)

In [9]:
# Smoke-test the preprocessing function on a short sentence.
sample_text = "I want to work outside and with people"
sample_processed = preprocess_text(sample_text)
print(sample_processed)

want work outside people


In [None]:
# Run the preprocessing function over every entry of the "goals" column.
goals_processed = list(map(preprocess_text, df['goals']))

# Fit a TF-IDF vectorizer on the processed goals and transform them in one step.
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(goals_processed)

In [None]:
# Overwrite the "goals" column with its preprocessed form (element-wise).
df['goals'] = df['goals'].map(preprocess_text)

In [None]:
# Display the TF-IDF matrix produced by vectorizer.fit_transform above.
X_tfidf

In [None]:

# Split predictors from the label. 'goals' is left out because it is
# represented by X_tfidf, and 'id' is not used as a predictor.
features = df.drop(columns=['id', 'recommendation', 'goals'])
target = df['recommendation']

# Convert the categorical variables to numeric indicator columns.
encoder = OneHotEncoder(drop='first')
X_categorical = encoder.fit_transform(features).toarray()
# Concatenate the one-hot columns with the TF-IDF columns into one dense matrix.
X = hstack([X_categorical, X_tfidf]).toarray()
y = target

# Split the data into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the decision tree. The tree permutes candidate features at each split,
# so a fixed random_state is required for the scores below to be reproducible.
clf = DecisionTreeClassifier(max_depth=10, random_state=42)
clf.fit(X_train, y_train)

# Predict on the test set.
y_pred = clf.predict(X_test)

# Evaluate model performance. zero_division=0 reports 0 for labels that have
# no predicted samples without emitting a warning for each of them.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))


In [21]:
def train_model(data, max_depth=15, test_size=0.2, random_state=42):
    """Fit the encoder, the TF-IDF vectorizer and the decision tree on ``data``.

    Args:
        data: DataFrame containing the columns ``interests``, ``education``,
            ``study_availability``, ``work_availability``, ``goals`` and
            ``recommendation``. Any other column (such as ``id``) is ignored.
        max_depth: Maximum depth passed to ``DecisionTreeClassifier``.
        test_size: Fraction of rows held out by ``train_test_split``; the
            held-out rows are not used for fitting the tree and are not
            evaluated here.
        random_state: Seed used for both the split and the tree so repeated
            calls on the same data give the same classifier.

    Returns:
        tuple: ``(encoder, vectorizer, clf)`` — the fitted ``OneHotEncoder``,
        the fitted ``TfidfVectorizer`` and the fitted
        ``DecisionTreeClassifier``.

    Raises:
        KeyError: If ``data`` lacks one of the required columns.
    """
    # Exactly the categorical columns recommend_course builds at prediction
    # time, in the same order. Selecting them explicitly keeps columns such as
    # 'id' out of the encoder, so transform() sees the layout it was fit on.
    categorical_columns = ['interests', 'education', 'study_availability', 'work_availability']

    # Free-text branch: preprocess the goals of the frame that was passed in
    # (not a global) and fit the TF-IDF vocabulary on them.
    goals_processed = [preprocess_text(text) for text in data['goals']]
    vectorizer = TfidfVectorizer()
    X_tfidf = vectorizer.fit_transform(goals_processed)

    # Categorical branch: one-hot encode the answer columns.
    encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
    X_categorical = encoder.fit_transform(data[categorical_columns])

    # One dense matrix: one-hot columns first, TF-IDF columns after, which is
    # the order recommend_course uses when it stacks a user's features.
    X = hstack([X_categorical, X_tfidf]).toarray()
    y = data['recommendation']

    # Only the training portion of the split is used below.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Seeded so that the fitted tree is reproducible.
    clf = DecisionTreeClassifier(max_depth=max_depth, random_state=random_state)
    clf.fit(X_train, y_train)

    return encoder, vectorizer, clf


In [23]:

def recommend_course(interests, education, study_availability, work_availability, goals, encoder, vectorizer, clf):
    """Predict a recommendation for one user profile.

    Args:
        interests, education, study_availability, work_availability: The
            user's categorical answers, passed to ``encoder.transform``.
        goals: Free-text goals; preprocessed and passed to
            ``vectorizer.transform``.
        encoder: Fitted encoder exposing ``transform``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        clf: Fitted classifier exposing ``predict``.

    Returns:
        The first (and only) element of ``clf.predict`` for this user.
    """
    # Free-text branch: clean the goals and project them with the vectorizer.
    goals_clean = preprocess_text(goals)
    goals_tfidf = vectorizer.transform([goals_clean])

    # Categorical branch: a single-row frame holding the four answers.
    answers = pd.DataFrame({
        'interests': [interests],
        'education': [education],
        'study_availability': [study_availability],
        'work_availability': [work_availability],
    })
    answers_onehot = encoder.transform(answers).toarray()

    # Stack the encoded answers and the TF-IDF columns side by side, then
    # return the single predicted label.
    user_combined = np.hstack([answers_onehot, goals_tfidf.toarray()])
    return clf.predict(user_combined)[0]

# Fit the encoder, vectorizer and classifier on the original DataFrame `df`.
encoder, vectorizer, clf = train_model(df)

# Example use of the recommendation function for a single user profile.
course = recommend_course(
    interests="painting",
    education="masters",
    study_availability="part-time",
    work_availability="full-time",
    goals="become more human",
    encoder=encoder,
    vectorizer=vectorizer,
    clf=clf,
)
print(course)


Certificate IV in Building and Construction




In [26]:
# Education categories present in the data used for training.
education_levels = df['education'].unique()
print(education_levels)


['secondary school' 'master degree' 'diploma' 'high school'
 'bachelor degree']


In [25]:
from joblib import dump, load

# Retrain on the current DataFrame so the three persisted objects come from the same fit.
encoder, vectorizer, clf = train_model(df)

# Write each fitted object to its own .joblib file (relative paths).
# The final dump is left as a bare expression, so its return value is the cell output.
dump(encoder, 'encoder.joblib')
dump(vectorizer, 'vectorizer.joblib')
dump(clf, 'clf.joblib')


['clf.joblib']

In [14]:
# Print the DataFrame summary: index range, columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69 entries, 0 to 68
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  69 non-null     int64 
 1   interests           69 non-null     object
 2   education           69 non-null     object
 3   study_availability  69 non-null     object
 4   work_availability   69 non-null     object
 5   goals               69 non-null     object
 6   recommendation      69 non-null     object
dtypes: int64(1), object(6)
memory usage: 3.9+ KB
