In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
import ast

In [4]:
# Train a decision tree that maps a respondent's encoded quiz answers to a
# skill level, then classify one new set of answers and report the result.

# Fixed seed: scikit-learn permutes features at every split, so ties between
# equally good splits can otherwise be resolved differently on each run,
# changing the printed probabilities and feature importances.
RANDOM_SEED = 42

# Load the dataset from the CSV file
df = pd.read_csv('dataset.csv')

# Convert string representations of lists to actual lists.
# ast.literal_eval only parses Python literals, so it does not execute code.
df['Answers'] = df['Answers'].apply(ast.literal_eval)

# Fit the encoder on the entire set of answer labels (one row per answer)
encoder = LabelEncoder()
encoder.fit(df['Answers'].explode())

# Extract features (answers) and target variable (skill level)
X = df['Answers'].tolist()
y = df['Skill Level']

# Every respondent must have answered the same number of questions; otherwise
# the rows cannot be stacked into a rectangular feature matrix.
answer_lengths = {len(ans) for ans in X}
if len(answer_lengths) != 1:
    raise ValueError(
        f"Expected every 'Answers' row to have the same length, "
        f"got lengths {sorted(answer_lengths)}"
    )

# Label encoding for answers (transform only, no fit)
X_encoded = np.array([encoder.transform(ans) for ans in X])

# Create the Decision Tree model (seeded so a re-run reproduces the output)
model = DecisionTreeClassifier(random_state=RANDOM_SEED)
model.fit(X_encoded, y)

# Sample of new input to predict
new_answers = np.array(['b', 'b', 'b', 'a', 'b', 'b', 'b', 'b', 'b', 'b'])

# Encode the new answers with the already-fitted encoder and shape them as a
# single-row 2-D matrix (one sample, one column per question).
# encoder.transform raises ValueError for a label it was not fitted on.
new_encoded_answers = encoder.transform(new_answers).reshape(1, -1)

# Make predictions with probabilities (row 0 is the only sample)
predicted_probabilities = model.predict_proba(new_encoded_answers)[0]

# Find the predicted class with the highest probability; the probability
# columns are ordered like model.classes_.
predicted_skill_level = model.classes_[np.argmax(predicted_probabilities)]

print(f"Predicted Skill Level: {predicted_skill_level}")
print(f"Predicted Probabilities: {predicted_probabilities}")
# Print feature importances
print("Feature Importances:", model.feature_importances_)


Predicted Skill Level: intermediate
Predicted Probabilities: [0. 0. 1.]
Feature Importances: [0.02045455 0.04669421 0.09090909 0.         0.02272727 0.74586777
 0.02789256 0.02272727 0.         0.02272727]
