In [1]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

# Load data from JSON file
with open('people.json', 'r') as file:
    data = json.load(file)

df = pd.json_normalize(data)

# Encoding categorical data to numerical
label_encoder = LabelEncoder()
df['address.state'] = label_encoder.fit_transform(df['address.state'])
df['marital_status'] = label_encoder.fit_transform(df['marital_status'])

# Splitting dataset into features (X) and target variable (y)
X = df[['age', 'address.state', 'salary']]
y = df['marital_status']

# Training the Decision Tree Classifier
classifier = DecisionTreeClassifier()
classifier.fit(X, y)

df  # Display the transformed dataset

Unnamed: 0,age,marital_status,salary,name.first,name.last,address.street,address.city,address.state,address.postal_code
0,20,0,79146,Megan,Chang,48764 Howard Forge Apt. 421,Vanessaside,54,79393
1,23,0,52193,Rachel,Collins,578 Michael Island,New Thomas,53,68835
2,52,2,52761,Jorge,Trujillo,60975 Jessica Squares,East Sallybury,15,19178
3,60,2,67249,Linda,West,8714 Mann Plaza,Lisaside,17,79561
4,60,0,91020,Amy,Roberts,659 Kelly Field,North Chloe,49,34587
...,...,...,...,...,...,...,...,...,...
995,33,1,32292,James,Allen,2078 Catherine Stream,Martinezborough,40,61009
996,26,1,45431,Amanda,Nichols,532 Anna Cliffs Suite 575,Jacquelineville,2,16557
997,59,2,34747,John,Jones,975 Bowen Streets Apt. 165,New Jeffrey,29,27718
998,18,1,80001,Courtney,Jackson,093 Thompson Drive,Harrisbury,27,06626


In [2]:
# Using predict_proba to get the probability of each class for each individual
probabilities = classifier.predict_proba(X)

# Adding probabilities to the original dataframe for better visualization
df['Prob. Married'] = probabilities[:, 0]  # Probability of being Married
df['Prob. Not Married'] = probabilities[:, 1]  # Probability of being Not Married

df[['name.first', 'name.last','age', 'address.state', 'salary', 'Prob. Married', 'Prob. Not Married']]  # Display relevant columns

Unnamed: 0,name.first,name.last,age,address.state,salary,Prob. Married,Prob. Not Married
0,Megan,Chang,20,54,79146,1.0,0.0
1,Rachel,Collins,23,53,52193,1.0,0.0
2,Jorge,Trujillo,52,15,52761,0.0,0.0
3,Linda,West,60,17,67249,0.0,0.0
4,Amy,Roberts,60,49,91020,1.0,0.0
...,...,...,...,...,...,...,...
995,James,Allen,33,40,32292,0.0,1.0
996,Amanda,Nichols,26,2,45431,0.0,1.0
997,John,Jones,59,29,34747,0.0,0.0
998,Courtney,Jackson,18,27,80001,0.0,1.0


In [3]:
import joblib

# Save the trained model to a file
joblib.dump(classifier, 'model.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')

['label_encoder.pkl']

In [9]:
# Load the trained model and label encoder from files
loaded_classifier = joblib.load('model.pkl')
loaded_label_encoder = joblib.load('label_encoder.pkl')

new_data = [{"name": {"first": "Megan", "last": "Chang"}, "address": {"street": "48764 Howard Forge Apt. 421", "city": "Vanessaside", "state": "VT", "postal_code": "79393"}, "age": 20, "salary": 79146}, {"name": {"first": "Rachel", "last": "Collins"}, "address": {"street": "578 Michael Island", "city": "New Thomas", "state": "VI", "postal_code": "68835"}, "age": 23, "salary": 52193}, {"name": {"first": "Jorge", "last": "Trujillo"}, "address": {"street": "60975 Jessica Squares", "city": "East Sallybury", "state": "IA", "postal_code": "19178"}, "age": 52, "salary": 52761}]
new_df = pd.json_normalize(new_data)

# Encoding categorical data to numerical
new_df['address.state'] = loaded_label_encoder.fit_transform(new_df['address.state'])

# Using the loaded model to predict the marital status of new data
new_probabilities = loaded_classifier.predict_proba(new_df[['age', 'address.state', 'salary']])
new_df['Prob. Married'] = new_probabilities[:, 0]  # Probability of being Married
new_df['Prob. Not Married'] = new_probabilities[:, 1]  # Probability of being Not Married

# Output the predicted results
# new_df[['name.first', 'name.last', 'age', 'address.state', 'salary', 'Prob. Married', 'Prob. Not Married']]  # Display relevant columns

# Translate encoded states back into text
# new_df['address.state'] = loaded_label_encoder.inverse_transform(new_df['address.state'])
# new_df[['name.first', 'name.last', 'age', 'address.state', 'salary', 'Prob. Married', 'Prob. Not Married']]  # Display relevant columns


# importance = loaded_classifier.feature_importances_
# attributes = ['age', 'address.state', 'salary']

# most_impactful_attribute = attributes[importance.argmax()]
# most_impactful_attribute




'salary'