In [1]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Load data from JSON file
with open('people.json', 'r') as file:
    data = json.load(file)

df = pd.json_normalize(data)

# Encoding categorical data to numerical
state_encoder = OneHotEncoder()
target_encoder = LabelEncoder()
encoded_states = state_encoder.fit_transform(df[['address.state']]).toarray()
df['marital_status'] = target_encoder.fit_transform(df['marital_status'])

# Create a DataFrame with encoded columns
encoded_state_df = pd.DataFrame(encoded_states, columns=state_encoder.get_feature_names_out(['address.state']))

# Concatenate the new DataFrame with the original one
df = pd.concat([df, encoded_state_df], axis=1)

# Splitting dataset into features (X) and target variable (y)
encoded_column_names = encoded_state_df.columns.tolist()
X = df[['age', 'salary'] + encoded_column_names]
y = df['marital_status']

# Training the Decision Tree Classifier
classifier = DecisionTreeClassifier()
classifier.fit(X, y)

accuracy = classifier.score(X, y)
accuracy


# df  # Display the transformed dataset

1.0

In [2]:
# Using predict_proba to get the probability of each class for each individual
probabilities = classifier.predict_proba(X)

# Adding probabilities to the original dataframe for better visualization
df['Prob. Married'] = probabilities[:, 0]  # Probability of being Married
df['Prob. Not Married'] = probabilities[:, 1]  # Probability of being Not Married

df[['name.first', 'name.last','age', 'address.state', 'salary', 'Prob. Married', 'Prob. Not Married']]  # Display relevant columns

Unnamed: 0,name.first,name.last,age,address.state,salary,Prob. Married,Prob. Not Married
0,Megan,Chang,20,VT,79146,1.0,0.0
1,Rachel,Collins,23,VI,52193,1.0,0.0
2,Jorge,Trujillo,52,IA,52761,0.0,0.0
3,Linda,West,60,IL,67249,0.0,0.0
4,Amy,Roberts,60,TN,91020,1.0,0.0
...,...,...,...,...,...,...,...
995,James,Allen,33,OH,32292,0.0,1.0
996,Amanda,Nichols,26,AR,45431,0.0,1.0
997,John,Jones,59,MP,34747,0.0,0.0
998,Courtney,Jackson,18,MN,80001,0.0,1.0


In [3]:
import joblib

# Save the trained model to a file
joblib.dump(classifier, 'model.pkl')
joblib.dump(target_encoder, 'target_encoder.pkl')
joblib.dump(state_encoder, 'state_encoder.pkl')
joblib.dump(encoded_column_names, 'encoded_column_names.pkl')

['encoded_column_names.pkl']

In [4]:
# Load the trained model and label encoder from files
loaded_classifier = joblib.load('model.pkl')
loaded_target_encoder = joblib.load('target_encoder.pkl')
loaded_state_encoder = joblib.load('state_encoder.pkl')
loaded_encoded_column_names = joblib.load('encoded_column_names.pkl')

new_data = [{"name": {"first": "Megan", "last": "Chang"}, "address": {"street": "48764 Howard Forge Apt. 421", "city": "Vanessaside", "state": "VT", "postal_code": "79393"}, "age": 20, "salary": 79146}, {"name": {"first": "Rachel", "last": "Collins"}, "address": {"street": "578 Michael Island", "city": "New Thomas", "state": "VI", "postal_code": "68835"}, "age": 23, "salary": 52193}, {"name": {"first": "Jorge", "last": "Trujillo"}, "address": {"street": "60975 Jessica Squares", "city": "East Sallybury", "state": "IA", "postal_code": "19178"}, "age": 52, "salary": 52761}]
new_df = pd.json_normalize(new_data)

# dummies = pd.get_dummies(new_df['address.state'])
encoded_states = loaded_state_encoder.fit_transform(new_df[['address.state']]).toarray()
encoded_state_df = pd.DataFrame(encoded_states, columns=loaded_state_encoder.get_feature_names_out(['address.state']))
new_df = pd.concat([new_df, encoded_state_df], axis=1)

for state in loaded_encoded_column_names:
    if state not in new_df.columns:
        new_df[state] = 0

# Using the loaded model to predict the marital status of new data
new_probabilities = loaded_classifier.predict_proba(new_df[['age', 'salary'] + encoded_column_names])
new_df['Prob. Married'] = new_probabilities[:, 0]  # Probability of being Married
new_df['Prob. Not Married'] = new_probabilities[:, 1]  # Probability of being Not Married

# Output the predicted results
new_df[['name.first', 'name.last', 'age', 'address.state', 'salary', 'Prob. Married', 'Prob. Not Married']]  # Display relevant columns

# importance = loaded_classifier.feature_importances_
# attributes = ['age', 'address.state', 'salary']

# most_impactful_attribute = attributes[importance.argmax()]
# most_impactful_attribute

Unnamed: 0,name.first,name.last,age,address.state,salary,Prob. Married,Prob. Not Married
0,Megan,Chang,20,VT,79146,1.0,0.0
1,Rachel,Collins,23,VI,52193,1.0,0.0
2,Jorge,Trujillo,52,IA,52761,0.0,0.0
