**PREREQUISITES**

In [None]:
!pip install category_encoders



In [None]:
import warnings
warnings.filterwarnings("ignore")

**SYNTHETIC DATASET GENERATION**

In [None]:
import pandas as pd
import numpy as np

# Number of synthetic student records to generate
num_samples = 100

# Local, seeded generator so that re-running the notebook regenerates the
# same dataset without touching NumPy's global random state.
rng = np.random.default_rng(42)

# Allowed values for each feature
grade_level_values = ['Grade 7', 'Grade 8', 'Grade 9', 'Grade 10']
gender_values = ['Male', 'Female']
family_structure_values = ['Single-parent', 'Two-parent', 'Extended family']
parental_education_values = ['Elementary', 'High School', 'College']
parental_occupation_values = ['Blue-collar', 'White-collar', 'Unemployed']
family_income_values = ['Low Income', 'Middle Class', 'High Income']
geographic_location_values = ['Urban', 'Suburban', 'Rural']
student_teacher_ratio_values = ['Low', 'Middle', 'High']
school_funding_values = ['Low', 'Middle', 'High']
access_to_early_childhood_edu_values = [True, False]
access_to_extracurricular_values = [True, False]
access_to_technology_values = [True, False]
student_allowance_values = ['Low', 'Middle', 'High']
dropout_likelihood_values = [True, False]

# Sampling weights, aligned position-by-position with the value lists above.
# Each list must sum to 1.
fs_w = [0.044, 0.865, 0.091]
pe_w = [0.226, 0.475, 0.299]
po_w = [0.42, 0.31, 0.27]
fi_w = [0.226, 0.508, 0.266]
str_w = [0.3, 0.5, 0.2]
sf_w = [0.3, 0.5, 0.2]
atce_w = [0.7, 0.3]
atex_w = [0.6, 0.4]
att_w = [0.6, 0.4]
sa_w = [0.3, 0.4, 0.3]

# (column name, allowed values, weights) in output column order.
# A weight of None means the values are drawn uniformly.
feature_specs = [
    ('grade_level', grade_level_values, None),
    ('gender', gender_values, None),
    ('family_structure', family_structure_values, fs_w),
    ('parental_education', parental_education_values, pe_w),
    ('parental_occupation', parental_occupation_values, po_w),
    ('family_income', family_income_values, fi_w),
    ('geographic_location', geographic_location_values, None),
    ('student_teacher_ratio', student_teacher_ratio_values, str_w),
    ('school_funding', school_funding_values, sf_w),
    ('access_to_early_childhood_edu', access_to_early_childhood_edu_values, atce_w),
    ('access_to_extracurricular', access_to_extracurricular_values, atex_w),
    ('access_to_technology', access_to_technology_values, att_w),
    # sa_w was previously defined but never applied to this column
    ('student_allowance', student_allowance_values, sa_w),
    ('dropout_likelihood', dropout_likelihood_values, None),
]

# Draw each column in one vectorised call and build the frame once, instead
# of appending num_samples rows one at a time (quadratic in pandas).
df = pd.DataFrame({
    column: rng.choice(values, size=num_samples, p=weights)
    for column, values, weights in feature_specs
})

# Persist the generated dataset next to the notebook
df.to_csv('high_school_dropout.csv', index=False)


**DECISION TREE CLASSIFICATION MODEL**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from category_encoders import JamesSteinEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, roc_auc_score, confusion_matrix
from tkinter import *
from tkinter import ttk
from tkinter import messagebox

# Load the dataset from the published spreadsheet (requires network access).
# The commented-out line below reads a local CSV copy instead.
url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vRsZKyBQajEUz94FASaP6n7m5up5cx7lYppD4w9H0h74grgRCUCx8vxThffdQZaOOVOJkvXCj3T8T4n/pub?output=csv"
df = pd.read_csv(url)
#df = pd.read_csv(r'/content/high_school_dropout.csv')

# Separate the features from the target column
X = df.drop('dropout_likelihood', axis=1)
y = df['dropout_likelihood']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Label encode the target. The encoder is fitted on the training split only
# and then reused for the test split, so predictions and ground truth are
# compared in the same integer label space.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Categorical encoding using James-Stein encoding
encoder = JamesSteinEncoder(cols=["grade_level", "gender", "family_structure", "parental_education", "parental_occupation",
                                  "family_income", "geographic_location", "student_teacher_ratio", "school_funding",
                                  "access_to_early_childhood_edu", "access_to_extracurricular", "access_to_technology",
                                  "student_allowance"])

# James-Stein encoding is a target encoding, so it is fitted on the numeric
# (label-encoded) target. The Series carries X_train's index so rows stay
# aligned. The test split is only transformed, never fitted.
y_train_target = pd.Series(y_train_encoded, index=X_train.index, name=y_train.name)
X_train_encoded = encoder.fit_transform(X_train, y_train_target)
X_test_encoded = encoder.transform(X_test)

# Train the Decision Tree classifier with hyperparameter tuning.
# NOTE(review): every list below holds a single value, so the search space has
# exactly one candidate and n_iter=10 is effectively capped at 1. Add more
# values to the lists to make the search meaningful.
param_dist = {'max_depth': [25],
              'min_samples_split': [9],
              'min_samples_leaf': [6],
              'max_features': [None],
              'splitter' : ['best'],
              'criterion': ['gini']}
# Same fixed seed as the train/test split, so results are reproducible
classifier = DecisionTreeClassifier(random_state=42)
random_search = RandomizedSearchCV(classifier, param_dist, n_iter=10, scoring='accuracy', cv=5,
                                   random_state=42)
random_search.fit(X_train_encoded, y_train_encoded)
best_classifier = random_search.best_estimator_

# Hard class predictions (for accuracy / precision / confusion matrix) and
# positive-class probabilities (for ROC AUC, which needs a ranking score).
# Column 1 of predict_proba is the second class in label_encoder.classes_;
# this assumes a binary target.
y_pred = best_classifier.predict(X_test_encoded)
y_score = best_classifier.predict_proba(X_test_encoded)[:, 1]

# Evaluate the model's performance against the label-encoded test target
accuracy = accuracy_score(y_test_encoded, y_pred)
precision = precision_score(y_test_encoded, y_pred)
roc_auc = roc_auc_score(y_test_encoded, y_score)
confusion = confusion_matrix(y_test_encoded, y_pred)

# Display the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("ROC AUC:", roc_auc)
print("Confusion Matrix:")
print(confusion)

# Print the best hyperparameters
print("Best Hyperparameters:", random_search.best_params_)


**GRAPHICAL USER INTERFACE** **(requires a graphical display; on Linux/X11 the `DISPLAY` environment variable must be set)**

In [None]:
# Main application window
window = Tk()
window.geometry("340x460")
window.title("High School Dropout Predictor")

# Define attribute values for comboboxes
grade_level_values = ['Grade 7', 'Grade 8', 'Grade 9', 'Grade 10']
gender_values = ['Male', 'Female']
family_structure_values = ['Single-parent', 'Two-parent', 'Extended family']
parental_education_values = ['Elementary', 'High School', 'College']
parental_occupation_values = ['Blue-collar', 'White-collar', 'Unemployed']
family_income_values = ['Low Income', 'Middle Class', 'High Income']
geographic_location_values = ['Urban', 'Suburban', 'Rural']
student_teacher_ratio_values = ['Low', 'Middle', 'High']
school_funding_values = ['Low', 'Middle', 'High']
access_to_early_childhood_edu_values = [True, False]
access_to_extracurricular_values = [True, False]
access_to_technology_values = [True, False]
student_allowance_values = ['Low', 'Middle', 'High']
dropout_likelihood_values = [True, False]

# Explicit lookup from (lower-cased) column name to its allowed values.
# This replaces eval() on column names: the columns come from a remotely
# loaded CSV, so evaluating them as Python expressions is unsafe.
attribute_values = {
    'grade_level': grade_level_values,
    'gender': gender_values,
    'family_structure': family_structure_values,
    'parental_education': parental_education_values,
    'parental_occupation': parental_occupation_values,
    'family_income': family_income_values,
    'geographic_location': geographic_location_values,
    'student_teacher_ratio': student_teacher_ratio_values,
    'school_funding': school_funding_values,
    'access_to_early_childhood_edu': access_to_early_childhood_edu_values,
    'access_to_extracurricular': access_to_extracurricular_values,
    'access_to_technology': access_to_technology_values,
    'student_allowance': student_allowance_values,
    'dropout_likelihood': dropout_likelihood_values,
}

# Create labels and comboboxes for each attribute
attribute_labels = []
attribute_comboboxes = []

# Every column except the last one is treated as an input attribute
# (assumes the target column is last -- TODO confirm against the CSV layout).
for i, attribute in enumerate(df.columns[:-1]):
    label = Label(window, text=attribute)
    label.grid(row=i, column=0, padx=(5, 5), pady=(5, 5))
    attribute_labels.append(label)

    # A column without an entry in attribute_values raises KeyError here
    values = attribute_values[attribute.lower()]
    combobox = ttk.Combobox(window, values=values, state="readonly")
    combobox.grid(row=i, column=1, padx=(5, 5), pady=(5, 5))
    attribute_comboboxes.append(combobox)

def predict_dropout_likelihood():
    """Predict dropout likelihood from the combobox selections and show it.

    Reads the current value of every combobox in ``attribute_comboboxes``,
    encodes the row with the fitted ``encoder``, predicts with
    ``best_classifier`` and displays the decoded label in a message box.
    If any attribute has no selection, a warning is shown and no prediction
    is made.
    """
    feature_columns = df.columns[:-1]

    # Get the selected attribute values (comboboxes always return strings)
    selected_values = [combobox.get() for combobox in attribute_comboboxes]

    # A combobox with nothing selected returns '', which the encoder would
    # treat as an unseen category; ask the user to complete the form instead.
    missing = [column for column, value in zip(feature_columns, selected_values) if value == ""]
    if missing:
        messagebox.showwarning("Missing input", "Please select a value for: " + ", ".join(missing))
        return

    # Create a DataFrame with the selected values
    selected_data = pd.DataFrame([selected_values], columns=feature_columns)

    # Restore real booleans for columns that are boolean in the training data.
    # Otherwise the strings 'True'/'False' would not match the categories the
    # encoder was fitted on.
    for column in feature_columns:
        if pd.api.types.is_bool_dtype(df[column]):
            selected_data[column] = selected_data[column].map({"True": True, "False": False}).astype(bool)

    # James-Stein encode the selected attributes
    selected_data_encoded = encoder.transform(selected_data)

    # Make a prediction
    prediction = best_classifier.predict(selected_data_encoded)

    # Decode the prediction back to the original target label
    prediction_text = label_encoder.inverse_transform(prediction)[0]

    # Show the prediction in a message box
    messagebox.showinfo("Prediction", f"The likelihood of the student dropping out is: {prediction_text}.")

# Place the "Predict" button on the grid row directly below the last
# attribute row, spanning both the label and the combobox columns.
button_row = len(df.columns[:-1])
predict_button = Button(window, command=predict_dropout_likelihood, text="Predict")
predict_button.grid(column=0, row=button_row, columnspan=2, padx=(5, 5), pady=(5, 5))

# Hand control to the tkinter event loop
window.mainloop()
