# School Budgets Case Study

### Convert Target Variables into Categories

In [None]:
# Define the lambda function: categorize_label
categorize_label = lambda x: x.astype('category')

# Convert df[LABELS] to a categorical type
df[LABELS] = df[LABELS].apply(categorize_label, axis=0)

# Print the converted dtypes
print(df[LABELS].dtypes)

### Dummy Variable Encoding

In [None]:
dummies = pd.get_dummies(samples_df[['label']], prefix_sep='_')

### Determine number of unique labels

In [None]:
# Import matplotlib.pyplot
import matplotlib.pyplot as plt

# Calculate number of unique values for each label: num_unique_labels
num_unique_labels = pd.DataFrame(df[LABELS].apply(pd.Series.nunique))

# Plot number of unique values for each label
num_unique_labels.plot(kind='bar')

# Label the axes
plt.xlabel('Labels')
plt.ylabel('Number of unique values')

# Display the plot
plt.show()

### Log Loss Function

Log loss is a measure of error, penalizing outputs that are wrong and even more so if associated with high confidence. The higher the loss, the worse the metric.

In [4]:
import numpy as np

def compute_log_loss(predicted, actual, eps=1e-14):
    """Computes the logrithmic loss between predicted and actual when these are 1D arrays.
    
    :param predicted: The predicted probabilities as floats between 0-1.
    :param actual: The actual binary label; either 0 or 1.
    :param eps (optional): log(0) is inf, thus we need to offset our predicted values slightly by eps from 0 or 1.
    """
    
    predicted = np.clip(predicted, eps, 1-eps)
    
    loss = -1 * np.mean(actual * np.log(predicted) + (1 - actual) * np.log(1 - predicted))
        
    return loss

In [10]:
correct_confident = np.array([ 0.95,  0.95,  0.95,  0.95,  0.95,  0.05,  0.05,  0.05,  0.05,  0.05])
correct_not_confident = np.array([ 0.65,  0.65,  0.65,  0.65,  0.65,  0.35,  0.35,  0.35,  0.35,  0.35])
wrong_not_confident = np.array([ 0.35,  0.35,  0.35,  0.35,  0.35,  0.65,  0.65,  0.65,  0.65,  0.65])
wrong_confident = np.array([ 0.05,  0.05,  0.05,  0.05,  0.05,  0.95,  0.95,  0.95,  0.95,  0.95])
actual_labels = np.array([ 1.,  1.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.])

In [13]:
actual_labels = compute_log_loss(actual_labels, actual_labels)
print("Log loss, actual labels: {}".format(actual_labels)) 

correct_not_confident = compute_log_loss(correct_not_confident, actual_labels)
print("Log loss, correct and not confident: {}".format(correct_not_confident)) 

correct_confident = compute_log_loss(correct_confident, actual_labels)
print("Log loss, correct and confident: {}".format(correct_confident)) 

wrong_not_confident = compute_log_loss(wrong_not_confident, actual_labels)
print("Log loss, wrong and not confident: {}".format(wrong_not_confident)) 

wrong_confident = compute_log_loss(wrong_confident, actual_labels)
print("Log loss, wrong and confident: {}".format(wrong_confident)) 

Log loss, actual labels: 9.87430638375725e-12
Log loss, correct and not confident: 0.8289517819919665
Log loss, correct and confident: 0.054092560706148325
Log loss, wrong and not confident: 32.23699089902852
Log loss, wrong and confident: 32.23699089902852
