<a href="https://colab.research.google.com/github/reagenhuskey/cs290/blob/main/notebooks/partnerProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# Load the MBA dataset from the CSV hosted on GitHub (requires network access).
import pandas as pd
url = "https://github.com/michaeledge27/CSCI290/raw/refs/heads/main/data/MBA.csv"
MBAdf = pd.read_csv(url)

In [8]:
# Keep only the columns used below; 'gmat' is the column predicted later on.
colsInUse = ['gpa', 'major', 'gmat']
mbadf = MBAdf[colsInUse]
mbadf.head()

Unnamed: 0,gpa,major,gmat
0,3.3,Business,620.0
1,3.28,Humanities,680.0
2,3.3,Business,710.0
3,3.47,STEM,690.0
4,3.35,STEM,590.0


In [9]:
# Predictor columns: the columns in use minus the 'gmat' target.
colsInUseNotTarget = ['gpa', 'major']

# Attribute Selection Method

In [10]:
import numpy as np
import math


In [11]:
# Calculate the entropy for a categorical feature
def categorical_entropy(df, target, feature):
    """Weighted entropy of `target` after splitting `df` on each value of `feature`.

    Every distinct value of `feature` defines one subset. The result is the sum
    over subsets of (share of rows in the subset) * (entropy of `target` inside
    the subset). Lower is better; 0.0 means every subset is pure.

    Args:
        df: DataFrame holding both columns.
        target: Name of the class-label column.
        feature: Name of the categorical column to split on.

    Returns:
        The weighted entropy of the split, in bits.

    Raises:
        ValueError: If `df` has no rows.
    """
    overall = len(df)  # Total number of rows
    if overall == 0:
        raise ValueError("cannot compute entropy of an empty DataFrame")

    total_entropy = 0.0
    for val in df[feature].unique():
        subset_targets = df.loc[df[feature] == val, target]
        weight = len(subset_targets) / overall  # Share of rows in this subset
        props = subset_targets.value_counts(normalize=True)
        # Zero proportions are skipped: log2(0) is undefined and such classes
        # contribute nothing to the entropy.
        subset_entropy = -sum(p * math.log2(p) for p in props if p > 0)
        # Sum (rather than take the minimum of) the weighted terms so the score
        # describes the whole split, like the two-way quantitative scores do.
        total_entropy += weight * subset_entropy
    return total_entropy

In [12]:
# Calculate the Gini index for a categorical feature
def categorical_gini(df, target, feature):
    """Weighted Gini index of `target` after splitting `df` on each value of `feature`.

    Every distinct value of `feature` defines one subset. The result is the sum
    over subsets of (share of rows in the subset) * (Gini impurity of `target`
    inside the subset). Lower is better; 0.0 means every subset is pure.

    Args:
        df: DataFrame holding both columns.
        target: Name of the class-label column.
        feature: Name of the categorical column to split on.

    Returns:
        The weighted Gini index of the split.

    Raises:
        ValueError: If `df` has no rows.
    """
    overall = len(df)  # Total number of rows
    if overall == 0:
        raise ValueError("cannot compute the Gini index of an empty DataFrame")

    total_gini = 0.0
    for val in df[feature].unique():
        subset_targets = df.loc[df[feature] == val, target]
        weight = len(subset_targets) / overall  # Share of rows in this subset
        props = subset_targets.value_counts(normalize=True)
        gini = 1 - np.sum(np.square(props))  # Impurity inside this subset
        # Sum (rather than take the minimum of) the weighted terms so the score
        # describes the whole split, like the two-way quantitative scores do.
        total_gini += weight * gini
    return total_gini

In [13]:
# Calculate the entropy for a quantitative feature
def quantitative_entropy(df, target, feature):
    """Find the `feature <= threshold` split with the lowest weighted entropy.

    Every distinct non-missing value of `feature` is tried as a threshold. For
    each one the rows are divided into `<= threshold` and `> threshold`, and the
    entropies of `target` on both sides are combined, weighted by side size.

    Args:
        df: DataFrame holding both columns.
        target: Name of the class-label column.
        feature: Name of the orderable column to split on.

    Returns:
        A `(min_entropy, best_split)` tuple. On ties the smallest threshold wins.

    Raises:
        ValueError: If `feature` has no non-missing values.
    """
    # Rows with a missing feature value fall on neither side of a threshold, and
    # a NaN threshold would yield two empty sides with a spurious score of 0,
    # so they are excluded up front.
    known = df[df[feature].notna()]
    overall = len(known)  # Number of rows that can actually be split
    if overall == 0:
        raise ValueError(f"feature {feature!r} has no non-missing values to split on")

    entropyDict = {}  # split point -> weighted entropy of that split
    for val in np.sort(known[feature].unique()):
        goes_left = known[feature] <= val
        sides = (known.loc[goes_left, target], known.loc[~goes_left, target])

        entropy = 0
        for side in sides:
            weight = len(side) / overall
            for prop in side.value_counts(normalize=True):
                if prop > 0:  # log2(0) is undefined
                    entropy -= weight * prop * math.log2(prop)

        entropyDict[val] = entropy

    best_split = min(entropyDict, key=entropyDict.get)  # Threshold with the lowest entropy
    min_entropy = entropyDict[best_split]

    return min_entropy, best_split


In [14]:
# Calculate the Gini index for a quantitative feature
def quantitative_gini(df, target, feature):
    """Find the `feature <= threshold` split with the lowest weighted Gini index.

    Every distinct non-missing value of `feature` is tried as a threshold. For
    each one the rows are divided into `<= threshold` and `> threshold`, and the
    Gini impurities of `target` on both sides are combined, weighted by side size.

    Args:
        df: DataFrame holding both columns.
        target: Name of the class-label column.
        feature: Name of the orderable column to split on.

    Returns:
        A `(min_gini, best_split)` tuple. On ties the smallest threshold wins.

    Raises:
        ValueError: If `feature` has no non-missing values.
    """
    # Rows with a missing feature value fall on neither side of a threshold, and
    # a NaN threshold would yield two empty sides with a spurious score of 0,
    # so they are excluded up front.
    known = df[df[feature].notna()]
    overall = len(known)  # Number of rows that can actually be split
    if overall == 0:
        raise ValueError(f"feature {feature!r} has no non-missing values to split on")

    giniDict = {}  # split point -> weighted Gini index of that split
    for val in np.sort(known[feature].unique()):
        goes_left = known[feature] <= val
        left = known.loc[goes_left, target]
        right = known.loc[~goes_left, target]

        # Impurity of each side, from the class proportions on that side
        gini_left = 1 - np.sum(np.square(left.value_counts(normalize=True)))
        gini_right = 1 - np.sum(np.square(right.value_counts(normalize=True)))

        weight_left = len(left) / overall
        weight_right = len(right) / overall
        giniDict[val] = weight_left * gini_left + weight_right * gini_right

    best_split = min(giniDict, key=giniDict.get)  # Threshold with the lowest Gini index
    min_gini = giniDict[best_split]

    return min_gini, best_split

In [15]:
# Helper function to check if a feature is categorical
def isCategorical(df, feature, max_unique=8):
    """Heuristically decide whether `feature` should be treated as categorical.

    Args:
        df: DataFrame holding the column.
        feature: Name of the column to inspect.
        max_unique: A column with fewer distinct non-missing values than this
            counts as categorical. Defaults to 8.

    Returns:
        True if the column has fewer than `max_unique` distinct values.
    """
    # NOTE(review): the test looks only at cardinality, so a text column with
    # `max_unique` or more labels is reported as quantitative.
    return df[feature].nunique() < max_unique


In [16]:
# Main attribute selection method function
def attribute_selection_method(df, target, measure):
    """Choose the feature whose split gives the lowest impurity of `target`.

    Each non-target column is scored with the categorical or the quantitative
    variant of the requested measure, depending on `isCategorical`.

    Args:
        df: DataFrame holding the features and the target column.
        target: Name of the class-label column.
        measure: Either 'entropy' or 'gini'.

    Returns:
        The feature name alone when a categorical feature wins, or a
        `(feature, split_point)` tuple when a quantitative feature wins.
        A categorical feature wins ties.

    Raises:
        ValueError: If `measure` is not recognised or `df` has no feature columns.
    """
    if measure not in ('entropy', 'gini'):
        raise ValueError(f"measure must be 'entropy' or 'gini', got {measure!r}")

    categoricalDict = {}   # feature -> score
    quantitativeDict = {}  # feature -> (score, split point)

    for feature in df.columns.drop(target):
        if isCategorical(df, feature):
            if measure == 'entropy':
                categoricalDict[feature] = categorical_entropy(df, target, feature)
            else:
                categoricalDict[feature] = categorical_gini(df, target, feature)
        else:
            # Do not name the score `min`: assigning to it would shadow the
            # built-in for the whole function and break the min() calls below.
            if measure == 'entropy':
                score, best_split = quantitative_entropy(df, target, feature)
            else:
                score, best_split = quantitative_gini(df, target, feature)
            quantitativeDict[feature] = (score, best_split)

    if not categoricalDict and not quantitativeDict:
        raise ValueError("df has no feature columns besides the target")

    # Only one kind of feature present: no cross-kind comparison is needed.
    if not quantitativeDict:
        return min(categoricalDict, key=categoricalDict.get)

    best_quantitative = min(quantitativeDict, key=lambda name: quantitativeDict[name][0])
    best_quantitative_score, best_quantitative_split = quantitativeDict[best_quantitative]

    if categoricalDict:
        best_categorical = min(categoricalDict, key=categoricalDict.get)
        if categoricalDict[best_categorical] <= best_quantitative_score:
            return best_categorical

    return best_quantitative, best_quantitative_split

# Mean Squared Error

In [17]:
# Mean Squared Error for regression tasks
def regression_mse(df, target, feature):
    """Find the `feature <= threshold` split minimising the weighted MSE of `target`.

    Every distinct non-missing value of `feature` is tried as a threshold. For
    each one the rows are divided into `<= threshold` and `> threshold`, and the
    mean squared deviation of `target` from its side mean is combined across
    both sides, weighted by side size.

    Args:
        df: DataFrame holding both columns.
        target: Name of the numeric column being predicted.
        feature: Name of the orderable column to split on.

    Returns:
        A `(min_mse, best_split)` tuple. On ties the smallest threshold wins.

    Raises:
        ValueError: If `feature` has no non-missing values.
    """
    # Rows with a missing feature value fall on neither side of a threshold, and
    # a NaN threshold would yield two empty sides with a spurious MSE of 0,
    # so they are excluded up front.
    known = df[df[feature].notna()]
    overall = len(known)  # Number of rows that can actually be split
    if overall == 0:
        raise ValueError(f"feature {feature!r} has no non-missing values to split on")

    mseList = []
    splitPoints = []

    for val in np.sort(known[feature].unique()):
        goes_left = known[feature] <= val
        left = known.loc[goes_left, target]
        right = known.loc[~goes_left, target]

        # An empty side (the right side at the largest value) contributes nothing.
        mse_left = np.mean((left - left.mean()) ** 2) if len(left) > 0 else 0
        mse_right = np.mean((right - right.mean()) ** 2) if len(right) > 0 else 0

        # Weighted average of the two per-side MSEs
        weight_left = len(left) / overall
        weight_right = len(right) / overall
        mseList.append(weight_left * mse_left + weight_right * mse_right)
        splitPoints.append(val)

    min_mse = np.min(mseList)
    best_split = splitPoints[np.argmin(mseList)]
    return min_mse, best_split

In [18]:
# Best gpa threshold for predicting gmat; displays (weighted MSE, split value).
regression_mse(mbadf, 'gmat', 'gpa')

(1882.2081312947516, 3.25)

In [19]:
# NOTE(review): 'major' holds text labels, so the `<=` threshold inside
# regression_mse orders them alphabetically — confirm this is the intended split.
regression_mse(mbadf, 'gmat', 'major')

(2429.3693132952044, 'Business')

# Decision Tree Regression with scikit-learn

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
# The column being predicted.
# NOTE(review): `target` is not referenced again in the visible cells.
target = mbadf['gmat']

In [22]:
# Hold out 20% of the rows for testing. The seed is fixed so that the split, and
# every score computed from it, is the same on each Restart & Run All.
train_set, test_set = train_test_split(mbadf, test_size=0.2, random_state=42)

In [23]:
# Separate predictors (X) from the 'gmat' target (y) for each partition.
X_train, y_train = train_set[colsInUseNotTarget], train_set['gmat']
X_test, y_test = test_set[colsInUseNotTarget], test_set['gmat']

In [24]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.tree import plot_tree

In [25]:
# Group predictor columns by dtype: float64 columns are numeric, object columns are text.
# NOTE(review): num_attributes is not used by any visible cell.
num_attributes = X_train.select_dtypes(include = ['float64']).columns
cat_attributes = X_train.select_dtypes(include = ['object']).columns

In [26]:
# One-hot encode the text columns; categories unseen at fit time encode as all zeros.
# NOTE(review): only cat_attributes are listed, so the numeric columns presumably
# never reach the model (ColumnTransformer's documented default is remainder='drop') — confirm.
trf = [
       ('cat', OneHotEncoder( handle_unknown='ignore'), cat_attributes) ]
col_transform = ColumnTransformer( transformers = trf )

In [27]:
# Preprocessing followed by a depth-limited regression tree, bundled as one estimator.
pipeline = Pipeline( steps = [('pre', col_transform),
 ('clf', DecisionTreeRegressor(max_depth=3))])

In [30]:
# 5-fold cross-validation on the training data. The scorer reports the negated MSE,
# which is why the displayed values are negative.
cv_scores = cross_val_score( pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
cv_scores

array([-2418.30541598, -2365.97587222, -2531.41725572, -2348.69821461,
       -2452.65372731])

In [29]:
# Flip the sign to get the ordinary (positive) MSE for each fold.
mse_scores = -cv_scores
mse_scores

array([2418.30541598, 2365.97587222, 2531.41725572, 2348.69821461,
       2452.65372731])

In [31]:
# Stand-alone tree with the same depth limit as the one inside `pipeline`.
model = DecisionTreeRegressor(max_depth=3)

In [38]:
# Learn the category vocabulary from the training data only, then reuse that same
# fitted encoder on the test data. Fitting on the test set would leak test
# information and could give the two matrices different column layouts.
encoded_train = col_transform.fit_transform(X_train['major'].to_frame())
encoded_test = col_transform.transform(X_test['major'].to_frame())

In [39]:
# Fit the tree on the one-hot encoded 'major' column against the gmat scores of the training rows.
fittedModel = model.fit(encoded_train, train_set['gmat'])

In [42]:
# Predict gmat for the held-out rows from their encoded 'major' value.
predictions = fittedModel.predict(encoded_test)
print(predictions)

[650.2676733  651.54696133 650.199071   ... 651.54696133 650.199071
 650.199071  ]
