In [3]:
from sklearn.naive_bayes import CategoricalNB, GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score, confusion_matrix
from mixed_naive_bayes import MixedNB

import numpy as np
import pandas as pd

In [18]:
# read in data
df = pd.read_csv('../Data/Data Cleaning and Feature Eng/recent-grads-new-features.csv')
df.head()
df.columns

Index(['Unnamed: 0', 'Rank', 'Major_code', 'Major', 'Major_category', 'Total',
       'Sample_size', 'Men', 'Women', 'ShareWomen', 'Employed', 'Full_time',
       'Part_time', 'Full_time_year_round', 'Unemployed', 'Unemployment_rate',
       'Median', 'P25th', 'P75th', 'College_jobs', 'Non_college_jobs',
       'Low_wage_jobs', 'Median_category', 'Num_majors', 'Percent_sample_size',
       'Percent_employed', 'Percent_unemployed', 'Percent_college',
       'Percent_non_college', 'Percent_low_wage', 'Percent_full_time',
       'Percent_part_time', 'Major_category_Agriculture & Natural Resources',
       'Major_category_Arts', 'Major_category_Biology & Life Science',
       'Major_category_Business', 'Major_category_Communications & Journalism',
       'Major_category_Computers & Mathematics', 'Major_category_Education',
       'Major_category_Engineering', 'Major_category_Health',
       'Major_category_Humanities & Liberal Arts',
       'Major_category_Industrial Arts & Consumer Servic

In [111]:
# extract relevant features for supervised learning
# 60, 74, 66, 72
features = df[["Total", "Num_majors", "ShareWomen", "Percent_college", "Percent_non_college", "Percent_full_time", "Percent_sample_size", "Unemployment_rate", "Part_time"]].copy()
# 57, 48, 45
# features = df[["Num_majors", "ShareWomen", "Percent_college", "Percent_non_college", "Percent_low_wage", "Percent_sample_size", "Unemployment_rate", "Part_time"]].copy()

features = df[[
            'Num_majors',
            'Percent_full_time',
            'Percent_college',
            'Percent_non_college',
            'Percent_sample_size',
            'ShareWomen',
            'Unemployment_rate']].copy()

## 37%
# features = df[[
#             'Num_majors',
#             'Percent_full_time',
#             'Percent_college',
#             'Percent_non_college',
#             'Percent_sample_size',
#             'ShareWomen',
#             'Unemployment_rate',
#            'Major_category_Agriculture & Natural Resources',
#            'Major_category_Arts', 'Major_category_Biology & Life Science',
#            'Major_category_Business', 'Major_category_Communications & Journalism',
#            'Major_category_Computers & Mathematics', 'Major_category_Education',
#            'Major_category_Engineering', 'Major_category_Health',
#            'Major_category_Humanities & Liberal Arts',
#            'Major_category_Industrial Arts & Consumer Services',
#            'Major_category_Interdisciplinary',
#            'Major_category_Law & Public Policy',
#            'Major_category_Physical Sciences',
#            'Major_category_Psychology & Social Work',
#            'Major_category_Social Science'
#             ]].copy()

# # extract predicted class variable
# medians = df[["Median_category"]]

In [112]:
# encode predicted class variable as numerical label
le = LabelEncoder()
medians = le.fit_transform(df["Median_category"])

le2 = LabelEncoder()
features["Major_category"] = le2.fit_transform(df["Major_category"])
# le2.inverse_transform(features["Major_category"])

In [113]:
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(features, medians, test_size=0.2, random_state=42)

In [114]:
### Categorical 60%
naive_bayes = CategoricalNB()
# fit categorical Naive Bayes model on training set
naive_bayes.fit(X_train, y_train)
# predict class variable on test set
predictions = naive_bayes.predict(X_test)
# generate accuracy score
naive_bayes.score(X_test, y_test)

0.5714285714285714

In [115]:
### Gaussian 68%
VAR_SMOOTHING = 1

naive_bayes = GaussianNB(var_smoothing=VAR_SMOOTHING)
# fit gaussian Naive Bayes model on training set
naive_bayes.fit(X_train, y_train)
# predict class variable on test set
predictions = naive_bayes.predict(X_test)
# generate accuracy score
naive_bayes.score(X_test, y_test)

0.5714285714285714

In [116]:
## Mixed Naive Bayes --> 60% accuracy
## https://pypi.org/project/mixed-naive-bayes/#api-documentation
## https://github.com/remykarem/mixed-naive-bayes

# label_encoder = LabelEncoder()
# mixedNB_y_train = label_encoder.fit_transform(y_train["Median_category"])
# mixedNB_y_test = label_encoder.transform(y_test["Median_category"])

# initalize mixed Naive Bayes model with column 0 designated as categorical feature
mixedNB = MixedNB(categorical_features=[-1])
# fit mixed Naive Bayes model on training set
mixedNB.fit(X_train, y_train)
# predict class variable on test set
mixedNB_predictions = mixedNB.predict(X_test)
# generate accuracy score
mixedNB.score(X_test, y_test)
# print(confusion_matrix(mixedNB_y_test, mixedNB_predictions), ": is the confusion matrix")

[16]


0.4857142857142857

In [117]:
### param tuning, 68.7%
### https://medium.com/analytics-vidhya/how-to-improve-naive-bayes-9fa698e14cba

param_grid_nb = {
    'var_smoothing': np.logspace(0,-9, num=100)
}

nbModel_grid = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid_nb, verbose=1, cv=10, n_jobs=-1)
nbModel_grid.fit(X_train, y_train)
# print(nbModel_grid.best_estimator_)
y_pred = nbModel_grid.predict(X_test)
nbModel_grid.score(X_test, y_test)
print(confusion_matrix(y_test, y_pred), ": is the confusion matrix")


Fitting 10 folds for each of 100 candidates, totalling 1000 fits




[[ 0  2  0  0  0]
 [ 0 19  0  0  0]
 [ 0  3  0  1  0]
 [ 0  3  0  5  0]
 [ 0  0  0  2  0]] : is the confusion matrix


In [118]:
nbModel_grid.score(X_test, y_test)

0.6857142857142857