In [61]:
from sklearn.naive_bayes import CategoricalNB, GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score, confusion_matrix
from mixed_naive_bayes import MixedNB

import numpy as np
import pandas as pd

In [62]:
# read in data
df = pd.read_csv('../Data/Data Cleaning and Feature Eng/recent-grads-new-features.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Rank,Major_code,Major,Major_category,Total,Sample_size,Men,Women,ShareWomen,...,Major_category_Education,Major_category_Engineering,Major_category_Health,Major_category_Humanities & Liberal Arts,Major_category_Industrial Arts & Consumer Services,Major_category_Interdisciplinary,Major_category_Law & Public Policy,Major_category_Physical Sciences,Major_category_Psychology & Social Work,Major_category_Social Science
0,0,1,2419,PETROLEUM ENGINEERING,Engineering,2339,36,2057,282,0.120564,...,0,1,0,0,0,0,0,0,0,0
1,1,2,2416,MINING AND MINERAL ENGINEERING,Engineering,756,7,679,77,0.101852,...,0,1,0,0,0,0,0,0,0,0
2,2,3,2415,METALLURGICAL ENGINEERING,Engineering,856,3,725,131,0.153037,...,0,1,0,0,0,0,0,0,0,0
3,3,4,2417,NAVAL ARCHITECTURE AND MARINE ENGINEERING,Engineering,1258,16,1123,135,0.107313,...,0,1,0,0,0,0,0,0,0,0
4,4,5,2405,CHEMICAL ENGINEERING,Engineering,32260,289,21239,11021,0.341631,...,0,1,0,0,0,0,0,0,0,0


In [78]:
# extract relevant features for supervised learning
# 60, 74, 66, 72
features = df[["Total", "Num_majors", "ShareWomen", "Percent_college", "Percent_non_college", "Percent_low_wage", "Percent_sample_size", "Unemployment_rate", "Part_time"]].copy()
# 57, 48, 45
# features = df[["Num_majors", "ShareWomen", "Percent_college", "Percent_non_college", "Percent_low_wage", "Percent_sample_size", "Unemployment_rate", "Part_time"]].copy()


# # extract predicted class variable
# medians = df[["Median_category"]]

In [79]:
# encode predicted class variable as numerical label
le = LabelEncoder()
medians = le.fit_transform(df["Median_category"])

le2 = LabelEncoder()
features["Major_category"] = le2.fit_transform(df["Major_category"])
# le2.inverse_transform(features["Major_category"])

In [80]:
features

Unnamed: 0,Total,Num_majors,ShareWomen,Percent_college,Percent_non_college,Percent_low_wage,Percent_sample_size,Unemployment_rate,Part_time,Major_category
0,2339,29.0,0.120564,0.655836,0.155622,0.082514,0.015391,0.018381,270,7
1,756,29.0,0.101852,0.462963,0.339947,0.066138,0.009259,0.117241,170,7
2,856,29.0,0.153037,0.532710,0.205607,0.000000,0.003505,0.024096,133,7
3,1258,29.0,0.107313,0.420509,0.081081,0.000000,0.012719,0.050125,150,7
4,32260,29.0,0.341631,0.567700,0.137632,0.030130,0.008958,0.061098,5180,7
...,...,...,...,...,...,...,...,...,...,...
167,8409,14.0,0.637293,0.329528,0.350458,0.088358,0.005589,0.046320,2190,2
168,2854,9.0,0.817099,0.521374,0.215487,0.028732,0.002453,0.065112,572,14
169,2838,9.0,0.799859,0.347428,0.306554,0.219168,0.004581,0.149048,648,14
170,4626,9.0,0.798746,0.519455,0.269131,0.066580,0.004540,0.053621,965,14


In [81]:
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(features, medians, test_size=0.2, random_state=42)

In [82]:
### Categorical 60%
naive_bayes = CategoricalNB()
# fit categorical Naive Bayes model on training set
naive_bayes.fit(X_train, y_train)
# predict class variable on test set
predictions = naive_bayes.predict(X_test)
# generate accuracy score
naive_bayes.score(X_test, y_test)

0.5714285714285714

In [83]:
### Gaussian 68%
VAR_SMOOTHING = 1

naive_bayes = GaussianNB(var_smoothing=VAR_SMOOTHING)
# fit gaussian Naive Bayes model on training set
naive_bayes.fit(X_train, y_train)
# predict class variable on test set
predictions = naive_bayes.predict(X_test)
# generate accuracy score
naive_bayes.score(X_test, y_test)

0.5428571428571428

In [84]:
## Mixed Naive Bayes --> 60% accuracy
## https://pypi.org/project/mixed-naive-bayes/#api-documentation
## https://github.com/remykarem/mixed-naive-bayes

# label_encoder = LabelEncoder()
# mixedNB_y_train = label_encoder.fit_transform(y_train["Median_category"])
# mixedNB_y_test = label_encoder.transform(y_test["Median_category"])

# initalize mixed Naive Bayes model with column 0 designated as categorical feature
mixedNB = MixedNB(categorical_features=[-1])
# fit mixed Naive Bayes model on training set
mixedNB.fit(X_train, y_train)
# predict class variable on test set
mixedNB_predictions = mixedNB.predict(X_test)
# generate accuracy score
mixedNB.score(X_test, y_test)
# print(confusion_matrix(mixedNB_y_test, mixedNB_predictions), ": is the confusion matrix")

0.37142857142857144

In [15]:
mixedNB_predictions_inversed

array(['40-50K', '30-40K', '20-30K', '60-70K', '20-30K', '20-30K',
       '40-50K', '20-30K', '30-40K', '50-60K', '30-40K', '40-50K',
       '40-50K', '40-50K', '30-40K', '60-70K', '30-40K', '50-60K',
       '30-40K', '30-40K', '50-60K', '60-70K', '60-70K', '30-40K',
       '20-30K', '60-70K', '50-60K', '30-40K', '50-60K', '50-60K',
       '30-40K', '60-70K', '50-60K', '30-40K', '30-40K'], dtype=object)

In [85]:
### param tuning, 68.7%
### https://medium.com/analytics-vidhya/how-to-improve-naive-bayes-9fa698e14cba

param_grid_nb = {
    'var_smoothing': np.logspace(0,-9, num=100)
}

nbModel_grid = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid_nb, verbose=1, cv=10, n_jobs=-1)
nbModel_grid.fit(X_train, y_train)
# print(nbModel_grid.best_estimator_)
y_pred = nbModel_grid.predict(X_test)
nbModel_grid.score(X_test, y_test)
print(confusion_matrix(y_test, y_pred), ": is the confusion matrix")


Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 64 concurrent workers.
[Parallel(n_jobs=-1)]: Done  72 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 322 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 704 tasks      | elapsed:    2.6s


[[ 0  2  0  0  0]
 [ 0 19  0  0  0]
 [ 0  4  0  0  0]
 [ 0  8  0  0  0]
 [ 0  2  0  0  0]] : is the confusion matrix


[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    3.0s finished


In [86]:
nbModel_grid.score(X_test, y_test)

0.5428571428571428