In [1]:
import pandas as pd
df = pd.read_csv('data/resume data modified Extra Rows.csv')

#verify data was loaded correctly 
df.head()

Unnamed: 0,job_ad_id,job_type,job_fed_contractor,job_ownership,job_req_communication,job_req_organization,received_callback,honors,worked_during_school,special_skills,volunteer,military,employment_holes,experience_delta,computer_match,school_match
0,384,supervisor,0,unknown,0,0,0,0,0,0,0,0,1,1.0,RY,NC
1,384,supervisor,0,unknown,0,0,0,0,1,0,1,1,0,1.0,RY,NS
2,384,supervisor,0,unknown,0,0,0,0,1,0,0,0,0,1.0,RY,NC
3,384,supervisor,0,unknown,0,0,0,0,0,1,1,0,1,1.0,RY,NS
4,385,secretary,0,nonprofit,0,1,0,0,1,0,0,0,0,22.0,RY,NS


In [2]:
#determine set of values for all non-numerical data to transform later

print(set(df['job_type']))
print(set(df['job_ownership']))
print(set(df['computer_match']))
print(set(df['school_match']))

{'supervisor', 'sales_rep', 'secretary', 'manager', 'retail_sales', 'clerical'}
{'nonprofit', 'unknown', 'private', 'public'}
{'RY', 'NY', 'NN', 'RN'}
{'HC', 'CN', 'NC', 'NN', 'HS', 'CS', 'CC', 'SS', 'SC', 'NS'}


In [3]:
#map non-numerical values to integers 
job_type_map = {'clerical':0, 'manager': 1, 'retail_sales': 2, 'sales_rep': 3, 'secretary':4, 'supervisor': 5}
job_ownership_map = {'nonprofit':0, 'private':1, 'public':2, 'unknown':3}
computer_match_map = {'NN':0, 'NY':1, 'RN':2, 'RY':3}
school_match_map = {'CC':0, 'CN':1, 'CS':2, 'HC':3, 'HS':4, 'NC':5, 'NN':6, 'NS':7, 'SC':8, 'SS':9}

#change non-numerical columns to mapped integer values
df['job_type']=df['job_type'].map(job_type_map)
df['job_ownership']=df['job_ownership'].map(job_ownership_map)
df['computer_match']=df['computer_match'].map(computer_match_map)
df['school_match']=df['school_match'].map(school_match_map)

#drop unnecessary columns
df.drop(['job_ad_id'], axis=1)

#verify changes look as intended 
df.head()

Unnamed: 0,job_ad_id,job_type,job_fed_contractor,job_ownership,job_req_communication,job_req_organization,received_callback,honors,worked_during_school,special_skills,volunteer,military,employment_holes,experience_delta,computer_match,school_match
0,384,5,0,3,0,0,0,0,0,0,0,0,1,1.0,3,5
1,384,5,0,3,0,0,0,0,1,0,1,1,0,1.0,3,7
2,384,5,0,3,0,0,0,0,1,0,0,0,0,1.0,3,5
3,384,5,0,3,0,0,0,0,0,1,1,0,1,1.0,3,7
4,385,4,0,0,0,1,0,0,1,0,0,0,0,22.0,3,7


In [4]:
#check for any negative values in dataset
df.min()

#normalize values in columns with negative values, to ensure MultinomialNB can work with them
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[['experience_delta']] = scaler.fit_transform(df[['experience_delta']])

In [53]:
from sklearn.model_selection import train_test_split

#partition data into training and testing
X_train, X_test, y_train, y_test = train_test_split(df[df.columns.difference(['received_callback', 'job_ad_id'])], df['received_callback'], test_size=0.3)

In [54]:
from sklearn.naive_bayes import MultinomialNB

#fit model on the built in Naive Bayes method from sklearn
naive_bayes = MultinomialNB()
model = naive_bayes.fit(X_train, y_train)
model

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [55]:
#predict explanatory variable from testing data 
prediction = model.predict(X_test)

In [56]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

#evaluate effectiveness of model based on sklearn metrics
print('Accuracy Score',accuracy_score(y_test,prediction))
print('Precision Score',precision_score(y_test,prediction))
print('Recall Score',recall_score(y_test,prediction))
print('F1 Score',f1_score(y_test,prediction))

Accuracy Score 0.8137890394814379
Precision Score 0.5
Recall Score 0.006329113924050633
F1 Score 0.012499999999999999
