In [2]:
# This notebook creates a fastText model based on questions and their tags, and uses this model to predict 
# corresponding tags for professionals based on their listed industry and headline.
#
# A professional whose tags match those of a question is likely to answer it.
#
# While CareerVillage does already collect tags for professionals, they are not always accurately filled in. 
# This is an attempt to fill in this missing information using natural language processing.

# CareerVillage.org has provided several years of anonymized data and each file comes from a table in their database.
# professionals.csv: We call our volunteers "Professionals", but we might as well call them Superheroes. They're the grown ups who volunteer their time to answer questions on the site.
# questions.csv: Questions get posted by students. Sometimes they're very advanced. Sometimes they're just getting started. It's all fair game, as long as it's relevant to the student's future professional success.
# tag_questions.csv: Every question can be hashtagged. We track the hashtag-to-question pairings, and put them into this file.
# tags.csv: Each tag gets a name.      
import numpy as np
import pandas as pd
import fastText as ft
import re
import nltk
# Fetch the NLTK "popular" data bundle; nltk.word_tokenize and nltk.pos_tag are used
# further down. This needs network access - NOTE(review): when the download fails NLTK
# only prints an error, so later cells work only if the data is already installed.
nltk.download('popular')

# Load the CareerVillage tables described above, one DataFrame per CSV file.
df_p = pd.read_csv('../input/professionals.csv')
df_q = pd.read_csv('../input/questions.csv')
df_tq = pd.read_csv('../input/tag_questions.csv')
df_t = pd.read_csv('../input/tags.csv')

[nltk_data] Error loading popular: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>


In [3]:
# train fastText model with tags and question tags

# combine all tags for a question into a comma-separated string
def combine_tags(x):
    """Add a 'combined_tag_names' column holding every tag name in `x`, comma-joined.

    `x` is a DataFrame with a 'tags_tag_name' column (here: all rows belonging to
    one question). The same joined string is written to every row. `x` is modified
    in place and also returned.
    """
    joined_names = x['tags_tag_name'].str.cat(sep=',')
    x['combined_tag_names'] = joined_names
    return x

# Get questions and their tags to use as input to the fastText classification algorithm.
# TODO: this training data set can come from any data source; another possible source
# would be https://www.bls.gov/soc/ to generate tags for occupations.

# Attach the tag name to every tag-to-question pairing ...
df_tqt = pd.merge(df_tq, df_t, left_on='tag_questions_tag_id', right_on='tags_tag_id')

# ... then attach the question text, keeping only rows where every needed field is present.
df_tq_text = pd.merge(df_tqt, df_q, left_on='tag_questions_question_id', right_on='questions_id')
df_tq_text = df_tq_text[['questions_id', 'tags_tag_name', 'questions_title', 'questions_body']].dropna()

# Comma-join all tag names of a question. transform() broadcasts the joined string back
# onto every row of the group and keeps the original index, without relying on
# groupby.apply() with a function that mutates the group (unsupported by pandas and
# version-dependent in the shape of its result).
combined_tag_names = (
    df_tq_text
    .groupby('questions_id')['tags_tag_name']
    .transform(lambda names: ','.join(names))
)

# Collapse to one row per question: id, title, body and its combined tags.
df_tqfull = (
    df_tq_text
    .assign(combined_tag_names=combined_tag_names)
    [['questions_id', 'questions_title', 'questions_body', 'combined_tag_names']]
    .drop_duplicates()
)
df_tqfull.head()

Unnamed: 0,questions_id,questions_title,questions_body,combined_tag_names
0,cb43ebee01364c68ac61d347a393ae39,Can you attend a college for a 'minor' without...,What if you already have a major at another co...,"minor,college,college-major"
3,47f55e85ce944242a5a347ab85a8ffb4,Deciding a minor,I'm in my first year of university and I recen...,"minor,university,business,college,college-major"
8,ccc30a033a0f4dfdb2eb987012f25792,Is psychology useful?,Since I hope to become a pediatric intensivist...,"minor,medicine,science,pediatrics,er-doctor,pi..."
26,e30b274e48d741f7bf50eb5e7171a3c0,Should I purse a minor?,I was considering a minor for my school resume...,"minor,school,resume,college,science-major,coll..."
33,3d22742052df4989b311b4195cbb0f1a,What would be a good minor to go with a biolog...,Should I minor in a field that is different fr...,"minor,science,biology,college,major"


In [4]:
# convert above dataframe to a training .txt file of the form:
# __label__er-doctor __label__pediatrics is psychology useful? since i hope to become a pediatric intensivist after graduating from med school, i already know that   however, i am very interested in psychology and was wondering if it would help me in my field? in addition, if it is indeed useful, then should i get a major or minor in it because my gpa must be high for med school?  thank you!!  #medicine  #psychology #icu #pediatrics #science #doctor #er-doctor #picu #intensive-care-unit #intensive-care-nursing #college-major #college #medical-school #medical-education #majors-and-minors #college-minor #minor #double-major 
# __label__business deciding a minor i'm in my first year of university and i recently decide that i'd like to minor in a different program from the one i'm in. but how do you decide a minor? should it be based on your interests? or based on a possible career path thats different from the one your'e currently on? #minor #university #business #college  #college-major 
# __label__mechanical-engineering what are some good tips for finding a job without experience? i am a graduate student in electrical engineering and i will be graduating very soon. it has been hard for me to find a job by searching and applying for jobs online. i find that my application gets turned down often because i lack engineering work experience. your tips on this matter will be very much appreciated.   #engineering #career #engineer #mechanical-engineering #job-search #human-resources #electrical-engineering #job-search-strategies
# __label__electrical-engineering what are some good tips for finding a job without experience? i am a graduate student in electrical engineering and i will be graduating very soon. it has been hard for me to find a job by searching and applying for jobs online. i find that my application gets turned down often because i lack engineering work experience. your tips on this matter will be very much appreciated.   #engineering #career #engineer #mechanical-engineering #job-search #human-resources #electrical-engineering #job-search-strategies

# ignore anything but nouns for training
def get_nouns(sentence):
    """Return the nouns of `sentence`, in their original order, each followed by a space.

    The sentence is tokenized and part-of-speech tagged with NLTK; a token is kept
    when its tag starts with 'N'. The result is '' when no token qualifies.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
    return ''.join(word + ' ' for word, pos in tagged_tokens if pos.startswith('N'))

# Tags that say nothing about a field of work; they are never turned into labels.
_EXCLUDED_TAGS = frozenset([
    '', 'any', 'question', 'professional', 'zjz', '零售', 'yui', 'zdijhvgiuasbvmnv',
    'help', 'justwondering', 'employee', 'career-choice', 'career-path',
    'job-search', 'ixzz4ofjuqwa1', 'high-school', 'advice', 'work',
    'employment', 'hiring', 'umatter', 'skills', 'knowledge', 'thank-you',
    'study', 'experts', 'information', 'schooling', 'worker', 'iit',
    'importance', 'nevergiveup', 'school', 'college',
])


def generate_labels(combined_tag_name):
    """Convert a comma-separated tag string into fastText label prefixes.

    input:  'er-doctor, pediatrics'
    output: '__label__er-doctor __label__pediatrics ' (each label is followed by a space)

    Surrounding whitespace and a leading run of '#' / '#-' are removed from each tag,
    and tags listed in _EXCLUDED_TAGS (including the empty tag) are skipped.
    Returns '' when no tag survives.
    """
    labels = []
    for tag in combined_tag_name.split(','):
        # Trim first so that 'a, b' does not yield the broken label '__label__ b'.
        tag = re.sub(r'^(#(-)*)+', '', tag.strip())
        if tag not in _EXCLUDED_TAGS:
            labels.append('__label__' + tag + ' ')
    return ''.join(labels)

# generate a line of training text from a dataframe row
# convert all text to lowercase
def generate_training_line(row):
    """Build one lowercase, newline-terminated fastText training line from `row`.

    `row` must provide 'combined_tag_names', 'questions_title' and 'questions_body'.
    The line consists of the labels for the tags, followed by the nouns found in
    the title and body; embedded line breaks are replaced by spaces.
    """
    question_text = row['questions_title'] + ' ' + row['questions_body']
    content = generate_labels(row['combined_tag_names']) + ' ' + get_nouns(question_text) + ' '
    # strip out newlines (Windows-style first, then bare line feeds)
    single_line = re.sub(r'\n', ' ', re.sub(r'\r\n', ' ', content))
    return (single_line + '\n').lower()

# Build one fastText training line per question (labels followed by the nouns of the
# title and body) and store it in a new 'line' column.
df_tqfull.loc[:, 'line'] = df_tqfull.apply(generate_training_line, axis=1)


In [5]:
# Earlier experiment, kept for reference only:
# training_lines = df_tqfull['line'].head().get_values()

# All generated training lines, one per question.
lines = df_tqfull['line']

# e.g. look at a single generated line; the one-element slice keeps the result an array
p = 777
lines[p:p+1].values

array(['__label__career __label__military __label__career-counseling __label__sports-coaching __label__recruiting __label__athletics  rotc college sports sports join rotc college career athletics  \n'],
      dtype=object)

In [6]:
# Use the first 4/5 of the lines for training and hold out the last 1/5 for validation.
# NOTE(review): the split is positional and `lines` is not shuffled first, so the
# held-out part may not be representative of the training part - confirm this is intended.
training_set_size = int(lines.size/5 * 4)

training_lines = lines[0: training_set_size]
validation_lines = lines[training_set_size:]

# Every line already ends with '\n', so a plain join yields one example per file line.
training_text = ''.join(training_lines)
validation_text = ''.join(validation_lines)

# fastText reads its input as UTF-8; write it explicitly so the files do not depend on
# the platform's default encoding. The context managers close the files even on error.
with open('data.train.txt', 'w', encoding='utf-8') as f:
    f.write(training_text)

with open('data.valid.txt', 'w', encoding='utf-8') as f:
    f.write(validation_text)

# now train the model
model = ft.train_supervised('data.train.txt', lr=1.0, epoch=25, wordNgrams=2, loss='hs',
                            dim=50)

In [7]:
# Inspect the held-out lines, i.e. everything after the first `training_set_size` lines.
validation_lines

67656    __label__entrepreneurship __label__entrepreneu...
67660    __label__entrepreneurship __label__investment-...
67663    __label__entrepreneurship __label__animal-husb...
67666    __label__entrepreneurship __label__buisness __...
67669    __label__entrepreneurship  entrepreneur anythi...
67670    __label__entrepreneurship  business someday i ...
67671    __label__entrepreneurship  startup engineer en...
67672    __label__entrepreneurship  steps someone busin...
67673    __label__entrepreneurship __label__beauty-indu...
67680    __label__entrepreneurship  amount business ans...
67681    __label__entrepreneurship  are boss someone au...
67682    __label__entrepreneurship __label__entrepreneu...
67685    __label__entrepreneurship __label__beauty-indu...
67691    __label__entrepreneurship  entrepreneur name l...
67692    __label__entrepreneurship __label__entrepreneu...
67698    __label__entrepreneurship __label__start-ups _...
67703    __label__entrepreneurship __label__cooking-and.

In [8]:
# Evaluate on the held-out file. Per the fastText Python API, test() returns
# (number of examples, precision, recall), computed for the top-1 prediction by default.
model.test('data.valid.txt')

(3853, 0.39190241370360757, 0.2342901474010861)

In [9]:
# Sanity check with a free-text question; returns the predicted labels and their probabilities.
# NOTE(review): the training lines contain nouns only (see get_nouns), whereas this query
# is the raw sentence - consider passing it through get_nouns for consistency.
model.predict(['how do i become a doctor'])

([['__label__doctor']], array([[0.38243905]]))

In [10]:
# Finally, to manually test the model, generate tags for some professionals.
# dropna() discards every professional with any missing field, so each remaining row
# has both an industry and a headline to build a description from.
df_pc = df_p.dropna()
df_pc[20:24]

Unnamed: 0,professionals_id,professionals_location,professionals_industry,professionals_headline,professionals_date_joined
426,9733735b1bff4af08dd81468e07e310d,"Chicago, Illinois",Marketing and Advertising,"Project Leader, Business Manager, Marketing, N...",2013-07-12 20:33:27 UTC+0000
427,e327399c48584fcf81e433828a6d8715,"Coimbatore, Tamil Nadu, India",Computer Software,CEO @area1security,2013-07-15 16:40:20 UTC+0000
428,042d2184ee3e4e548fc3589baaa69caf,"Chicago, Illinois",Venture Capital & Private Equity,"Senior Manager, the Civic Accelerator at Point...",2013-07-15 16:43:33 UTC+0000
429,b45e7851aded479a92282ea3c66300ab,"New York, New York",Architecture & Planning,Director of Architectural Acoustics at Acentech,2013-07-15 17:24:53 UTC+0000


In [11]:
def get_professional_desc(professional_id):
    """Return the lowercase nouns of a professional's industry and headline.

    The professional is looked up by 'professionals_id' in `df_pc`; the first matching
    row is used. Raises IndexError when no row has that id.
    """
    matching_rows = df_pc[df_pc['professionals_id'] == professional_id]
    industry = matching_rows['professionals_industry'].iloc[0]
    headline = matching_rows['professionals_headline'].iloc[0]
    return get_nouns(industry + ' ' + headline).lower()

# Predict tags for one professional and show the description next to the prediction.
texts = [get_professional_desc('c1bda2764ee642b3859b1ca6839fa16c')]
labels = model.predict(texts)
[texts, labels]

#get_professional_desc('ff9b24dcebc744bebc50f2472502efb9')

[['finance private equity investment professional '],
 ([['__label__finance']], array([[0.62780333]]))]