### Building and Testing classification models to predict salaries from the text contained in the job descriptions.

In [1]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from operator import itemgetter
from sklearn.metrics import classification_report
import csv
import numpy as np
import os
import pandas as pd

salary = pd.read_csv("Train_rev1.csv")

#####  Get first 15K rows, and assign an indicator variable for High / Low salary

In [2]:
JD = salary[['FullDescription', 'SalaryNormalized']][:15000]

mask = np.percentile(JD['SalaryNormalized'], 75)

JD.loc[JD['SalaryNormalized'] >= mask,'Class'] = 1
JD.loc[JD['Class'] != 1, 'Class'] = 0

##### Building the training and the test set

In [3]:
import random
from patsy import dmatrices
from sklearn.cross_validation import train_test_split
from collections import Counter

X_train, X_test, y_train, y_test = train_test_split(JD['FullDescription'], JD['Class'], test_size=0.4, random_state=1)
Counter(y_train)

Counter({0.0: 6564, 1.0: 2436})

#####0 is the lower salary, and 1 is the higher
#####We have High Salaries to Low Salries in the ratio 1:3, so we have to use a stratified sampling method to train our model well.

In [4]:
y_train_df = pd.DataFrame(y_train)

y_train_df_0 = y_train_df[y_train_df['Class']==0]

In [5]:
import random as rd

#we find that there are 2436 observations greater than 75th percentile in the training data. 

y_train_df = pd.DataFrame(y_train)

y_train_df_0 = y_train_df[y_train_df['Class']==0]

sampleset = random.sample(y_train_df_0.index,2436)

In [6]:
y_train_0 = y_train.ix[sampleset]

X_train_0 = X_train.ix[sampleset]

y1index = y_train_df[y_train_df['Class']==1].index

y_train_1 = y_train.ix[y1index]

X_train_1 = X_train.ix[y1index]

#### Concatenating the X train & Y trains

In [7]:
X_train = pd.concat([X_train_1,X_train_0], axis = 0)
y_train = pd.concat([y_train_1,y_train_0], axis = 0)

In [8]:
X_train = np.array(X_train)
y_train = np.array(y_train)

X_test = np.array(X_test)
y_test = np.array(y_test)

vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 1), strip_accents='unicode', norm='l2')

In [9]:
X_train_Orig = X_train
X_test_Orig = X_test

#Unigram, no stop word removal, no lemma

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

nb_classifier = MultinomialNB().fit(X_train, y_train)

y_nb_predicted = nb_classifier.predict(X_test)

print "MODEL: Multinomial Naive Bayes\n"

print 'The precision for this classifier is ' + str(metrics.precision_score(y_test, y_nb_predicted))
print 'The recall for this classifier is ' + str(metrics.recall_score(y_test, y_nb_predicted))
print 'The f1 for this classifier is ' + str(metrics.f1_score(y_test, y_nb_predicted))
print 'The accuracy for this classifier is ' + str(metrics.accuracy_score(y_test, y_nb_predicted))

print '\nHere is the classification report:'
print classification_report(y_test, y_nb_predicted)

print '\nHere is the confusion matrix:'
print metrics.confusion_matrix(y_test, y_nb_predicted)

MODEL: Multinomial Naive Bayes

The precision for this classifier is 0.585229911751
The recall for this classifier is 0.793450881612
The f1 for this classifier is 0.673616680032
The accuracy for this classifier is 0.7965

Here is the classification report:
             precision    recall  f1-score   support

        0.0       0.91      0.80      0.85      4412
        1.0       0.59      0.79      0.67      1588

avg / total       0.83      0.80      0.80      6000


Here is the confusion matrix:
[[3519  893]
 [ 328 1260]]


#### Now lets lemmatize the corpus and see if our accuracy measures improve or worsen
#### Our speculation is that lemmatization will probably decrease the accuracy, as we are bringing each word closer to its root (albeit within context). This is beacuse we are going towards a generalization of each word, hence increasing recall, and negatively affecting precision.

In [None]:
#lemmatize
import nltk
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()

def LemmaTokenize(text):
    return [wnl.lemmatize(t) for t in nltk.word_tokenize(text)]

vectorizer = TfidfVectorizer(min_df=2, 
 ngram_range=(1, 1), 
 tokenizer = LemmaTokenize, 
 strip_accents='unicode', 
 norm='l2')

X_train = X_train_Orig
X_test = X_test_Orig

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

nb_classifier = MultinomialNB().fit(X_train, y_train)

y_nb_predicted = nb_classifier.predict(X_test)

print "MODEL: Multinomial Naive Bayes\n"

print 'The precision for this classifier is ' + str(metrics.precision_score(y_test, y_nb_predicted))
print 'The recall for this classifier is ' + str(metrics.recall_score(y_test, y_nb_predicted))
print 'The f1 for this classifier is ' + str(metrics.f1_score(y_test, y_nb_predicted))
print 'The accuracy for this classifier is ' + str(metrics.accuracy_score(y_test, y_nb_predicted))

print '\nHere is the classification report:'
print classification_report(y_test, y_nb_predicted)

print '\nHere is the confusion matrix:'
print metrics.confusion_matrix(y_test, y_nb_predicted)

#### The output has confirmed our speculation! 
#### Retaining the non-lemmatized data, and removing stop words we will run the model.
##### We expect that removing stop words should give us better results, as the other words in each Job Descriptions will lend more predictive power

In [20]:
#Removing Stop words & not lemmatizing

vectorizer = TfidfVectorizer(min_df=2, 
 ngram_range=(1, 1), 
 stop_words = 'english',                             
 strip_accents='unicode', 
 norm='l2')

X_train = X_train_Orig
X_test = X_test_Orig

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

nb_classifier = MultinomialNB().fit(X_train, y_train)

y_nb_predicted = nb_classifier.predict(X_test)

print "MODEL: Multinomial Naive Bayes\n"

print 'The precision for this classifier is ' + str(metrics.precision_score(y_test, y_nb_predicted))
print 'The recall for this classifier is ' + str(metrics.recall_score(y_test, y_nb_predicted))
print 'The f1 for this classifier is ' + str(metrics.f1_score(y_test, y_nb_predicted))
print 'The accuracy for this classifier is ' + str(metrics.accuracy_score(y_test, y_nb_predicted))

print '\nHere is the classification report:'
print classification_report(y_test, y_nb_predicted)

print '\nHere is the confusion matrix:'
print metrics.confusion_matrix(y_test, y_nb_predicted)


MODEL: Multinomial Naive Bayes

The precision for this classifier is 0.577605826127
The recall for this classifier is 0.799118387909
The f1 for this classifier is 0.670541611625
The accuracy for this classifier is 0.792166666667

Here is the classification report:
             precision    recall  f1-score   support

        0.0       0.92      0.79      0.85      4412
        1.0       0.58      0.80      0.67      1588

avg / total       0.83      0.79      0.80      6000


Here is the confusion matrix:
[[3484  928]
 [ 319 1269]]


#### We got a marginally lower accuracy after removing stop words. However the three above methods are very close in their 
####  accuracy measures, thereby not making a huge difference in the appropriateness of any given approach. 

#### Top 10 words (excluding stopwords) that are most indicative of (1) high salary, and (0) low salary.

In [21]:
def most_informative_feature_for_binary_classification(vectorizer, classifier, n=10):
    class_labels = classifier.classes_
    feature_names = vectorizer.get_feature_names()
    topn_class1 = sorted(zip(classifier.coef_[0], feature_names))[:n]
    topn_class2 = sorted(zip(classifier.coef_[0], feature_names))[-n:]

    for coef, feat in topn_class1:
        print class_labels[0], coef, feat

    print

    for coef, feat in reversed(topn_class2):
        print class_labels[1], coef, feat


most_informative_feature_for_binary_classification(vectorizer, nb_classifier)

0.0 -10.3692114739 00am6
0.0 -10.3692114739 07
0.0 -10.3692114739 100th
0.0 -10.3692114739 10pm6am
0.0 -10.3692114739 11am
0.0 -10.3692114739 11pm
0.0 -10.3692114739 11th
0.0 -10.3692114739 1218
0.0 -10.3692114739 13th
0.0 -10.3692114739 15am

1.0 -5.77487589423 experience
1.0 -5.91380705657 manager
1.0 -5.92144768439 business
1.0 -6.06329085478 management
1.0 -6.12457163672 home
1.0 -6.12667283877 team
1.0 -6.15931903738 role
1.0 -6.17595273528 development
1.0 -6.20834795695 work
1.0 -6.22195537794 skills


We find that the words like experience, business, manager, management, team, home, role, etc., matter more or appear more frequently in jobs that have a high salary while the low paying jobs have less intuitive words that are more frequent like the time.

####  Test the model by including the bigram words in our model. This is done by 2 methods. 
#### Include the bigram words in the model using the vectorizer to generate the tdidf matrix and then run the naive bayes model. 
We would expect introduction of bigrams to improve our model as they model could be classified by words that appear together frequently than the appears of the unigrams themselves.

In [22]:
vectorizer = TfidfVectorizer(min_df=2, 
 ngram_range=(1, 2),  
 strip_accents='unicode',
 lowercase = True,
 norm='l2')

X_train = X_train_Orig
X_test = X_test_Orig

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

nb_classifier = MultinomialNB().fit(X_train, y_train)

y_nb_predicted = nb_classifier.predict(X_test)

print "MODEL: Multinomial Naive Bayes\n"

print 'The precision for this classifier is ' + str(metrics.precision_score(y_test, y_nb_predicted))
print 'The recall for this classifier is ' + str(metrics.recall_score(y_test, y_nb_predicted))
print 'The f1 for this classifier is ' + str(metrics.f1_score(y_test, y_nb_predicted))
print 'The accuracy for this classifier is ' + str(metrics.accuracy_score(y_test, y_nb_predicted))

print '\nHere is the classification report:'
print classification_report(y_test, y_nb_predicted)

print '\nHere is the confusion matrix:'


print metrics.confusion_matrix(y_test, y_nb_predicted)

MODEL: Multinomial Naive Bayes

The precision for this classifier is 0.580186583741
The recall for this classifier is 0.82241813602
The f1 for this classifier is 0.680385517062
The accuracy for this classifier is 0.7955

Here is the classification report:
             precision    recall  f1-score   support

        0.0       0.92      0.79      0.85      4412
        1.0       0.58      0.82      0.68      1588

avg / total       0.83      0.80      0.80      6000


Here is the confusion matrix:
[[3467  945]
 [ 282 1306]]


From the above we see that the accuracy of our model has increased by using the bigram taggers and better than the results obtained

#####2. We also method below, where we extract the POS tags of Bigrams to add to the document matrix and use the vectorize on this matrix for classification
We would expect the model to perform better than an unigram model

In [None]:
from nltk import bigrams

salary = pd.read_csv("Train_rev1.csv")
JD_new = salary[['FullDescription', 'SalaryNormalized']][:15000]

mask = np.percentile(JD_new['SalaryNormalized'], 75)

JD_new.loc[JD_new['SalaryNormalized'] >= mask,'Class'] = 1
JD_new.loc[JD_new['Class'] != 1, 'Class'] = 0


def taglist(text):
    text = (text.decode('utf-8'))
    tokens = nltk.word_tokenize(text)
    pos =  nltk.pos_tag(tokens)
    tags = [t for w,t in pos]
    bigram_list =  list(nltk.ngrams(tags,2))
    listtoappend = [pos1+ '_' + pos2 for pos1, pos2 in bigram_list]
    pos_words = ' '.join(listtoappend)
    return text +' '+ pos_words

JD_new['FullDescription'] = JD_new['FullDescription'][:15000].map(taglist)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(JD_new['FullDescription'], JD_new['Class'], test_size=0.4, random_state=1)

y_train_df = pd.DataFrame(y_train)

y_train_df_0 = y_train_df[y_train_df['Class']==0]

y_train_df = pd.DataFrame(y_train)

y_train_df_0 = y_train_df[y_train_df['Class']==0]

sampleset = random.sample(y_train_df_0.index,2431)

y_train_0 = y_train.ix[sampleset]

X_train_0 = X_train.ix[sampleset]

y1index = y_train_df[y_train_df['Class']==1].index

y_train_1 = y_train.ix[y1index]

X_train_1 = X_train.ix[y1index]

X_train = pd.concat([X_train_1,X_train_0], axis = 0)
y_train = pd.concat([y_train_1,y_train_0], axis = 0)

X_train = np.array(X_train)
y_train = np.array(y_train)

X_test = np.array(X_test)
y_test = np.array(y_test)


vectorizer = TfidfVectorizer(min_df=2, 
 ngram_range=(1, 1), 
 strip_accents='unicode', 
 lowercase = True,                            
 norm='l2')

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

nb_classifier = MultinomialNB().fit(X_train, y_train)

y_nb_predicted = nb_classifier.predict(X_test)

print "MODEL: Multinomial Naive Bayes\n"

print 'The precision for this classifier is ' + str(metrics.precision_score(y_test, y_nb_predicted))
print 'The recall for this classifier is ' + str(metrics.recall_score(y_test, y_nb_predicted))
print 'The f1 for this classifier is ' + str(metrics.f1_score(y_test, y_nb_predicted))
print 'The accuracy for this classifier is ' + str(metrics.accuracy_score(y_test, y_nb_predicted))

print '\nHere is the classification report:'
print classification_report(y_test, y_nb_predicted)

print '\nHere is the confusion matrix:'
print metrics.confusion_matrix(y_test, y_nb_predicted)

#### The accuracy obtained for POS tagged Bigrams is slightly less than the one with the bigram tagger in the vectorizer and also lower than the unigram tagger.