In [1]:
# Importing the required libraries
import pandas as pd
import nltk
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier

  from numpy.core.umath_tests import inner1d


In [2]:
##################################################################################################################
############################## Indeed.com (Train and Validate) ###################################################

# Reading an INPUT file of unique reviews from 'Indeed.com' to pre-process the data
input = 'indeed_no_duplicates.csv'
inputFile = pd.read_csv(input, encoding='ISO-8859-1', engine='python')   # read the file and fields

# Ascertaining the length of unique reviews
print("\nThe total number of unique reviews:", len(inputFile))

# Dividing the entire data set into train and test with a 70:30 ratio
inputFile_train = inputFile.iloc[:int(len(inputFile)*.70)]
print("\nTotal number of unique reviews in training data set:", len(inputFile_train))

inputFile_test = inputFile.iloc[int(len(inputFile)*.70):]
print("\nTotal number of unique reviews in testing data set:", len(inputFile_test))

# Initializing the object for Lemmatization, Term-Document matrix, and TF-IDF matrix
# Including bag of for uni, bi, and tri - gram words & removing English stop words
lemmatizer = nltk.stem.WordNetLemmatizer()
count_vect = CountVectorizer(ngram_range=(1,3), stop_words='english')
tfidf_vect = TfidfVectorizer(ngram_range=(1,3), stop_words='english')

# Initializing the lists for TRAINING DATA for Predictor variables - 471 reviews
token3_train_x = []
pos_train_x = []

# Running the loop to tokenize, lemmatize, and remove the stop words from the reviews
for i,rows in inputFile_train.iterrows():
    print("\nReview number:", i)
    token_text = nltk.word_tokenize(rows['description'].lower())
    token_text_for_POS = nltk.word_tokenize(rows['description'])
    pos_train_x.append(token_text_for_POS)
    lemmatized_text = [lemmatizer.lemmatize(token) for token in token_text if token.isalpha()]
    text_wo_stop_words = [token for token in lemmatized_text if not token in stopwords.words('english') if token.isalpha()]
    token2 = ''
    for token in text_wo_stop_words:
        token2 = token2 + ' ' + token
    token3_train_x.append(token2)

# Fitting & transforming the vector
X_train_counts = count_vect.fit_transform(token3_train_x)
print("The shape of term-document matrix for TRAINING DATA for Predictor variables:", X_train_counts.shape)

tfidf_vect.fit(token3_train_x)
X_train_tfidf = tfidf_vect.transform(token3_train_x)
print("The shape of tf-idf matrix for TRAINING DATA for Predictor variables:", X_train_tfidf.shape)

# Initializing the lists for TRAINING DATA for Response variables
token3_train_y = []
pos=[]

# Running the loop to tokenize, lemmatize, and remove the stop words from the reviews - 471 titles
for i,rows in inputFile_train.iterrows():
    print("\nTitle number:", i)
    token_text = nltk.word_tokenize(rows['title'].lower())
    token_text_for_POS = nltk.word_tokenize(rows['title'])
    pos.append(token_text_for_POS)
    lemmatized_text = [lemmatizer.lemmatize(token) for token in token_text if token.isalpha()]
    text_wo_stop_words = [token for token in lemmatized_text if not token in stopwords.words('english') if token.isalpha()]
    token2 = ''
    for token in text_wo_stop_words:
        token2 = token2 + ' ' + token
    token3_train_y.append(token2)

# Fitting & transforming the vector
Y_train_counts = count_vect.fit_transform(token3_train_y)
print("The shape of term-document matrix for TRAINING DATA for Response variable:", Y_train_counts.shape)

tfidf_vect.fit(token3_train_y)
Y_train_tfidf = tfidf_vect.transform(token3_train_y)
print("The shape of tf-idf matrix for TRAINING DATA for Response variable:", Y_train_tfidf.shape)

# Initializing the lists for TESTING DATA for Predictor variables - 203 reviews
token3_test_x = []
pos=[]

for i,rows in inputFile_test.iterrows():
    print("\nReview number:", i)
    token_text = nltk.word_tokenize(rows['description'].lower())
    token_text_for_POS = nltk.word_tokenize(rows['description'])
    pos.append(token_text_for_POS)
    lemmatized_text = [lemmatizer.lemmatize(token) for token in token_text if token.isalpha()]
    text_wo_stop_words = [token for token in lemmatized_text if not token in stopwords.words('english') if token.isalpha()]
    token2_target = ''
    for token in text_wo_stop_words:
        token2_target = token2_target + ' ' + token
    token3_test_x.append(token2_target)

# Fitting & transforming the vector
X_test_counts = count_vect.fit_transform(token3_test_x)
print("The shape of term-document matrix for TESTING DATA for Predictor variable:", X_test_counts.shape)

tfidf_vect.fit(token3_test_x)
X_test_tfidf = tfidf_vect.transform(token3_test_x)
print("The shape of tf-idf matrix for TESTING DATA for Predictor variable:", X_test_tfidf.shape)

# Initializing the lists for TESTING DATA for Response variables - 203 titles
token3_test_y = []
pos=[]

for i,rows in inputFile_test.iterrows():
    print("\nTitle number:", i)
    token_text = nltk.word_tokenize(rows['title'].lower())
    token_text_for_POS = nltk.word_tokenize(rows['title'])
    pos.append(token_text_for_POS)
    lemmatized_text = [lemmatizer.lemmatize(token) for token in token_text if token.isalpha()]
    text_wo_stop_words = [token for token in lemmatized_text if not token in stopwords.words('english') if token.isalpha()]
    token2_target = ''
    for token in text_wo_stop_words:
        token2_target = token2_target + ' ' + token
    token3_test_y.append(token2_target)

# Fitting & transforming the vector
Y_test_counts = count_vect.fit_transform(token3_test_y)
print("The shape of term-document matrix for TESTING DATA for Response variable:", Y_test_counts.shape)

tfidf_vect.fit(token3_test_y)
Y_test_tfidf = tfidf_vect.transform(token3_test_y)
print("The shape of term-document matrix for TESTING DATA for Response variable:", Y_test_tfidf.shape)



The total number of unique reviews: 674

Total number of unique reviews in training data set: 471

Total number of unique reviews in testing data set: 203

Review number: 0

Review number: 1

Review number: 2

Review number: 3

Review number: 4

Review number: 5

Review number: 6

Review number: 7

Review number: 8

Review number: 9

Review number: 10

Review number: 11

Review number: 12

Review number: 13

Review number: 14

Review number: 15

Review number: 16

Review number: 17

Review number: 18

Review number: 19

Review number: 20

Review number: 21

Review number: 22

Review number: 23

Review number: 24

Review number: 25

Review number: 26

Review number: 27

Review number: 28

Review number: 29

Review number: 30

Review number: 31

Review number: 32

Review number: 33

Review number: 34

Review number: 35

Review number: 36

Review number: 37

Review number: 38

Review number: 39

Review number: 40

Review number: 41

Review number: 42

Review number: 43

Review number: 44


Review number: 409

Review number: 410

Review number: 411

Review number: 412

Review number: 413

Review number: 414

Review number: 415

Review number: 416

Review number: 417

Review number: 418

Review number: 419

Review number: 420

Review number: 421

Review number: 422

Review number: 423

Review number: 424

Review number: 425

Review number: 426

Review number: 427

Review number: 428

Review number: 429

Review number: 430

Review number: 431

Review number: 432

Review number: 433

Review number: 434

Review number: 435

Review number: 436

Review number: 437

Review number: 438

Review number: 439

Review number: 440

Review number: 441

Review number: 442

Review number: 443

Review number: 444

Review number: 445

Review number: 446

Review number: 447

Review number: 448

Review number: 449

Review number: 450

Review number: 451

Review number: 452

Review number: 453

Review number: 454

Review number: 455

Review number: 456

Review number: 457

Review number: 458



Title number: 408

Title number: 409

Title number: 410

Title number: 411

Title number: 412

Title number: 413

Title number: 414

Title number: 415

Title number: 416

Title number: 417

Title number: 418

Title number: 419

Title number: 420

Title number: 421

Title number: 422

Title number: 423

Title number: 424

Title number: 425

Title number: 426

Title number: 427

Title number: 428

Title number: 429

Title number: 430

Title number: 431

Title number: 432

Title number: 433

Title number: 434

Title number: 435

Title number: 436

Title number: 437

Title number: 438

Title number: 439

Title number: 440

Title number: 441

Title number: 442

Title number: 443

Title number: 444

Title number: 445

Title number: 446

Title number: 447

Title number: 448

Title number: 449

Title number: 450

Title number: 451

Title number: 452

Title number: 453

Title number: 454

Title number: 455

Title number: 456

Title number: 457

Title number: 458

Title number: 459

Title numbe


Title number: 658

Title number: 659

Title number: 660

Title number: 661

Title number: 662

Title number: 663

Title number: 664

Title number: 665

Title number: 666

Title number: 667

Title number: 668

Title number: 669

Title number: 670

Title number: 671

Title number: 672

Title number: 673
The shape of term-document matrix for TESTING DATA for Response variable: (203, 246)
The shape of term-document matrix for TESTING DATA for Response variable: (203, 246)


In [3]:
##################################################################################################################
############################## Indeed.com (Model Training) #######################################################

# Naïve Bayes Model
clf = MultinomialNB().fit(X_train_tfidf, token3_train_y)
text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', MultinomialNB()),])
text_clf = text_clf.fit(token3_train_x, token3_train_y)
predicted = text_clf.predict(token3_test_x)
print("The prediction accuracy for Naive Bayes is:", np.mean(predicted == token3_test_y)*100, "%")

# Logistic Regression Model
text_clf_logit = Pipeline([('vect1', CountVectorizer()), ('tfidf1', TfidfTransformer()), ('clf-lr', LogisticRegression(penalty='l2', random_state=42)),])
model_logit = text_clf_logit.fit(token3_train_x, token3_train_y)
predicted_logit = text_clf_logit.predict(token3_test_x)
print("The prediction accuracy for Logistic Regression is:", np.mean(predicted_logit == token3_test_y)*100, "%")

# Random Forest
text_clf_rf = Pipeline([('vect2', CountVectorizer()), ('tfidf2', TfidfTransformer()), ('clf-rf', RandomForestClassifier(n_estimators = 25, random_state=42)),])
model_rf = text_clf_rf.fit(token3_train_x, token3_train_y)
predicted_rf = text_clf_rf.predict(token3_test_x)
print("The prediction accuracy for Random Forest is:", np.mean(predicted_rf == token3_test_y)*100, "%")

# Support Vector Machine (SVM)
text_clf_svm = Pipeline([('vect3', CountVectorizer()), ('tfidf3', TfidfTransformer()), ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=15, random_state=22)),])
model_svm = text_clf_svm.fit(token3_train_x, token3_train_y)
predicted_svm = text_clf_svm.predict(token3_test_x)
print("The prediction accuracy for Support Vector Machine is:", np.mean(predicted_svm == token3_test_y)*100, "%")

# Storing the results of SVM Model
pred_svm = pd.DataFrame(predicted_svm)
pred_svm['actual'] = pd.DataFrame(token3_test_y)

The prediction accuracy for Naive Bayes is: 2.955665024630542 %
The prediction accuracy for Logistic Regression is: 10.83743842364532 %
The prediction accuracy for Random Forest is: 17.733990147783253 %
The prediction accuracy for Support Vector Machine is: 20.689655172413794 %


In [4]:
##################################################################################################################
############################## Craigslist.com (Test) #############################################################

# Reading a TEST file of reviews to pre-process the data
output = 'craigslist_1.csv'
outputFile = pd.read_csv(output, encoding='ISO-8859-1', engine='python')   # read the file and fields
print("\nTotal number of unique reviews in testing data set of Craigslist:", len(outputFile))

# Pre-processing the TEST file data for both columns of title and description
desc = list(outputFile['desc'])
desc_modified=[]
for i in range(0, len(desc)):
    desc_modified.append(desc[i].strip())

title = list(outputFile['title'])
title_modified=[]
for i in range(0, len(title)):
    title_modified.append(title[i].strip())

# PREDICTING the results from the SVM Model to the test set of Craigslist title
predicted_cs_svm = text_clf_svm.predict(desc_modified)

# Storing the results of SVM Model
pred_cs = pd.DataFrame(predicted_cs_svm)
pred_cs['actual_cs'] = pd.DataFrame(title_modified)

# Writing the csv file consisting of both actual and predicted values of job titles
pred_cs.to_csv('pred_craigslist.csv')

##################################################################################################################


Total number of unique reviews in testing data set of Craigslist: 120
