In [176]:
import csv
import pickle
import re
from functools import lru_cache

import nltk
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from nltk.corpus import stopwords # Import the stop word list
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# The NLTK stop-word corpus must be available locally; run
# nltk.download('stopwords') once per environment if it is missing.

# The input file is tab-separated. csv.QUOTE_NONE (previously the bare
# literal 3) tells the parser to treat quote characters inside the free-text
# columns as ordinary text instead of field delimiters.
train = pd.read_csv(
    "projects_profile_match.tsv",
    header=0,
    delimiter="\t",
    quoting=csv.QUOTE_NONE,
    encoding="latin-1",
)

# Separate the target label (Preference) from the remaining columns.
y = train["Preference"]
X = train.drop("Preference", axis=1)
train.head()

Unnamed: 0,ID,ProjectDescription,ProfileDescription,Preference
0,1,We arethe transformation arm of a Fortune 100 ...,"Market Research, Employee Fixed TermResume Exa...",N
1,2,We are a PE firm seeking a junior consulting r...,Director of Market ResearchResume Examples & S...,N
2,3,We are a mid-market private equity firm lookin...,Market Research Senior ManagerResume Examples ...,N
3,4,I am launching a new commercial real estate pl...,"Senior Manager, Corporate Market ResearchResum...",N
4,5,We are a PE-backed consumer products business ...,"HBO Director, Market ResearchResume Examples &...",N


In [177]:
# Split the data 80/20 for training and testing.
# A fixed random_state makes the split (and everything derived from it)
# identical after a kernel restart; without it every run trains and tests on
# a different partition of the rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("\nX_train:\n")
print(X_train.head())
print(X_train.shape)

print("\nX_test:\n")
print(X_test.head())
print(X_test.shape)


X_train:

        ID                                 ProjectDescription  \
960    961  Hi!I'm seeking a business analyst that can hel...   
1757  1758  We are looking for a consultant to help us wit...   
6323  6324  We are an IT services organization and a plati...   
295    296  We are a PE firm that owns a BPO services/soft...   
2748  2749  BackgroundOur client's Emerging Innovation Org...   

                                     ProfileDescription  
960   Market Research, Employee Fixed TermResume Exa...  
1757  Sound working knowledge and experience using a...  
6323  Senior Manager, Corporate Market ResearchResum...  
295   Finance Accounting ManagerResume Examples & Sa...  
2748  Gather and analyze massive amounts of informat...  
(6848, 3)

X_test:

        ID                                 ProjectDescription  \
1101  1102  uBiome is a biotechnology company that sequenc...   
8267  8268  We are a Mid-Size US based enterprise seeking ...   
2605  2606  We are looking for a co

In [178]:
# Scratch walk-through of the cleaning steps on a single project description.
#
# Take the first row of the training split by POSITION. The split is shuffled,
# so row label 0 is only sometimes part of X_train and a label lookup with
# [0] raises KeyError whenever that row ended up in the test split.
sample_description = X_train["ProjectDescription"].iloc[0]

# Keep only letters; every other character is replaced by a space.
letters_only = re.sub("[^a-zA-Z]",           # The pattern to search for
                      " ",                   # The pattern to replace it with
                      str(sample_description))

lower_case = letters_only.lower()        # Convert to lower case
words = lower_case.split()

# Clean up the text: remove stop words from "words".
# The stop-word list is loaded once and turned into a set, instead of being
# re-read and scanned linearly for every single word.
english_stops = set(stopwords.words("english"))
words = [w for w in words if w not in english_stops]

In [179]:
@lru_cache(maxsize=None)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loaded only once."""
    return frozenset(stopwords.words("english"))


def description_to_words(raw_description, stops=None):
    """Convert a raw description into a string of cleaned, lower-case words.

    Parameters
    ----------
    raw_description : object
        The raw description. Non-string values are converted with ``str()``;
        ``None`` and NaN are treated as an empty description.
    stops : collection of str, optional
        Words to drop. Defaults to NLTK's English stop-word list, which is
        loaded once and cached rather than re-read on every call.

    Returns
    -------
    str
        The remaining words joined by single spaces ("" if nothing is left).
    """
    # Missing values would otherwise be stringified and leak into the
    # vocabulary as spurious tokens such as "nan".
    if raw_description is None or (
        isinstance(raw_description, float) and raw_description != raw_description
    ):
        return ""

    if stops is None:
        stops = _english_stop_words()

    # 1. Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", str(raw_description))

    # 2. Convert to lower case, split into individual words
    words = letters_only.lower().split()

    # 3. Remove stop words, then join the remaining words back into one
    #    string separated by spaces.
    return " ".join(w for w in words if w not in stops)

In [180]:
# Clean every project description in the data set, preserving row order.
project_descriptions = train["ProjectDescription"]

# Number of descriptions, taken from the size of the dataframe column
num_description = project_descriptions.size

# One cleaned string per row, looked up by row label 0 .. num_description - 1
clean_train_description = [
    description_to_words(project_descriptions[row_label])
    for row_label in range(num_description)
]


In [181]:

from sklearn.feature_extraction.text import CountVectorizer

# Configure scikit-learn's bag-of-words tool (CountVectorizer): word-level
# analysis, no custom tokenizer or preprocessor, English stop words removed,
# and the vocabulary capped at 2000 features.
vectorizer_options = {
    "analyzer": "word",
    "tokenizer": None,
    "preprocessor": None,
    "stop_words": "english",
    "max_features": 2000,
}
vectorizer = CountVectorizer(**vectorizer_options)

# fit_transform() does two things in one call: it learns the vocabulary from
# the cleaned descriptions, then encodes those same descriptions as feature
# vectors. Its input is a list of strings.
train_data_features = vectorizer.fit_transform(clean_train_description)
print("Created bag of words")
# The feature matrix is kept exactly as fit_transform() returned it; no
# conversion to a numpy array happens in this cell.

Created bag of words


In [182]:
# Feature names learned by the vectorizer. get_feature_names() was removed in
# scikit-learn 1.2 in favour of get_feature_names_out(); fall back to the old
# method only on releases that predate the new one.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

# Initialize a Random Forest classifier with 100 trees. The fixed seed makes
# the fitted model reproducible across runs.
forest = RandomForestClassifier(n_estimators=100, random_state=42)

# train_data_features has one row per row of `train`, in file order, and
# `train` carries the default 0..n-1 index, so the labels in X_train.index are
# also row positions in the feature matrix.
#
# Fit on the training rows only, with the Preference labels as the response.
# Fitting on every row (as this cell did before) also trained on the rows
# held out in X_test, so predictions on X_test said nothing about how the
# model generalizes.
#
# This may take a few minutes to run
train_rows = X_train.index.to_numpy()
forest = forest.fit(train_data_features[train_rows], y_train)

# Serialize (pickle) the forest model so that it can be used in the web front
# end to predict the values. The context manager closes the file even if
# pickling fails part-way.
with open('forest_pkl', 'wb') as fileObject:
    pickle.dump(forest, fileObject)

In [183]:
# Load the pickled forest back from disk. The context manager closes the file
# handle, which the bare open()/pickle.load() pair never did.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# a 'forest_pkl' that was produced by this notebook or another trusted source.
with open('forest_pkl', 'rb') as fileObject:
    forest_model = pickle.load(fileObject)

# Verify that there are 1712 rows and 3 columns
print (X_test.shape)
print(X_test.head())

(1712, 3)
        ID                                 ProjectDescription  \
1101  1102  uBiome is a biotechnology company that sequenc...   
8267  8268  We are a Mid-Size US based enterprise seeking ...   
2605  2606  We are looking for a consultant to help us con...   
1318  1319  Professional Engineer, active member of Nation...   
3927  3928  Naylor Association Solutions currently has a n...   

                                     ProfileDescription  
1101  Support the SVP, Talent Development & Strategy...  
8267  Structure complex, ambiguous, and potentially ...  
2605  Market Research LeadResume Examples & SamplesC...  
1318  Supports the assessment, design, and delivery ...  
3927  Market Research FellowshipResume Examples & Sa...  


In [184]:
# Row labels of the test split. The split is random, so these labels are
# needed to look individual test rows up.
key_iter = X_test.index.values

# Number of descriptions in the test split
num_descriptions = X_test["ProjectDescription"].shape[0]
print(num_descriptions)

# The cleaned descriptions get appended to this list one by one
clean_test_description = list()

1712


In [185]:
print("Cleaning and parsing the description...\n")

# Start from an empty list so that re-running this cell does not append a
# second copy of every cleaned description.
clean_test_description = []
num_descriptions = len(X_test)

# The vectorizer and the forest were fit on ProjectDescription text, so the
# test features have to come from the same column. This loop previously read
# ProfileDescription, which fed the model text from a different field than
# the one it was trained on.
for position, raw_description in enumerate(X_test["ProjectDescription"], start=1):
    # Report progress every 500 rows
    if position % 500 == 0:
        print("Preference %d of %d\n" % (position, num_descriptions))
    clean_test_description.append(description_to_words(raw_description))

Cleaning and parsing the description...

Preference 500 of 1712

Preference 1000 of 1712

Preference 1500 of 1712



In [186]:
# Encode the cleaned test descriptions with the already-fitted vocabulary and
# convert the result to a numpy array in one step
test_data_features = vectorizer.transform(clean_test_description).toarray()

# Use the unpickled random forest to predict a Preference label per test row
result = forest_model.predict(test_data_features)

# Collect the results in a pandas dataframe indexed like X_test, with an "id"
# column followed by a "Preference" column
output = pd.DataFrame({"id": X_test["ID"]})
output["Preference"] = result

print(output)

        id Preference
1101  1102          N
8267  8268          N
2605  2606          N
1318  1319          N
3927  3928          N
3544  3545          N
5503  5504          N
3407  3408          N
5962  5963          N
2894  2895          N
4821  4822          N
5524  5525          N
2433  2434          N
3464  3465          N
7588  7589          N
6288  6289          N
249    250          N
6126  6127          N
6092  6093          N
1695  1696          N
6419  6420          N
2289  2290          N
5120  5121          N
4622  4623          N
7531  7532          N
6085  6086          N
7272  7273          N
4389  4390          N
3362  3363          N
5606  5607          N
...    ...        ...
2735  2736          N
5789  5790          N
4265  4266          N
7254  7255          N
6477  6478          N
3840  3841          N
57      58          N
7159  7160          N
7335  7336          N
6743  6744          N
140    141          N
3718  3719          N
6041  6042          N
5182  5183

In [175]:
# Write the predictions to a comma-separated file with pandas. The row index
# is written as the first column, and quoting=3 is csv.QUOTE_NONE (no field
# is wrapped in quotes).
csv_options = {"index": True, "quoting": 3}
output.to_csv("Bag_of_Words_model.csv", **csv_options)