In [5]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords # Import the stop word list
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pickle
from sklearn.metrics import accuracy_score
from matplotlib import pyplot as plt

# One-time setup, only needed when the NLTK stop word corpus is missing:
#   nltk.download('stopwords')

# Read the tab-separated project/profile pairs. The first row is the header,
# the bytes are decoded as latin-1, and quoting=3 (csv.QUOTE_NONE) makes the
# parser treat quote characters as ordinary text.
train = pd.read_csv(
    "projects_profile_match.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
    encoding='latin-1',
)

# Separate the target column from the remaining columns.
y = train["Preference"]
X = train.drop(columns=["Preference"])

# Preview the first rows of the raw frame.
train.head()

Unnamed: 0,ID,ProjectDescription,ProfileDescription,Preference
0,1,We arethe transformation arm of a Fortune 100 ...,"Market Research, Employee Fixed TermResume Exa...",N
1,2,We are a PE firm seeking a junior consulting r...,Director of Market ResearchResume Examples & S...,N
2,3,We are a mid-market private equity firm lookin...,Market Research Senior ManagerResume Examples ...,N
3,4,I am launching a new commercial real estate pl...,"Senior Manager, Corporate Market ResearchResum...",N
4,5,We are a PE-backed consumer products business ...,"HBO Director, Market ResearchResume Examples &...",N


In [6]:
# Split the data 80/20 for training and testing. The split is seeded so that
# a fresh "Restart & Run All" reproduces the same partition, and therefore
# the same downstream predictions and score, on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("\nX_train:\n")
print(X_train.head())
print(X_train.shape)

print("\nX_test:\n")
print(X_test.head())
print(X_test.shape)


X_train:

        ID                                 ProjectDescription  \
7116  7117  We are a startup blockchain/crypto company foc...   
4176  4177  Multinational company in Indonesia is looking ...   
5186  5187  One day while sitting in a very cool coffee sh...   
3330  3331  A global pharmaceutical company is seeking a c...   
1378  1379  I am looking to better understand the regulato...   

                                     ProfileDescription  
7116  Master degree in relevant discipline Lean/Six ...  
4176  Finance / Accounting ManagerResume Examples & ...  
5186  At least 4 years experience in Finance Transfo...  
3330  Finance Accounting ManagerResume Examples & Sa...  
1378  Finance & Accounting ManagerResume Examples & ...  
(6848, 3)

X_test:

        ID                                 ProjectDescription  \
6579  6580  I would like assistance from someone to build ...   
5252  5253  Rhynland LLC, a start-up financial advisory an...   
5695  5696  I am looking for acandi

In [7]:
# Scratch walk-through: clean one training description step by step.
#
# .iloc[0] takes the first row of the shuffled training split by position.
# A label lookup with [0] raises KeyError whenever the row labelled 0 was
# placed in the test split instead.
sample_description = str(X_train["ProjectDescription"].iloc[0])

# Keep only letters; every other character is replaced by a space.
letters_only = re.sub("[^a-zA-Z]",           # The pattern to search for
                      " ",                   # The text to replace it with
                      sample_description)
lower_case = letters_only.lower()        # Convert to lower case
words = lower_case.split()

# Clean up the text: remove stop words from "words". The stop word list is
# loaded once and turned into a set, instead of being reloaded for every
# word inside the comprehension.
english_stops = set(stopwords.words("english"))
words = [w for w in words if w not in english_stops]

In [8]:
# Stop word sets already loaded from NLTK, keyed by language name, so the
# corpus is read once per kernel instead of once per description.
_STOP_WORD_CACHE = {}


def _cached_stop_words(language="english"):
    """Return the NLTK stop words for ``language`` as a frozenset, loading them once."""
    if language not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORD_CACHE[language]


def description_to_words(raw_description, stops=None):
    """Convert a raw description into a string of lower-case, meaningful words.

    Parameters
    ----------
    raw_description : object
        The raw text. Non-string values are converted with ``str()``;
        ``None`` and float NaN (how pandas represents a missing cell) are
        treated as empty text.
    stops : collection of str, optional
        Words to drop. When omitted, the NLTK English stop word list is
        used (loaded once and cached).

    Returns
    -------
    str
        The remaining words joined by single spaces; ``""`` when nothing
        is left or the input is missing.
    """
    # A missing value would otherwise be stringified to "None"/"nan" and
    # leak a bogus token into the bag of words. NaN is the only float that
    # is not equal to itself.
    if raw_description is None or (
        isinstance(raw_description, float) and raw_description != raw_description
    ):
        return ""

    if stops is None:
        stops = _cached_stop_words("english")

    # 1. Replace every non-letter with a space.
    letters_only = re.sub("[^a-zA-Z]", " ", str(raw_description))

    # 2. Lower-case and split into individual words.
    words = letters_only.lower().split()

    # 3. Drop stop words and join the rest back into one string.
    return " ".join(w for w in words if w not in stops)

In [9]:
# Number of entries in the project description column.
num_description = train["ProjectDescription"].size

# Clean every project description, in row order, into a list of strings.
# Note: this walks the full `train` frame, which includes the rows that
# train_test_split placed in X_test.
clean_train_description = [
    description_to_words(train["ProjectDescription"][row])
    for row in range(num_description)
]


In [10]:

from sklearn.feature_extraction.text import CountVectorizer

# scikit-learn's bag-of-words tool: word-level analysis, no custom tokenizer
# or preprocessor, the built-in English stop word list, and a vocabulary
# capped at 2000 features.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words='english',
    max_features=2000,
)

# fit_transform() first learns the vocabulary and then turns the cleaned
# descriptions into feature vectors. It expects a list of strings.
train_data_features = vectorizer.fit_transform(clean_train_description)
print("Created bag of words")
# The features are kept exactly as returned; no array conversion happens in
# this cell.

Created bag of words


In [11]:
# Vocabulary learned by the vectorizer. get_feature_names() was removed in
# scikit-learn 1.2, so prefer get_feature_names_out() when it exists and
# fall back to the old method on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

# Random Forest classifier with 100 trees, seeded so repeated runs build the
# same forest.
forest = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit on the training split only. train_data_features holds one row per row
# of `train` (in the same order), so map the labels of X_train back to row
# positions and select just those rows. Fitting on every row would let the
# forest see the held-out examples and make any score on X_test meaningless.
# If a production model trained on all rows is wanted, refit it separately
# after evaluation.
#
# This may take a few minutes to run
train_rows = train.index.get_indexer(X_train.index)
forest = forest.fit(train_data_features[train_rows], y_train)

# Serialize (pickle) the forest model so that it can be used in the web
# front end to predict values. The with-block closes the file even if
# pickling fails.
with open('forest_pkl', 'wb') as fileObject:
    pickle.dump(forest, fileObject)

In [12]:
# Load the pickled forest back from disk. The with-block guarantees the file
# handle is closed once the model has been read.
# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook or another trusted source.
with open('forest_pkl', 'rb') as fileObject:
    forest_model = pickle.load(fileObject)

# Inspect the held-out split (1712 rows and 3 columns in the recorded run)
print (X_test.shape)
print(X_test.head())

(1712, 3)
        ID                                 ProjectDescription  \
6579  6580  I would like assistance from someone to build ...   
5252  5253  Rhynland LLC, a start-up financial advisory an...   
5695  5696  I am looking for acandidate (must have a demon...   
5091  5092  We are a global private equity firm. We are se...   
6574  6575  Summary:We are requesting aconsulting project ...   

                                     ProfileDescription  
6579  Finance & Accounting ManagerResume Examples & ...  
5252  Finance / Accounting ManagerResume Examples & ...  
5695  Finance Accounting ManagerResume Examples & Sa...  
5091  Finance / Accounting ManagerResume Examples & ...  
6574  Senior Finance & Accounting ManagerResume Exam...  


In [14]:
# The test rows keep their original, shuffled row labels, so collect those
# labels for later lookups into X_test.
key_iter = X_test.index.values

# Empty list that will receive the cleaned test descriptions one by one.
clean_test_description = []

# Number of test rows to process.
num_descriptions = len(X_test["ProjectDescription"])
print(num_descriptions)

1712


In [15]:
print("Cleaning and parsing the description...\n")

# Rebuild the list from scratch inside this cell. Appending to a list that
# was created in another cell meant that re-running this cell duplicated
# every row and broke the later pairing of predictions with X_test["ID"].
clean_test_description = []

# NOTE(review): the forest is fitted on "ProjectDescription" text, but this
# cell cleans "ProfileDescription" -- confirm that predicting from the
# profile text is intended.
test_descriptions = X_test["ProfileDescription"]
total_descriptions = len(test_descriptions)

# Iterate over the values in row order; this visits the same rows as
# looking each one up by its (shuffled) index label.
for position, raw_text in enumerate(test_descriptions, start=1):
    if position % 500 == 0:
        print("Preference %d of %d\n" % (position, total_descriptions))
    clean_test_description.append(description_to_words(raw_text))

Cleaning and parsing the description...

Preference 500 of 1712

Preference 1000 of 1712

Preference 1500 of 1712



In [16]:
# Vectorize the cleaned test text with the already-fitted vocabulary and
# convert the result to a numpy array.
test_data_features = vectorizer.transform(clean_test_description).toarray()

# Predict a Preference label for every test row with the reloaded forest.
result = forest_model.predict(test_data_features)

# Pair each test row's ID (column "id") with its predicted label
# (column "Preference").
output = pd.DataFrame({"id": X_test["ID"], "Preference": result})

print(output)

        id Preference
6579  6580          N
5252  5253          N
5695  5696          N
5091  5092          N
6574  6575          N
5306  5307          N
3896  3897          N
564    565          N
4111  4112          N
5636  5637          N
5955  5956          N
2815  2816          N
6644  6645          N
4920  4921          N
3185  3186          N
8296  8297          N
6700  6701          N
2689  2690          N
532    533          N
6188  6189          N
6061  6062          N
2598  2599          N
4711  4712          N
2957  2958          N
4697  4698          N
6468  6469          N
2459  2460          N
4490  4491          N
7666  7667          N
606    607          N
...    ...        ...
3399  3400          N
4928  4929          N
8042  8043          N
6698  6699          N
5162  5163          N
7710  7711          N
2844  2845          N
7357  7358          N
5980  5981          N
5295  5296          N
1269  1270          N
5215  5216          N
5840  5841          N
4379  4380

In [None]:
# Score the predictions made above against the held-out labels.
# forest.score(X_test, y_test) cannot work here: X_test is the raw frame of
# ID and text columns, not the bag-of-words features the forest was fitted on.
print('Score:', accuracy_score(y_test, result))

# Plot the predicted labels against the true labels.
plt.scatter(y_test, result)
plt.title('Predicted vs. true Preference')
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.show()

In [175]:
# Write the predictions to a comma-separated file. index=True keeps the row
# labels as the first column; quoting=3 (csv.QUOTE_NONE) writes every field
# without surrounding quotes.
output.to_csv(
    "test_validation_output.csv",
    index=True,
    quoting=3,
)