# Compare NLP Techniques: Build Model On doc2vec Vectors

### Read In Cleaned Text

In [3]:
# Load the cleaned training and test sets
import gensim
import numpy as np
import pandas as pd

X_train = pd.read_csv('../../../data/X_train.csv')
X_test = pd.read_csv('../../../data/X_test.csv')
y_train = pd.read_csv('../../../data/y_train.csv')
y_test = pd.read_csv('../../../data/y_test.csv')

### Create doc2vec Vectors

In [4]:
# Created TaggedDocument vectors for each text message in the training and test sets
tagged_docs_train = [gensim.models.doc2vec.TaggedDocument(v, [i])
                     for i, v in enumerate(X_train['clean_text'])]
tagged_docs_test = [gensim.models.doc2vec.TaggedDocument(v, [i])
                    for i, v in enumerate(X_test['clean_text'])]

In [10]:
# What do these TaggedDocument objects look like?
tagged_docs_train[:10]

[TaggedDocument(words="['going', 'thru', 'different', 'feelingwavering', 'decisions', 'coping', 'individualtime', 'heal', 'everything', 'believe']", tags=[0]),
 TaggedDocument(words="['may', 'call', 'later', 'pls']", tags=[1]),
 TaggedDocument(words="['think', 'û', 'waiting', 'bus', 'inform', 'get', 'ever', 'get']", tags=[2]),
 TaggedDocument(words="['aight', 'well', 'keep', 'informed']", tags=[3]),
 TaggedDocument(words="['u', 'call']", tags=[4]),
 TaggedDocument(words="['2p', 'per', 'min', 'call', 'germany', '08448350055', 'bt', 'line', '2p', 'per', 'min', 'check', 'planettalkinstantcom', 'info', 'ts', 'cs', 'text', 'stop', 'opt']", tags=[5]),
 TaggedDocument(words="['talk', 'g', 'x']", tags=[6]),
 TaggedDocument(words="['oh', 'ok', 'wats', 'ur', 'email']", tags=[7]),
 TaggedDocument(words="['ok', 'tell', 'stay', 'yeah', 'tough', 'optimistic', 'things', 'improve', 'month']", tags=[8]),
 TaggedDocument(words="['well', 'im', 'glad', 'didnt', 'find', 'totally', 'disagreeable', 'lol']", 

In [11]:
# Train a basic doc2vec model
d2v_model = gensim.models.Doc2Vec(tagged_docs_train,
                                  vector_size=100,
                                  window=5,
                                  min_count=2)

In [14]:
# Infer the vectors to be used in training and testing
train_vectors = [d2v_model.infer_vector(eval(v.words)) for v in tagged_docs_train]
test_vectors = [d2v_model.infer_vector(eval(v.words)) for v in tagged_docs_test]   

### Fit RandomForestClassifier On Top Of Document Vectors

In [15]:
# Fit a basic model, make predictions on the holdout test set, and the generate the evaluation metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score

rf = RandomForestClassifier()
rf_model = rf.fit(train_vectors, y_train.values.ravel())

y_pred = rf_model.predict(test_vectors)

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test['label']).sum()/len(y_pred), 3)))



Precision: 0.81 / Recall: 0.354 / Accuracy: 0.906
