# Compare NLP Techniques: Build Model On TF-IDF Vectors

### Read In Cleaned Text

In [11]:
# Load the cleaned training and test sets
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Read the four CSV splits in one statement; the names on the left are bound in
# the same order as the paths listed below.
X_train, X_test, y_train, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../../data/X_train.csv',
        '../../../data/X_test.csv',
        '../../../data/y_train.csv',
        '../../../data/y_test.csv',
    )
)

# Preview the first rows of the training features
X_train.head()

Unnamed: 0,clean_text
0,"['ill', 'see', 'prolly', 'yeah']"
1,"['hi', 'darlin', 'ive', 'got', 'back', 'really..."
2,"['wait', '4', 'u', 'lor', 'need', '2', 'feel',..."
3,"['dont', 'know', 'u', 'u', 'dont', 'know', 'se..."
4,"['dont', 'use', 'hook']"


### Create TF-IDF Vectors

In [12]:
# Build the TF-IDF representation: learn the vocabulary and IDF weights from the
# training messages only, then reuse that fitted vectorizer on the test messages
# so both matrices share the same columns.
# NOTE(review): the clean_text values previewed earlier look like stringified
# token lists; presumably the vectorizer's default tokenizer re-splits them —
# confirm that this (and its handling of one-character tokens) is intended.
tfidf_vect = TfidfVectorizer()
X_train_vect = tfidf_vect.fit_transform(X_train['clean_text'])
X_test_vect = tfidf_vect.transform(X_test['clean_text'])

In [13]:
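# What words did the vectorizer learn? Uncomment the line below to display the
# learned vocabulary (a mapping from each term to its column index).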
# tfidf_vect.vocabulary_

In [14]:
# How are these vectors stored?
# Indexing a single row keeps the result sparse: the repr reports the row's
# shape, its dtype, and how many elements are actually stored.
X_test_vect[0]

<1x8229 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [15]:
# Can we convert the vectors to arrays?
# .toarray() materializes the sparse row as a dense NumPy array, zeros included.
X_test_vect[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

### Fit RandomForestClassifier On Top Of Vectors

In [16]:
# Fit a basic Random Forest model on these vectors
from sklearn.ensemble import RandomForestClassifier

# Random forests are stochastic (bootstrap samples and random feature subsets),
# so fix the seed to make the fitted model, and the metrics computed from it,
# repeatable across Restart & Run All.
rf = RandomForestClassifier(random_state=42)

# .values.ravel() flattens the label frame into the 1-D array that fit() expects.
# fit() returns the estimator itself, so rf_model refers to the same object as rf.
rf_model = rf.fit(X_train_vect, y_train.values.ravel())

In [19]:
# Use the trained model to make predictions on the test data
# X_test_vect was produced by the vectorizer fitted on the training set, so its
# columns line up with the features the forest was trained on.
y_pred = rf_model.predict(X_test_vect)

In [20]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Use the same 1-D 'label' column as ground truth for every metric, instead of
# passing the whole frame to some scorers and the column to others.
y_true = y_test['label']

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
# accuracy_score is the fraction of predictions that match the true label,
# i.e. the same quantity as (y_pred == y_true).sum() / len(y_pred).
accuracy = accuracy_score(y_true, y_pred)

print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round(accuracy, 3)))

Precision: 1.0 / Recall: 0.787 / Accuracy: 0.973
