# Compare NLP Techniques: Build Model On TF-IDF Vectors

### Read In Cleaned Text

In [2]:
# Load the cleaned training and test sets
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Read the four pre-split CSV files in one pass; the order of the paths below
# must match the order of the names on the left-hand side
X_train, X_test, y_train, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../../data/X_train.csv',
        '../../../data/X_test.csv',
        '../../../data/y_train.csv',
        '../../../data/y_test.csv',
    )
)

# Preview the first few training rows
X_train.head()

Unnamed: 0,clean_text
0,"['going', 'thru', 'different', 'feelingwaverin..."
1,"['may', 'call', 'later', 'pls']"
2,"['think', 'û', 'waiting', 'bus', 'inform', 'ge..."
3,"['aight', 'well', 'keep', 'informed']"
4,"['u', 'call']"


### Create TF-IDF Vectors

In [4]:
# Learn the vocabulary and IDF weights from the training messages only, then
# reuse that one fitted vectorizer for both splits so they share a single
# feature space (fit() returns the vectorizer itself, so it can be chained)
tfidf_vect = TfidfVectorizer().fit(X_train['clean_text'])
X_train_vect, X_test_vect = (
    tfidf_vect.transform(split['clean_text']) for split in (X_train, X_test)
)

In [6]:
# What words did the vectorizer learn?
# vocabulary_ is a dict mapping each learned token to an integer feature index
# (the token's column in the vectors produced by transform)
tfidf_vect.vocabulary_

{'going': 3313,
 'thru': 7279,
 'different': 2429,
 'feelingwavering': 2932,
 'decisions': 2318,
 'coping': 2095,
 'individualtime': 3836,
 'heal': 3509,
 'everything': 2793,
 'believe': 1370,
 'may': 4623,
 'call': 1664,
 'later': 4240,
 'pls': 5572,
 'think': 7246,
 'waiting': 7775,
 'bus': 1623,
 'inform': 3842,
 'get': 3251,
 'ever': 2785,
 'aight': 939,
 'well': 7871,
 'keep': 4092,
 'informed': 3844,
 '2p': 451,
 'per': 5457,
 'min': 4727,
 'germany': 3250,
 '08448350055': 60,
 'bt': 1589,
 'line': 4340,
 'check': 1831,
 'planettalkinstantcom': 5547,
 'info': 3841,
 'ts': 7466,
 'cs': 2178,
 'text': 7186,
 'stop': 6882,
 'opt': 5276,
 'talk': 7091,
 'oh': 5204,
 'ok': 5211,
 'wats': 7827,
 'ur': 7607,
 'email': 2691,
 'tell': 7153,
 'stay': 6842,
 'yeah': 8160,
 'tough': 7403,
 'optimistic': 5278,
 'things': 7245,
 'improve': 3808,
 'month': 4833,
 'im': 3787,
 'glad': 3284,
 'didnt': 2417,
 'find': 2981,
 'totally': 7399,
 'disagreeable': 2457,
 'lol': 4399,
 'sleepingand': 6577

In [7]:
# How are these vectors stored?
# Indexing a single row shows the container type: the result is a sparse
# matrix that stores only the non-zero TF-IDF weights for that message
X_train_vect[0]

<1x8259 sparse matrix of type '<class 'numpy.float64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [11]:
# Can we convert the vectors to arrays?
# toarray() expands the sparse row into a dense NumPy array with an explicit
# entry (mostly zeros) for every vocabulary term
X_train_vect[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

### Fit RandomForestClassifier On Top Of Vectors

In [12]:
# Fit a basic Random Forest model on these vectors
from sklearn.ensemble import RandomForestClassifier

# A random forest bootstraps rows and samples candidate features at random, so
# without a fixed seed every re-run produces a slightly different model and
# different metrics. Pinning random_state makes the fit reproducible.
rf = RandomForestClassifier(random_state=42)

# y_train is a DataFrame read from CSV (presumably a single label column);
# .values.ravel() flattens it to the 1-D array that fit() expects
rf_model = rf.fit(X_train_vect, y_train.values.ravel())



In [15]:
# Use the trained model to make predictions on the test data
# (X_test_vect must come from the same fitted vectorizer as the training
# vectors so that its columns match the features the model was trained on)
y_pred = rf_model.predict(X_test_vect)

In [16]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Select the label column once so that all three metrics are scored against
# the same 1-D target instead of mixing the whole DataFrame and one column
y_true = y_test['label']

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
# accuracy_score replaces the hand-rolled (matches / total) computation
accuracy = accuracy_score(y_true, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round(accuracy, 3)))

Precision: 0.991 / Recall: 0.778 / Accuracy: 0.97
