# Vectorizing: Representing text as numerical data

In [1]:
import pandas as pd 

In [2]:
# Toy corpus: five short messages used to demonstrate vectorization.
X_train = [
    'call you tonight',
    'Call me a cab',
    'please call me... PLEASE',
    'he called the police',
    'Ive fallen and I cant get up',
]

In [3]:
# import the vectorizer and create one with its default settings
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [4]:
# learn the 'vocabulary' of the training data (the set of distinct tokens it contains)
vect.fit(X_train)

In [5]:
# examine the fitted vocabulary: tokens are lowercased ('Call' and 'call' share one entry)
# and the one-letter words 'a' and 'I' from the corpus do not appear
vect.get_feature_names_out()

array(['and', 'cab', 'call', 'called', 'cant', 'fallen', 'get', 'he',
       'ive', 'me', 'please', 'police', 'the', 'tonight', 'up', 'you'],
      dtype=object)

In [6]:
# transform training data into a 'document-term matrix':
# one row per document, one column per vocabulary term, each cell a term count
X_train_dtm = vect.transform(X_train)

In [7]:
# show the document-term matrix as a dense array of counts
X_train_dtm.toarray()

array([[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1],
       [0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0],
       [1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0]], dtype=int64)

In [8]:
# Label the document-term matrix: rows are the documents, columns are the vocabulary terms.
pd.DataFrame(
    data=X_train_dtm.toarray(),
    index=X_train,
    columns=vect.get_feature_names_out(),
)

Unnamed: 0,and,cab,call,called,cant,fallen,get,he,ive,me,please,police,the,tonight,up,you
call you tonight,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1
Call me a cab,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0
please call me... PLEASE,0,0,1,0,0,0,0,0,0,1,2,0,0,0,0,0
he called the police,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,0
Ive fallen and I cant get up,1,0,0,0,1,1,1,0,1,0,0,0,0,0,1,0


In [9]:
# example: vectorize an unseen document with the vocabulary learned from X_train;
# only 'call', 'me' and 'please' are counted -- tokens outside the fitted vocabulary are dropped
X_test = ["please don't call me"]
X_test_dtm = vect.transform(X_test)
X_test_dtm.toarray()

array([[0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0]], dtype=int64)

# Tuning the vectorizer

In [10]:
# ignore stop words (the, they, he, a, then, ...) from the built-in 'english' list
vect = CountVectorizer(stop_words='english')

In [11]:
# refit on the training data and inspect which terms survive stop-word removal
vect.fit(X_train)
vect.get_feature_names_out()

array(['cab', 'called', 'fallen', 'ive', 'police', 'tonight'],
      dtype=object)

In [12]:
# list of stop words used by stop_words='english'
# (imported from the public `text` module rather than the private `_stop_words` module,
# whose name and location are not part of scikit-learn's stable API)
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
print(ENGLISH_STOP_WORDS)

frozenset({'keep', 'besides', 'wherever', 'now', 'former', 'your', 'ten', 'per', 'get', 'put', 'about', 'are', 'yours', 'indeed', 'were', 'hereupon', 'sometimes', 'again', 'such', 'me', 'thick', 'through', 'moreover', 'you', 'may', 'co', 'one', 'himself', 'neither', 'almost', 'how', 'being', 'three', 'anyway', 'first', 'made', 'its', 'our', 'becomes', 'couldnt', 'own', 'please', 'would', 'nor', 'with', 'of', 'afterwards', 'enough', 'has', 'toward', 'also', 'everywhere', 'why', 'none', 'even', 'do', 'five', 'around', 'she', 'onto', 'and', 'bill', 'because', 'top', 'found', 'there', 'thru', 'their', 'under', 'further', 'describe', 'not', 'both', 'detail', 'mine', 'those', 'less', 'them', 'two', 'else', 'at', 'hence', 'never', 'without', 'whose', 'we', 'my', 'side', 'forty', 'four', 'anyone', 'that', 'whatever', 'several', 'he', 'this', 'via', 'show', 'de', 'twelve', 'perhaps', 'then', 'anywhere', 'somewhere', 'by', 'fire', 'same', 'myself', 'noone', 'empty', 'give', 'below', 'thereby', '

In [13]:
# ignore terms that appear in more than 80% of the documents
vect = CountVectorizer(max_df = 0.8)

In [14]:
# ignore terms that appear in fewer than 20% of the documents
vect = CountVectorizer(min_df = 0.2)

In [58]:
# include single words (unigrams) and pairs of consecutive words (bigrams):
# ngram_range=(min_n, max_n) sets the shortest and longest n-gram, measured in words
# NOTE(review): the saved output lists newsgroup tokens, so this cell was apparently last run
# after X_train had been reassigned further down; run top-to-bottom it fits the toy corpus instead.
vect = CountVectorizer(ngram_range=(1,2))
vect.fit(X_train)
vect.get_feature_names_out()

array(['00', '00 00', '00 000', ..., 'zz uh', 'zzc6', 'zzc6 5hnilm3y'],
      dtype=object)

# Example

In [17]:
from sklearn.datasets import fetch_20newsgroups

In [18]:
# Load four newsgroups as (documents, integer labels), with headers, footers
# and quoted replies stripped from every post.
X, y = fetch_20newsgroups(
    subset='all',
    categories=['comp.windows.x', 'sci.space', 'rec.autos', 'rec.sport.baseball'],
    remove=['headers', 'footers', 'quotes'],
    return_X_y=True,
)

In [33]:
# Short display name for each integer topic label.
topic_dic = {
    0: 'windows',
    1: 'autos',
    2: 'baseball',
    3: 'space',
}

In [37]:
# Peek at one document labelled 3 ('space' in topic_dic) to sanity-check the label mapping.
# Reads X and y directly so this cell does not depend on the `data` frame,
# which is only built in a later cell (the original failed on a fresh top-to-bottom run).
space_docs = [doc for doc, label in zip(X, y) if label == 3]
print(space_docs[65])

hi, is there anybody has some example programs about using
 the internationalization features in X11R5 ? Such as a small
 X program just to show Chinese texts in wondows, menu bar or
 icons... Thanks in advance.


In [30]:
# Collect the documents and their integer topic labels in a single frame.
data = pd.DataFrame({'document': X, 'topic': y})
data

Unnamed: 0,document,topic
0,And one of my profs is the chief engineer for ...,3
1,"Enclosed are the rules, guidelines and related...",0
2,\nI grew up listening to Harry Carey call the ...,2
3,Original to: szabo@techbook.com\nG'day szabo@t...,3
4,\n\n\n\nTry the 'M.Sc. Computing Science' cour...,0
...,...,...
3954,\n\n\n\n\n Anaheim.,2
3955,l\n\n\ndiamond star cars (Talon/Eclipse/Laser)...,1
3956,Update your 385 to HP-UX 9.0. You get an R5 s...,0
3957,\nI disagree. It think the average joe is int...,3


In [21]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split as tts

In [38]:
# Hold out a test set with the default split size. A fixed random_state makes the split,
# and therefore every score and coefficient computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = tts(X, y, random_state=42)

In [39]:
# Bag-of-words counts feeding a logistic-regression classifier.
pipe = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('clf', LogisticRegression(max_iter=5000)),
])

In [40]:
# fit both pipeline steps (vectorizer, then classifier) on the training split
pipe.fit(X_train, y_train)

In [42]:
# number of distinct terms learned by the pipeline's fitted vectorizer
len(pipe.named_steps['vectorizer'].get_feature_names_out())

33243

In [43]:
# predicted topic labels for the held-out test documents
y_test_pred = pipe.predict(X_test)

In [46]:
# compare true and predicted labels on the test set; the diagonal holds the correct predictions
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_test_pred)

array([[199,  12,   5,  11],
       [  4, 240,  14,  11],
       [  7,  15, 220,  10],
       [  8,  35,  10, 189]], dtype=int64)

In [47]:
from sklearn.model_selection import GridSearchCV

In [53]:
# Hyper-parameter grid for the 'vectorizer' step of the pipeline
# (the 'vectorizer__' prefix routes each setting to that step).
# Every max_df value is a float so it is read as a proportion of documents; an integer 1
# would be read as an absolute count and drop every term found in more than one document.
param_dic = {'vectorizer__stop_words': [None, 'english'],
             'vectorizer__ngram_range': [(1, 1), (1, 2)],
             'vectorizer__max_df': [0.5, 0.7, 0.9, 1.0],
             'vectorizer__max_features': [50000, 25000, 10000]}

In [54]:
# 5-fold cross-validated search over every combination in param_dic;
# n_jobs=-1 runs the fits in parallel and verbose=2 prints progress
grid = GridSearchCV(pipe,param_dic,cv=5, n_jobs=-1,verbose=2)
grid.fit(X_train,y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [55]:
# parameter combination selected by the grid search
grid.best_params_

{'vectorizer__max_df': 0.5,
 'vectorizer__max_features': 50000,
 'vectorizer__ngram_range': (1, 1),
 'vectorizer__stop_words': 'english'}

In [56]:
# evaluate the best pipeline found by the grid search on the held-out test set
best_pipe = grid.best_estimator_
y_test_pred = best_pipe.predict(X_test)
confusion_matrix(y_test, y_test_pred)

array([[211,   6,   4,   6],
       [  5, 244,  11,   9],
       [  3,  19, 225,   5],
       [  8,  31,   7, 196]], dtype=int64)

In [57]:
# Hand-written sentences for trying the model on text it has never seen.
X_new_data = [
    'I always wanted to be an astronaut',
    'I hate windows 10',
    'I need to clean the window of my spaceship',
]

In [59]:
# predicted integer topic label for each new sentence (decode with topic_dic)
best_pipe.predict(X_new_data)

array([1, 0, 0], dtype=int64)

In [60]:
# label -> topic name lookup, for reading the predicted labels
topic_dic

{0: 'windows', 1: 'autos', 2: 'baseball', 3: 'space'}

# How does the model make predictions?

In [63]:
# weights of the fitted classifier step; the shape shows one row per class
# and one column per vocabulary term
coefficients = best_pipe['clf'].coef_
coefficients.shape

(4, 32942)

In [65]:
# data frame with the model coefficients, transposed so rows are terms and columns are topics
# NOTE(review): the column names are hard-coded and assumed to follow the classifier's class
# order (labels 0-3, as in topic_dic) -- confirm against best_pipe['clf'].classes_
coef_df = pd.DataFrame(coefficients.T, 
                       index=best_pipe['vectorizer'].get_feature_names_out(), 
                       columns=['windows', 'auto', 'baseball', 'space'])


In [68]:
# the 20 terms with the largest coefficient for the 'windows' class
coef_df['windows'].sort_values(ascending=False).iloc[:20]

subscribe      1.574835
windows        1.189081
subscrive      1.182666
server         1.138047
widgets        1.065492
motif          1.020184
window         0.986640
mit            0.956966
using          0.952933
resource       0.951108
use            0.907872
instead        0.877964
x11r5          0.876972
xterm          0.875368
unsubscribe    0.865879
group          0.852355
user           0.852157
widget         0.825559
hi             0.820401
library        0.799044
Name: windows, dtype: float64

In [69]:
# the 20 terms with the largest coefficient for the 'space' class
coef_df['space'].sort_values(ascending=False).iloc[:20]

space         1.884526
orbit         1.068594
solar         1.024463
launch        0.861680
moon          0.812567
nasa          0.809850
spacecraft    0.726787
shuttle       0.704097
sorry         0.672211
earth         0.653797
working       0.652763
project       0.648973
sci           0.633984
test          0.629273
message       0.602840
phase         0.590551
ironic        0.586404
data          0.564916
aircraft      0.558477
landing       0.556546
Name: space, dtype: float64

In [70]:
# the 20 terms with the largest coefficient for the 'baseball' class
coef_df['baseball'].sort_values(ascending=False).iloc[:20]

baseball    1.324158
team        1.108078
game        1.062747
games       1.037789
cubs        1.012431
stats       0.798234
ball        0.771772
writes      0.765938
players     0.755624
play        0.741681
stadium     0.730662
hit         0.701332
year        0.682144
say         0.678206
season      0.676756
league      0.658766
ticket      0.632123
colorado    0.626883
woof        0.596778
pitching    0.593925
Name: baseball, dtype: float64

In [71]:
# the 20 terms with the largest coefficient for the 'auto' class
coef_df['auto'].sort_values(ascending=False).iloc[:20]

car        1.998288
cars       1.187249
honda      1.020485
dealer     0.862804
engine     0.820716
ford       0.787885
oil        0.767935
gt         0.757912
auto       0.743021
flat       0.722152
toyota     0.721039
saab       0.687771
mustang    0.666619
ites       0.626086
bmw        0.613155
people     0.609870
miles      0.603414
radar      0.599470
ones       0.595621
tires      0.587623
Name: auto, dtype: float64

# Naive Bayes Model