# Word representation using Scikit-learn

In [1]:
import numpy as np

# Toy corpus: four short sentences, one document per array element.
corpus_sentences = (
    'this is the first document',
    'this document is the second document',
    'this is the third one not the first nor the third',
    'is this the first document or is is another document',
)
data = np.array(corpus_sentences)

# Display the corpus as the cell output.
data

array(['this is the first document',
       'this document is the second document',
       'this is the third one not the first nor the third',
       'is this the first document or is is another document'],
      dtype='<U52')

## I. Vectorization

### I.1. Term Frequency (TF)

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary of the corpus and count term occurrences per document.
tf_vectorizer = CountVectorizer()
TF = tf_vectorizer.fit_transform(data)

# The columns of TF follow the order of this vocabulary listing.
tf_vocabulary = tf_vectorizer.get_feature_names_out()
print(tf_vocabulary)

# Convert to a dense array for display (one row per document).
TF.toarray()

['another' 'document' 'first' 'is' 'nor' 'not' 'one' 'or' 'second' 'the'
 'third' 'this']


array([[0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1],
       [0, 2, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1],
       [0, 0, 1, 1, 1, 1, 1, 0, 0, 3, 2, 1],
       [1, 2, 1, 3, 0, 0, 0, 1, 0, 1, 0, 1]])

### I.2. Term Frequency - Inverse Document Frequency (TF-IDF)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same corpus as before, but weighted with TF-IDF instead of raw counts.
tfidf_vectorizer = TfidfVectorizer()
TFIDF = tfidf_vectorizer.fit_transform(data)

# The columns of TFIDF follow the order of this vocabulary listing.
tfidf_vocabulary = tfidf_vectorizer.get_feature_names_out()
print(tfidf_vocabulary)

# Convert to a dense array for display (one row per document).
TFIDF.toarray()

['another' 'document' 'first' 'is' 'nor' 'not' 'one' 'or' 'second' 'the'
 'third' 'this']


array([[0.        , 0.49967281, 0.49967281, 0.40851526, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.40851526,
        0.        , 0.40851526],
       [0.        , 0.6876236 , 0.        , 0.28108867, 0.        ,
        0.        , 0.        , 0.        , 0.53864762, 0.28108867,
        0.        , 0.28108867],
       [0.        , 0.        , 0.19789669, 0.16179351, 0.3100434 ,
        0.3100434 , 0.3100434 , 0.        , 0.        , 0.48538052,
        0.6200868 , 0.16179351],
       [0.37708861, 0.48138155, 0.24069077, 0.59034144, 0.        ,
        0.        , 0.        , 0.37708861, 0.        , 0.19678048,
        0.        , 0.19678048]])

### I.3. Latent Semantic Analysis (LSA)

In [4]:
from sklearn.decomposition import TruncatedSVD

# Fit a truncated SVD on the raw term counts (LSA). fit() returns the
# estimator itself, so construction and fitting are chained.
# NOTE(review): 5 components are requested, yet the recorded output lists only
# 4 values -- presumably capped by the 4 documents in TF; confirm.
svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42).fit(TF)

# Report, in order: the variance ratio per component, their total, and the
# singular values.
variance_ratio = svd.explained_variance_ratio_
for summary in (variance_ratio, variance_ratio.sum(), svd.singular_values_):
    print(summary)

[0.1223226  0.68092366 0.15434875 0.04240499]
0.9999999999999998
[5.91216815 3.40420027 1.6491539  0.85905745]


In [5]:
# Reduce TF with the SVD fitted in the previous cell; each output row
# corresponds to one document.
svd.transform(TF)

array([[ 2.0601922 , -0.30909615,  0.15622694,  0.7972834 ],
       [ 2.26591799, -0.94003037,  1.39040475, -0.22075592],
       [ 3.37398661,  2.75685491, -0.05652745, -0.11300441],
       [ 3.76711697, -1.73468527, -0.87113775, -0.20202899]])

## II. Parameters

### II.1. Reading

In [6]:
# read many documents
from sklearn.feature_extraction.text import CountVectorizer

# Paths of the text files to vectorize, one document per file.
files_list = ['docs/doc1.txt', 'docs/doc2.txt', 'docs/doc3.txt']

# input='filename' makes the vectorizer read the content of each path itself.
tf_vectorizer_docs = CountVectorizer(input='filename')
TF_docs = tf_vectorizer_docs.fit_transform(files_list)

# The columns of TF_docs follow the order of this vocabulary listing.
docs_vocabulary = tf_vectorizer_docs.get_feature_names_out()
print(docs_vocabulary)

# Convert to a dense array for display (one row per file).
TF_docs.toarray()

['also' 'another' 'are' 'be' 'can' 'contains' 'document' 'documents'
 'have' 'is' 'it' 'list' 'long' 'of' 'or' 'other' 'sentences' 'short'
 'some' 'these' 'this' 'very']


array([[0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 2, 1, 1, 1, 1, 1],
       [1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 2, 1, 0, 1, 1, 0],
       [0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 2, 1, 0, 1, 0, 0]])

In [7]:
# read documents from already-open file objects (one document per file object)
from sklearn.feature_extraction.text import CountVectorizer

# input='file' tells the vectorizer that every item it receives is a file-like
# object whose read() method supplies the document text.
tf_vectorizer_file = CountVectorizer(input='file')

# Open both files inside a single context manager so the handles are closed
# as soon as the vectorizer has consumed them, even if fitting raises.
with open('docs/doc1.txt', 'r') as f1, open('docs/doc2.txt', 'r') as f2:
    TF_file = tf_vectorizer_file.fit_transform([f1, f2])

print(tf_vectorizer_file.get_feature_names_out())

# Convert to a dense array for display (one row per file object).
TF_file.toarray()

['also' 'another' 'are' 'contains' 'document' 'is' 'it' 'other'
 'sentences' 'short' 'some' 'these' 'this' 'very']


array([[0, 0, 1, 1, 1, 1, 1, 0, 2, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 0, 1, 1, 0]])

In [8]:
# read a file where each line is a document
from sklearn.feature_extraction.text import CountVectorizer

tf_vectorizer_fileln = CountVectorizer()

# Iterating over a text file handle yields one line at a time, so the
# vectorizer receives each line as a separate document. The context manager
# closes the handle once fitting is done, even if fitting raises.
with open('docs/docln.txt', 'r') as f:
    TF_fileln = tf_vectorizer_fileln.fit_transform(f)

print(tf_vectorizer_fileln.get_feature_names_out())

# Convert to a dense array for display (one row per line of the file).
TF_fileln.toarray()


['be' 'can' 'document' 'documents' 'each' 'have' 'in' 'is' 'line' 'list'
 'long' 'of' 'or' 'sentences' 'short' 'these']


array([[0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1],
       [0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]])

### II.2. Preprocessing

In [9]:
# tokenization
from sklearn.feature_extraction.text import CountVectorizer

# Documents whose words are separated by '#'. Some words are followed by a
# stray space ('AM ' in the first document, 'I ' in the third).
data_sharp = [
    'I#AM #VERY#TIRED',
    'I#AM#GOING#TO#BUY#TEA',
    'I #HOPE#ABDELKADER#STILL#HAS#TEA'
]

def sharp_tokenizer(text):
    """Split ``text`` into tokens on every '#' character.

    Whitespace inside a token is kept as-is, so 'I#AM #VERY' yields
    ['I', 'AM ', 'VERY'].
    """
    separator = '#'
    tokens = text.split(separator)
    return tokens

# A custom tokenizer is applied only when analyzer == 'word'.
# lowercase=False deactivates the automatic lowercasing, so tokens keep
# their original case.
token_vectorizer = CountVectorizer(lowercase=False, tokenizer=sharp_tokenizer)

TF_token = token_vectorizer.fit_transform(data_sharp)

# The columns of TF_token follow the order of this vocabulary listing.
sharp_vocabulary = token_vectorizer.get_feature_names_out()
print(sharp_vocabulary)

# Convert to a dense array for display (one row per document).
TF_token.toarray()

['ABDELKADER' 'AM' 'AM ' 'BUY' 'GOING' 'HAS' 'HOPE' 'I' 'I ' 'STILL' 'TEA'
 'TIRED' 'TO' 'VERY']


array([[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1],
       [0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0],
       [1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0]])

In [10]:
# preprocessing
from sklearn.feature_extraction.text import CountVectorizer


def sharp_preprocessor(text):
    """Return ``text`` with every space character (' ') removed.

    Only the plain space is dropped; other whitespace such as tabs or
    newlines is left untouched.
    """
    pieces = text.split(' ')
    return ''.join(pieces)

# The preprocessor receives each raw document before the tokenizer does; here
# it removes the stray spaces so that 'AM ' and 'AM' become the same token.
# A custom preprocessor replaces the built-in preprocessing stage (lowercasing
# included), which is why the tokens in the recorded output stay upper-case.
# As before, the tokenizer is applied only when analyzer == 'word'.
token_vectorizer = CountVectorizer(tokenizer=sharp_tokenizer, preprocessor=sharp_preprocessor)

TF_token = token_vectorizer.fit_transform(data_sharp)

# The columns of TF_token follow the order of this vocabulary listing.
clean_vocabulary = token_vectorizer.get_feature_names_out()
print(clean_vocabulary)

# Convert to a dense array for display (one row per document).
TF_token.toarray()

['ABDELKADER' 'AM' 'BUY' 'GOING' 'HAS' 'HOPE' 'I' 'STILL' 'TEA' 'TIRED'
 'TO' 'VERY']


array([[0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1],
       [0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0],
       [1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0]])

In [11]:
# predefined stop-words
from sklearn.feature_extraction.text import CountVectorizer

# stop_words='english' selects scikit-learn's built-in English stop-word list.
tf_vectorizer_sw = CountVectorizer(stop_words='english')
TF_sw = tf_vectorizer_sw.fit_transform(data)

# Vocabulary that remains once the stop words have been filtered out.
english_sw_vocabulary = tf_vectorizer_sw.get_feature_names_out()
print(english_sw_vocabulary)

# Convert to a dense array for display (one row per document).
TF_sw.toarray()

['document' 'second']


array([[1, 0],
       [2, 1],
       [0, 0],
       [2, 0]])

In [12]:
# given stop-words
from sklearn.feature_extraction.text import CountVectorizer

# Hand-picked stop words to drop from the vocabulary.
sw_list = ['the', 'this', 'or', 'nor', 'not', 'is']

# Passing a list to stop_words uses exactly these words as the stop-word list.
tf_vectorizer_sw = CountVectorizer(stop_words=sw_list)
TF_sw = tf_vectorizer_sw.fit_transform(data)

# Vocabulary that remains once the listed words have been filtered out.
custom_sw_vocabulary = tf_vectorizer_sw.get_feature_names_out()
print(custom_sw_vocabulary)

# Convert to a dense array for display (one row per document).
TF_sw.toarray()

['another' 'document' 'first' 'one' 'second' 'third']


array([[0, 1, 1, 0, 0, 0],
       [0, 2, 0, 0, 1, 0],
       [0, 0, 1, 1, 0, 2],
       [1, 2, 1, 0, 0, 0]])

## III. Similarity


In [13]:
from sklearn.metrics.pairwise import cosine_similarity

# If we pass a single (document x term) matrix, every row is compared with
# every other row and we get a (document x document) similarity matrix.
Sim = cosine_similarity(TF)

# Each document has similarity 1 with itself, hence the ones on the diagonal.
# The matrix can be read as a complete graph whose edges are weighted by the
# pairwise similarities.
Sim

array([[1.        , 0.79056942, 0.61558701, 0.84327404],
       [0.79056942, 1.        , 0.40555355, 0.75      ],
       [0.61558701, 0.40555355, 1.        , 0.43259046],
       [0.84327404, 0.75      , 0.43259046, 1.        ]])