In [26]:
c = { \
'Lincoln1865':
'With malice toward none, with charity for all ...' +
'let us strive on to finish the work we are in ... ' +
'to do all which may achieve and cherish a just and lasting peace, ' +
'among ourselves, and with all nations.',

'TrumpMay26':
'There is NO WAY (ZERO!) that Mail-In Ballots ' +
'will be anything less than substantially fraudulent.',

'Wikipedia':
'In 1998, Oregon became the first state in the US ' +
'to conduct all voting exclusively by mail.',

'FortuneMay26':
'Over the last two decades, about 0.00006% of total ' +
'vote-by-mail votes cast were fraudulent.',

'TheHillApr07':
'Trump voted by mail in the Florida primary.',

'KingJamesBible':
'Wherefore laying aside all malice, and all guile, and ' +
'hypocrisies, and envies, and all evil speakings',
}


In [19]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import spacy

In [20]:
# Create a CountVectorizer object
vectorizer = CountVectorizer()

# Fit and transform the text data
X = vectorizer.fit_transform(c.values())

# Create a DataFrame from the term-document matrix
term_document_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out(), index=c.keys())

# Display the DataFrame
print(term_document_df)

                00006  1998  about  achieve  all  among  and  anything  are  \
Lincoln1865         0     0      0        1    3      1    3         0    1   
TrumpMay26          0     0      0        0    0      0    0         1    0   
Wikipedia           0     1      0        0    1      0    0         0    0   
FortuneMay26        1     0      1        0    0      0    0         0    0   
TheHillApr07        0     0      0        0    0      0    0         0    0   
KingJamesBible      0     0      0        0    3      0    4         0    0   

                aside  ...  voting  way  we  were  wherefore  which  will  \
Lincoln1865         0  ...       0    0   1     0          0      1     0   
TrumpMay26          0  ...       0    1   0     0          0      0     1   
Wikipedia           0  ...       1    0   0     0          0      0     0   
FortuneMay26        0  ...       0    0   0     1          0      0     0   
TheHillApr07        0  ...       0    0   0     0          0 

In [28]:


# Load a compatible spaCy model
nlp = spacy.load("en_core_web_sm")

# Define a custom tokenizer function using spaCy's lemmatization
def spacy_tokenizer(text):
    tokens = nlp(text)
    lemmas = [token.lemma_ for token in tokens if not token.is_punct and not token.is_space]
    return lemmas
# Create a CountVectorizer instance with the custom tokenizer
vectorizer = CountVectorizer(tokenizer=spacy_tokenizer)

# Fit and transform the corpus using CountVectorizer
X = vectorizer.fit_transform(c.values())

# Convert the result to a data frame with clear labeling
term_document_matrix = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out(), index=c.keys())

# Display the term-document matrix
print(term_document_matrix)


                0.00006  1998  a  about  achieve  all  among  and  anything  \
Lincoln1865           0     0  1      0        1    3      1    3         0   
TrumpMay26            0     0  0      0        0    0      0    0         1   
Wikipedia             0     1  0      0        0    1      0    0         0   
FortuneMay26          1     0  0      1        0    0      0    0         0   
TheHillApr07          0     0  0      0        0    0      0    0         0   
KingJamesBible        0     0  0      0        0    3      0    4         0   

                aside  ...  vote  voting  way  we  wherefore  which  will  \
Lincoln1865         0  ...     0       0    0   2          0      1     0   
TrumpMay26          0  ...     0       0    1   0          0      0     1   
Wikipedia           0  ...     0       1    0   0          0      0     0   
FortuneMay26        0  ...     2       0    0   0          0      0     0   
TheHillApr07        0  ...     1       0    0   0          0 



In [29]:
from sklearn.decomposition import TruncatedSVD

# Create an LSA (Latent Semantic Analysis) model
lsa = TruncatedSVD(n_components=3)

# Fit the LSA model to the term-document matrix
lsa_result = lsa.fit_transform(X)

# Create a DataFrame for the LSA representations
lsa_df = pd.DataFrame(lsa_result, index=c.keys(), columns=["LSA1", "LSA2", "LSA3"])

# Print the LSA representation of all documents
print("LSA representations of documents:")
print(lsa_df)

# Find the vector representation of the word "vote"
word_index = vectorizer.vocabulary_.get("vote")
if word_index is not None:
    word_vector = lsa.components_[:, word_index]
    print("Vector representation of 'vote':", word_vector)
else:
    print("The word 'vote' is not in the vocabulary.")


LSA representations of documents:
                    LSA1      LSA2      LSA3
Lincoln1865     7.386029  0.089226 -2.288112
TrumpMay26      0.520975  2.218354  0.487372
Wikipedia       1.578395  2.968300  0.739028
FortuneMay26    0.445616  2.771292  1.199554
TheHillApr07    0.412929  1.751775  0.544510
KingJamesBible  4.116111 -2.054894  3.576262
Vector representation of 'vote': [0.01747558 0.25405951 0.14328202]


In [33]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Define a function to compute cosine similarity
def compute_cosine_similarity(vector1, vector2):
    # Reshape the vectors to be 2D arrays (required for cosine_similarity)
    vector1 = vector1.reshape(1, -1)
    vector2 = vector2.reshape(1, -1)

    # Compute the cosine similarity
    similarity = cosine_similarity(vector1, vector2)

    return similarity[0][0]
v1=lsa_df.loc['Lincoln1865'].to_numpy()
v2= lsa_df.loc['Wikipedia'].to_numpy()
v3= lsa_df.loc['TrumpMay26'].to_numpy()
# Compute cosine similarity between 'malice' and 'vote'
cosine_malice_vote = compute_cosine_similarity(v1,v2)

# Compute cosine similarity between 'mail' and 'vote'
cosine_mail_vote = compute_cosine_similarity(v3,v2)

print(f"Cosine similarity between 'malice' and 'vote': {cosine_malice_vote}")
print(f"Cosine similarity between 'mail' and 'vote': {cosine_mail_vote}")

Cosine similarity between 'malice' and 'vote': 0.3844065678276618
Cosine similarity between 'mail' and 'vote': 0.9683590759737166


In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer


# Create a TfidfVectorizer object
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data to compute the TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(c.values())

# Create a DataFrame from the TF-IDF matrix
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out(), index=c.keys())

# Display the TF-IDF matrix
print(tfidf_df)


                  00006      1998    about   achieve       all     among  \
Lincoln1865     0.00000  0.000000  0.00000  0.147276  0.305882  0.147276   
TrumpMay26      0.00000  0.000000  0.00000  0.000000  0.000000  0.000000   
Wikipedia       0.00000  0.272458  0.00000  0.000000  0.188626  0.000000   
FortuneMay26    0.26865  0.000000  0.26865  0.000000  0.000000  0.000000   
TheHillApr07    0.00000  0.000000  0.00000  0.000000  0.000000  0.000000   
KingJamesBible  0.00000  0.000000  0.00000  0.000000  0.426225  0.000000   

                     and  anything       are     aside  ...    voting  \
Lincoln1865     0.362304   0.00000  0.147276  0.000000  ...  0.000000   
TrumpMay26      0.000000   0.26374  0.000000  0.000000  ...  0.000000   
Wikipedia       0.000000   0.00000  0.000000  0.000000  ...  0.272458   
FortuneMay26    0.000000   0.00000  0.000000  0.000000  ...  0.000000   
TheHillApr07    0.000000   0.00000  0.000000  0.000000  ...  0.000000   
KingJamesBible  0.673126   0.

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd




# Create a TfidfVectorizer object
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data to compute the TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(c.values())

# Create a DataFrame from the TF-IDF matrix
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out(), index=c.keys())

# Define the word vectors for "malice," "vote," and "mail"
malice_vector = tfidf_df.loc['Lincoln1865'].values.reshape(1, -1)
vote_vector = tfidf_df.loc['Wikipedia'].values.reshape(1, -1)
mail_vector = tfidf_df.loc['TrumpMay26'].values.reshape(1, -1)

# Compute the cosine similarity between word vectors
cosine_malice_vote = cosine_similarity(malice_vector, vote_vector)
cosine_mail_vote = cosine_similarity(mail_vector, vote_vector)

# Print the results
print("Cosine similarity between 'malice' and 'vote' using TF-IDF:", cosine_malice_vote[0, 0])
print("Cosine similarity between 'mail' and 'vote' using TF-IDF:", cosine_mail_vote[0, 0])


Cosine similarity between 'malice' and 'vote' using TF-IDF: 0.19513415920023014
Cosine similarity between 'mail' and 'vote' using TF-IDF: 0.0758726657016838
