In [12]:
import os
import numpy as np
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import euclidean, cityblock, chebyshev
from sklearn.metrics import jaccard_score
from scipy.stats import pearsonr

data_dir = 'E:\\New folder (2)\\bbc\\sport'

# Function to load sample documents from a directory
def load_sample_documents_from_directory(directory, num_samples):
    documents = []
    filenames = os.listdir(directory)
    sample_filenames = random.sample(filenames, num_samples)
    for filename in sample_filenames:
        if filename.endswith(".txt"):
            with open(os.path.join(directory, filename), 'r', encoding='utf-8', errors='ignore') as file:
                documents.append(file.read())
    return documents

# Specify the number of sample documents to load
num_samples = 10

# Load sample documents
sample_documents = load_sample_documents_from_directory(data_dir, num_samples)

In [14]:
# Define 10 words
words = ['football', 'tennis', 'basketball', 'cricket', 'rugby', 
         'athlete', 'championship', 'match', 'league', 'sports']

# Initialize CountVectorizer
vectorizer = CountVectorizer(vocabulary=words)

# Fit and transform documents to obtain word counts
word_counts = vectorizer.fit_transform(documents)

# Convert word counts to array
word_counts_array = word_counts.toarray()


In [15]:
# Compute similarities/distances for words
word_cosine_similarity = cosine_similarity(word_counts_array)
word_euclidean_distance = euclidean(word_counts_array[0], word_counts_array[1])
word_manhattan_distance = cityblock(word_counts_array[0], word_counts_array[1])
word_chebyshev_distance = chebyshev(word_counts_array[0], word_counts_array[1])

# Compute similarities/distances for documents
document_cosine_similarity = cosine_similarity(word_counts_array.T)
document_euclidean_distance = euclidean(word_counts_array.T[0], word_counts_array.T[1])
document_manhattan_distance = cityblock(word_counts_array.T[0], word_counts_array.T[1])
document_chebyshev_distance = chebyshev(word_counts_array.T[0], word_counts_array.T[1])

# Compute Jaccard's coefficient for documents
document_jaccard_coefficient = np.zeros((len(words), len(words)))
for i in range(len(words)):
    for j in range(len(words)):
        document_jaccard_coefficient[i, j] = jaccard_score(word_counts_array[i], word_counts_array[j])

# Compute Pearson's coefficient for documents
document_pearson_coefficient = np.zeros((len(words), len(words)))
for i in range(len(words)):
    for j in range(len(words)):
        document_pearson_coefficient[i, j] = pearsonr(word_counts_array[i], word_counts_array[j])[0]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
from scipy.spatial.distance import chebyshev, jaccard
from scipy.stats import pearsonr
import numpy as np

# Words
words = ["football", "France", "Claxton", "medal", "win", "European", "Indoor", "Madrid", "campaign", "London"]

# Sports-related documents
documents = [
    "British hurdler Sarah Claxton is confident she can win her first major medal at next month's European Indoor Championships in Madrid.",
    "I am quite confident, said Claxton.",
    "For the first time, Claxton has only been preparing for a campaign over the hurdles",
    "Claxton has won the national 60m hurdles title for the past three years but has struggled to translate her domestic success to the international stage.",
    "the 25-year-old also contested the long jump but since moving from Colchester to London she has re-focused her attentions.",
    "Claxton will see if her new training regime pays dividends at the European Indoors which take place on 5-6 March.",
    "Claxton hunting first major medal",
    "Athletics Ireland have hinted that the 35-year-old Cobh runner may be included in the official line-up for the event in France on 19-20 March.",
    "The participation of O'Sullivan, currentily training at her base in Australia, would boost the Ireland team who won the bronze three years ago.",
    "O'Sullivan will also take part in the Bupa Great Ireland Run on 9 April in Dublin."
]

# Preprocessing and vectorization
vectorizer = TfidfVectorizer()
word_vectors = vectorizer.fit_transform(words)
document_vectors = vectorizer.fit_transform(documents)

# Calculate similarity measures
def calculate_similarity_measures(vector1, vector2):
    cosine_sim = cosine_similarity(vector1, vector2)
    euclidean_dist = euclidean_distances(vector1, vector2)
    manhattan_dist = manhattan_distances(vector1, vector2)
    chebyshev_dist = chebyshev(vector1.toarray().flatten(), vector2.toarray().flatten())
    jaccard_coeff = jaccard(vector1.toarray().flatten(), vector2.toarray().flatten())
    pearson_coeff, _ = pearsonr(vector1.toarray().flatten(), vector2.toarray().flatten())
    return cosine_sim[0][0], euclidean_dist[0][0], manhattan_dist[0][0], chebyshev_dist, jaccard_coeff, pearson_coeff

# Calculate similarity measures for each word-document pair
for word in words:
    print(f"\nSimilarity measures for word '{word}':")
    word_vector = vectorizer.transform([word])
    for idx, doc_vector in enumerate(document_vectors):
        cosine_sim, euclidean_dist, manhattan_dist, chebyshev_dist, jaccard_coeff, pearson_coeff = calculate_similarity_measures(word_vector, doc_vector)
        print(f"Document {idx+1}:")
        print(f"Cosine Similarity: {cosine_sim}")
        print(f"Euclidean Distance: {euclidean_dist}")
        print(f"Manhattan Distance: {manhattan_dist}")
        print(f"Chebyshev Distance: {chebyshev_dist}")
        print(f"Jaccard Coefficient: {jaccard_coeff}")
        print(f"Pearson Coefficient: {pearson_coeff}")



Similarity measures for word 'football':
Document 1:
Cosine Similarity: 0.0
Euclidean Distance: 0.9999999999999999
Manhattan Distance: 4.521133061572821
Chebyshev Distance: 0.24399262857399925
Jaccard Coefficient: 1.0
Pearson Coefficient: nan
Document 2:
Cosine Similarity: 0.0
Euclidean Distance: 1.0
Manhattan Distance: 2.190493719495545
Chebyshev Distance: 0.499323882259068
Jaccard Coefficient: 1.0
Pearson Coefficient: nan
Document 3:
Cosine Similarity: 0.0
Euclidean Distance: 1.0
Manhattan Distance: 3.3754539386741897
Chebyshev Distance: 0.4428823802785002
Jaccard Coefficient: 1.0
Pearson Coefficient: nan
Document 4:
Cosine Similarity: 0.0
Euclidean Distance: 1.0
Manhattan Distance: 4.424297503458541
Chebyshev Distance: 0.36207783558093953
Jaccard Coefficient: 1.0
Pearson Coefficient: nan
Document 5:
Cosine Similarity: 0.0
Euclidean Distance: 1.0000000000000002
Manhattan Distance: 4.551462452746866
Chebyshev Distance: 0.23445415945955908
Jaccard Coefficient: 1.0
Pearson Coefficient: