In [1]:
!pip install transformers torch scikit-learn 



In [2]:
from transformers import AutoTokenizer, AutoModel 
import torch 
from sklearn.metrics.pairwise import cosine_similarity 


In [3]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") 
model = AutoModel.from_pretrained("bert-base-uncased")

In [4]:
cricketers = { 
    "MS Dhoni": "India", 
    "Virat Kohli": "India", 
    "Ricky Ponting": "Australia", 
    "Steve Smith": "Australia", 
    "Kane Williamson": "New Zealand", 
    "Joe Root": "England" 
}

In [5]:
def get_embedding(text): 
    inputs = tokenizer(text, return_tensors="pt", truncation=True,padding=True) 
    with torch.no_grad(): 
        outputs = model(**inputs) 
    return outputs.last_hidden_state.mean(dim=1).numpy()

In [7]:
def check_country_similarity(player, country):
    player_emb = get_embedding(player)
    country_emb = get_embedding(country)
    similarity_score = cosine_similarity(player_emb, country_emb)[0][0]
    return similarity_score


In [8]:
test_cases = [ 
("MS Dhoni", "India"), 
("MS Dhoni", "Australia"), 
("Ricky Ponting", "Australia"), 
("Virat Kohli", "India"), 
("Kane Williamson", "New Zealand"), 
("Joe Root", "India") 
]

In [10]:
threshold = 0.75  # You can adjust this 
for player, country in test_cases: 
    score = check_country_similarity(player, country) 
match = "MATCH   " if score >= threshold else "NO MATCH  " 
print(f"{player} - {country} | Similarity: {score:.2f} | {match}")

Joe Root - India | Similarity: 0.68 | NO MATCH  
