# Population Data from CSV

This notebook reads sample population data from `data/atlantis.csv` and plots it using Matplotlib. Edit `data/atlantis.csv` and re-run the plotting cell below to see how the plot changes!

In [None]:
!pip install numpy scipy

In [None]:
!pip install sematch

In [None]:
!python -m sematch.download

In [None]:
import matplotlib.pyplot as plt
import pandas

# Read the population table and pick the two columns to plot.
df = pandas.read_csv('../data/atlantis.csv')
x, y = df['year'], df['population']

# Draw population against year on an explicit figure/axes pair.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title("Population of Atlantis")
ax.set_xlabel('Year')
ax.set_ylabel('Population')
plt.show()

In [None]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Convert the texts into TF-IDF vectors.
# TfidfVectorizer's default token_pattern (r"(?u)\b\w\w+\b") only keeps tokens of
# two or more characters, so the single-letter words in 'a&b' and 'A&B' would be
# dropped and those documents would turn into all-zero rows. The pattern below
# also keeps one-character tokens; lowercasing is still applied by default.
vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
vectors = vectorizer.fit_transform(['a&b','A&B', 'A AND B'])

In [None]:
# Calculate the cosine similarity between the vectors
# Called with a single argument, so every row of `vectors` is compared with every
# row (itself included) and the result is a square matrix.
# NOTE(review): a later cell in this notebook defines its own `cosine_similarity`;
# this call expects the scikit-learn function imported above, so re-run that import
# cell first if the later cells have already been executed.
similarity = cosine_similarity(vectors)
print(similarity)

In [None]:
import numpy
import torch
import transformers

# Load the BERT tokenizer and model.
# BertModel itself has no encode() method: text must first be turned into token
# ids by the matching tokenizer and then passed through the model.
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
model = transformers.BertModel.from_pretrained('bert-base-uncased')
model.eval()  # inference only: disables dropout so the embeddings are deterministic

# Tokenize and encode the texts
text1 = "This is the first text."
text2 = "This is the second text."
batch = tokenizer(
    [text1, text2],
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors='pt',
)
with torch.no_grad():
    hidden_states = model(**batch).last_hidden_state

# Mean-pool the token embeddings into one vector per text; the attention mask
# keeps padding positions out of the average.
mask = batch['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
sentence_vectors = (hidden_states * mask).sum(dim=1) / mask.sum(dim=1)
encoding1, encoding2 = sentence_vectors.numpy()

# Calculate the cosine similarity between the embeddings
similarity = numpy.dot(encoding1, encoding2) / (numpy.linalg.norm(encoding1) * numpy.linalg.norm(encoding2))
print(similarity)

In [None]:
!pip install fasttext
import fasttext

# Load the FastText model
model = fasttext.load_model('cc.en.300.bin')

# Preprocess the text
text1 = 'This is a piece of text'
text2 = 'This is another piece of text'
tokens1 = fasttext.tokenize(text1)
tokens2 = fasttext.tokenize(text2)
tokens1 = [token.lower() for token in tokens1]
tokens2 = [token.lower() for token in tokens2]

# Generate word vectors for each piece of text
vector1 = model.get_sentence_vector(tokens1)
vector2 = model.get_sentence_vector(tokens2)

# Calculate the similarity between the vectors using cosine similarity
from scipy.spatial.distance import cosine
similarity = 1 - cosine(vector1, vector2)
print('Similarity:', similarity)

In [None]:
import re
from collections import Counter

import torch

# Calculate the cosine similarity between two texts
# NOTE: this definition shadows the scikit-learn `cosine_similarity` imported in an
# earlier cell; later cells that need the scikit-learn version re-import it.
def cosine_similarity(text1, text2):
  """Return the bag-of-words cosine similarity of two texts as a float.

  Raw strings cannot be turned into tensors directly, so each text is first
  lowercased, split into word tokens and counted; both count vectors are laid
  out over the shared vocabulary before the cosine is computed with torch.

  Args:
    text1: first text (str).
    text2: second text (str).

  Returns:
    A Python float in [0.0, 1.0]. Returns 0.0 when either text contains no
    word tokens, because the cosine is undefined for a zero vector.
  """
  counts1 = Counter(re.findall(r"\w+", text1.lower()))
  counts2 = Counter(re.findall(r"\w+", text2.lower()))

  # Shared, deterministically ordered vocabulary so both vectors line up
  vocabulary = sorted(counts1.keys() | counts2.keys())
  if not vocabulary:
    return 0.0

  # Convert the word counts to (1, vocab_size) float tensors
  vector1 = torch.tensor([[float(counts1[word]) for word in vocabulary]])
  vector2 = torch.tensor([[float(counts2[word]) for word in vocabulary]])

  # Calculate the dot product of the vectors
  dot_product = torch.matmul(vector1, vector2.transpose(1, 0))

  # Calculate the norms of the vectors
  norm1 = torch.norm(vector1, dim=1)
  norm2 = torch.norm(vector2, dim=1)

  # Guard against division by zero when one text has no tokens
  denominator = (norm1 * norm2).item()
  if denominator == 0.0:
    return 0.0

  # Calculate the cosine similarity; a plain float works with ":.2f" formatting
  return dot_product.item() / denominator

# Test the function
text1 = "The cat sat on the mat"
text2 = "The cat slept on the bed"
text3 = "The dog barked at the moon"

similarity1 = cosine_similarity(text1, text2)
similarity2 = cosine_similarity(text1, text3)

print(f"Similarity between text1 and text2: {similarity1:.2f}")
print(f"Similarity between text1 and text3: {similarity2:.2f}")
# Similarity between text1 and text2: 0.75
# Similarity between text1 and text3: 0.50

In [None]:
import re
from collections import Counter

import torch

# Calculate the cosine similarity between two texts
# NOTE: this re-defines the `cosine_similarity` already defined in the previous
# cell and shadows the scikit-learn function of the same name imported earlier.
def cosine_similarity(text1, text2):
  """Return the bag-of-words cosine similarity of two texts as a float.

  Raw strings cannot be turned into tensors directly, so each text is first
  lowercased, split into word tokens and counted; both count vectors are laid
  out over the shared vocabulary before the cosine is computed with torch.

  Args:
    text1: first text (str).
    text2: second text (str).

  Returns:
    A Python float in [0.0, 1.0]. Returns 0.0 when either text contains no
    word tokens, because the cosine is undefined for a zero vector.
  """
  counts1 = Counter(re.findall(r"\w+", text1.lower()))
  counts2 = Counter(re.findall(r"\w+", text2.lower()))

  # Shared, deterministically ordered vocabulary so both vectors line up
  vocabulary = sorted(counts1.keys() | counts2.keys())
  if not vocabulary:
    return 0.0

  # Convert the word counts to (1, vocab_size) float tensors
  vector1 = torch.tensor([[float(counts1[word]) for word in vocabulary]])
  vector2 = torch.tensor([[float(counts2[word]) for word in vocabulary]])

  # Calculate the dot product of the vectors
  dot_product = torch.matmul(vector1, vector2.transpose(1, 0))

  # Calculate the norms of the vectors
  norm1 = torch.norm(vector1, dim=1)
  norm2 = torch.norm(vector2, dim=1)

  # Guard against division by zero when one text has no tokens
  denominator = (norm1 * norm2).item()
  if denominator == 0.0:
    return 0.0

  # Calculate the cosine similarity; a plain float works with ":.2f" formatting
  return dot_product.item() / denominator

In [None]:
# Test the function
text1, text2, text3 = (
    "The cat sat on the mat",
    "The cat slept on the bed",
    "The dog barked at the moon",
)

# Score text1 against each of the other two sentences
similarity1, similarity2 = cosine_similarity(text1, text2), cosine_similarity(text1, text3)

# Report both scores rounded to two decimals
print(f"Similarity between text1 and text2: {similarity1:.2f}")
print(f"Similarity between text1 and text3: {similarity2:.2f}")

In [None]:
!pip install spacy

In [None]:
import spacy

In [None]:
!python -m spacy download en_core_web_lg
#nlp=spacy.load("en_core_web_lg")

In [None]:
# Load the spaCy pipeline named "en_core_web_lg"; the preceding cell downloads it,
# so that cell must have completed before this one runs.
nlp=spacy.load("en_core_web_lg")

In [None]:
# Parse the lower-case and upper-case spellings with the loaded spaCy pipeline
w1, w2 = nlp("a & b"), nlp("A & B")

# Last expression of the cell, so the notebook displays the similarity score
w1.similarity(w2)

In [None]:
%pip install transformers
%pip install sentence-transformers
%pip install scikit-learn

In [None]:
import pandas as pd

# Read the two sources whose text columns will be matched against each other
df1 = pd.read_csv("source1.csv")
df2 = pd.read_csv("source2.csv")

# Extract the columns to compare as plain Python lists
source1_name_field = df1.loc[:, 'name'].to_list()
source2_customer_field = df2.loc[:, 'customer'].to_list()

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed each source separately. The source2 embeddings get their own variable so
# they no longer overwrite the source1 embeddings.
emb_source1_name_field = model.encode(source1_name_field)
emb_source2_customer_field = model.encode(source2_customer_field)

# Rows correspond to source1 names, columns to source2 customers
similarity = cosine_similarity(emb_source1_name_field, emb_source2_customer_field)

print(similarity)

# For every source1 name, the column index of its most similar source2 customer
indices = np.argmax(similarity, axis=1)

for name, best_match in zip(source1_name_field, indices):
  print(name, source2_customer_field[best_match])

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

model = SentenceTransformer('all-MiniLM-L6-v2')

# Spelling/casing variants of one name, plus an unrelated name as a contrast.
# The second and third entries are the same string.
sentences = [
    "Benson and Hedges",
    "Hedges AND Benson",
    "Hedges AND Benson",
    "HEDGES AND BENSON",
    "hedges & benson",
    "BENSON & HEDges",
    "Jack And Jill",
]
sentence1, sentence2, sentence3, sentence4, sentence5, sentence6, sentence7 = sentences

# Encode sentences one at a time to get their embeddings
embeddings = [model.encode(sentence) for sentence in sentences]
embedding1, embedding2, embedding3, embedding4, embedding5, embedding6, embedding7 = embeddings

# Calculate cosine similarity between embeddings (square matrix, one row per sentence)
sim = cosine_similarity(embeddings)
print(sim)
# NOTE(review): sim[0][0] is sentence1 compared with itself; confirm whether an
# off-diagonal entry such as sim[0][1] was intended here.
print("Similarity score:", sim[0][0])

In [None]:
# Re-evaluates the spaCy similarity of `w1` and `w2`, which are created in an
# earlier cell; that cell must have been run for this one to work.
w1.similarity(w2)