In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the events export from disk into a DataFrame, then preview its first rows
events_csv = '230607 Events dump-reduced.csv'
data = pd.read_csv(events_csv)
data.head()

Unnamed: 0,country,end date,flagged_by,has_embedding,human verification,is virtual,is_flagged,is_free,location,name,...,specialties,start date,state,summary,URL,Creation Date,Modified Date,Slug,Creator,unique id
0,US,"Mar 28, 2023 12:00 am",,yes,Verified - OK,,,,"Tampa, FL, USA",SGO Annual Meeting on Women’s Cancer,...,"Medicine , Hematology and Oncology , Obstetric...","Mar 25, 2023 12:00 am",FL,The SGO Annual Meeting on Women’s Cancer® is t...,,"Mar 29, 2023 4:08 pm","May 8, 2023 10:12 am",,paolo@sponsormotion.com,1680120526987x888446806284079400
1,US,"Nov 4, 2022 12:00 am",,yes,Verified - OK,,,,"University Technology Center II, 3650 Spectrum...",FAPTP's 45th Annual Seminar,...,"Medicine , Hematology and Oncology , Pediatric...","Nov 3, 2022 12:00 am",FL,"FAPTP's 45th Annual Seminar, 'Advances in Pedi...",,"Mar 29, 2023 4:08 pm","May 3, 2023 6:29 pm",,paolo@sponsormotion.com,1680120530555x259893486984035460
2,US,"Apr 19, 2023 12:00 am",,yes,Verified - OK,,,,"9800 International Dr, Orlando, FL 32819, USA",AACR Annual Meeting 2023,...,"Medicine , Hematology and Oncology , Internal ...","Apr 14, 2023 12:00 am",FL,The AACR Annual Meeting is the focal point of ...,,"Mar 29, 2023 4:08 pm","May 8, 2023 10:13 am",,paolo@sponsormotion.com,1680120536478x149218014932820380
3,US,"Jul 29, 2023 12:00 am",,yes,Verified - Duplicate,,,,"Amelia Island, Florida 32034, USA",32nd Annual Mayo Clinic Hematology/Oncology Re...,...,"Medicine , Hematology and Oncology , Hematolog...","Jul 27, 2023 12:00 am",FL,Mayo Clinic's 32nd Annual Hematology/Oncology ...,,"Mar 29, 2023 4:08 pm","May 3, 2023 6:29 pm",,paolo@sponsormotion.com,1680120538716x333574781569262660
4,US,"Sep 1, 2022 12:00 am",,yes,Verified - OK,,,,"Florida, USA",FCDS 2022 Virtual Annual Conference,...,"Medicine , Internal medicine","Aug 11, 2022 12:00 am",FL,The FCDS 2022 Virtual Annual Conference will c...,,"Mar 29, 2023 4:09 pm","May 3, 2023 6:29 pm",,paolo@sponsormotion.com,1680120546628x698788135365455200


In [3]:
# Display the dtype of every column in `data`
data.dtypes

country                object
end date               object
flagged_by            float64
has_embedding          object
human verification     object
is virtual            float64
is_flagged            float64
is_free               float64
location               object
name                   object
sourceURL              object
specialties            object
start date             object
state                  object
summary                object
URL                   float64
Creation Date          object
Modified Date          object
Slug                  float64
Creator                object
unique id              object
dtype: object

In [4]:
from gensim.models import Word2Vec

# Keep only the rows whose summary is present, then report the resulting shape
has_summary = data['summary'].notna()
data = data[has_summary]
data.shape

(1921, 21)

In [5]:
# Preprocess the data: cast the text columns to plain Python strings.
# `astype` with a column -> dtype mapping returns a new frame, so nothing is
# assigned into a row-filtered frame (column assignment on such a frame is what
# triggers pandas' SettingWithCopyWarning) and the cell can be re-run safely.
# Missing values in these columns become the literal string "nan".
data = data.astype({'summary': str, 'name': str})

In [6]:
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

# Filter relevant columns
data = data[["state", "summary"]]

# Create buckets based on state: one list of summaries per state.
# `sort=False` keeps the states in order of first appearance, and groupby
# leaves out rows whose state is missing, so those are collected separately.
state_buckets = {
    state: group.tolist()
    for state, group in data.groupby("state", sort=False)["summary"]
}

# Summaries of the rows that have no state
missing_state_bucket = data.loc[data["state"].isna(), "summary"].tolist()

In [7]:
# Preprocess summaries
# Words that are stripped from every summary before summaries are compared.
stop_words = ["conference", "health", "care", "annual", "medical", "research", "event", "education", "meeting",
              "topics", "place", "medicine", "healthcare", "held", "take", "professionals", "clinical", "sessions",
              "center", "new", "oncology", "information", "attendees", "university", "also", "related"]

def preprocess_text(text):
    """Normalise a summary so that it can be compared with other summaries.

    Apostrophes are deleted, every other punctuation character is replaced by
    a space, the text is lower-cased and the words listed in ``stop_words``
    are dropped.

    Parameters
    ----------
    text : str
        Raw summary text.

    Returns
    -------
    str
        The remaining words joined by single spaces ("" if none remain).
    """
    # Delete apostrophes so possessives stay one word ("women's" -> "womens").
    text = re.sub(r"['’]", "", text)
    # Turn all other punctuation into a separator. Deleting it would glue
    # neighbouring words together ("Hematology/Oncology" -> "hematologyoncology"),
    # which also lets the glued stop word slip past the filter below.
    text = re.sub(r"[^\w\s]", " ", text)
    text = text.lower()  # Convert to lowercase
    blocked = set(stop_words)  # built per call so it always reflects stop_words
    return " ".join(word for word in text.split() if word not in blocked)  # Remove stop words

# One list of pre-processed summaries per state bucket, in bucket order
preprocessed_data = [
    [preprocess_text(summary) for summary in bucket]
    for bucket in state_buckets.values()
]

In [8]:
# Cosine similarity above which two summaries are reported as duplicates
SIMILARITY_THRESHOLD = 0.9

# Train Word2Vec model
# Word2Vec expects an iterable of token lists. `preprocessed_data` holds one
# list of whole summaries per state, so passing it directly would treat every
# complete summary as a single "word"; split each summary into words instead.
tokenized_summaries = [
    summary.split() for bucket in preprocessed_data for summary in bucket
]
model = Word2Vec(tokenized_summaries, min_count=1)

# Compare summaries within each state bucket.
# The comparison itself is bag-of-words cosine similarity; the Word2Vec model
# is not consulted here.
# NOTE: only `state_buckets` is scanned, so summaries without a state are not
# compared by this cell.
duplicates = []

for state, summaries in state_buckets.items():
    # A bucket with a single summary has nothing to be compared against
    if len(summaries) < 2:
        continue

    preprocessed_summaries = [preprocess_text(summary) for summary in summaries]

    count_vectorizer = CountVectorizer()
    try:
        count_matrix = count_vectorizer.fit_transform(preprocessed_summaries)
    except ValueError:
        # CountVectorizer raises ValueError when no summary in the bucket
        # yields a token (empty vocabulary); there is nothing to compare.
        continue

    similarity_matrix = cosine_similarity(count_matrix)

    # Inspect each unordered pair once (strict upper triangle) and collect,
    # in ascending order, every index that takes part in a too-similar pair.
    similar_pairs = np.argwhere(np.triu(similarity_matrix > SIMILARITY_THRESHOLD, k=1))
    duplicate_indices = np.unique(similar_pairs)

    for index in duplicate_indices:
        duplicates.append((state, summaries[index]))