In [44]:
from langchain_ollama import OllamaEmbeddings
import pymupdf
import pandas as pd
import os

from sklearn.cluster import DBSCAN, KMeans
from sklearn.preprocessing import StandardScaler
import shutil

In [45]:
# Name of the Ollama model handed to the embedding client below.
# The trailing comment lists alternative model names noted by the author.
model = "llama3.1"  # "mistral" | "llama3" | "phi3" | "dolphin-llama3"
 
# Disabled chat-model setup (ChatOllama is not imported in the cells shown here).
# llm = ChatOllama(model=model)

# Module-level embedding client; convert_abstracts_to_embeddings() reads this global.
embedder = OllamaEmbeddings(model=model)

In [47]:
def _abstract_from_text(text):
    """Pick the abstract (or a title fallback) out of raw page text.

    Heuristic: the abstract runs from the first case-insensitive occurrence
    of "abstract" up to the next occurrence of "introduction" (or to the end
    of the text when "introduction" is not found).  When no "abstract"
    keyword exists, the first non-blank line is returned as a stand-in title,
    and "Title not found." when the text has no non-blank line at all.
    """
    # Lower-case once and reuse it for both keyword searches.
    # NOTE: offsets found in the lower-cased copy are used to slice the
    # original text; this assumes lower-casing does not change the length.
    lowered = text.lower()
    abstract_start = lowered.find("abstract")
    if abstract_start != -1:
        abstract_end = lowered.find("introduction", abstract_start)
        if abstract_end != -1:
            return text[abstract_start:abstract_end].strip()
        return text[abstract_start:].strip()

    # Skip leading blank lines so the fallback is never an empty string.
    for line in text.splitlines():
        stripped = line.strip()
        if stripped:
            return stripped
    return "Title not found."


# Function to extract abstracts from PDFs
def extract_abstract_from_pdfs(folder_path, max_pages=3):
    """Extract an abstract-like snippet from every PDF in ``folder_path``.

    Args:
        folder_path: Directory to scan (not recursive).
        max_pages: How many leading pages of each PDF to read; defaults to 3.

    Returns:
        A ``(filenames, abstracts)`` tuple of two equally long lists, ordered
        by filename.  ``abstracts[i]`` is the snippet for ``filenames[i]``.

    Files whose extension is ``.pdf`` in any letter case are processed.
    Errors from listing the folder or opening a PDF are not caught here.
    """
    abstracts = []
    filenames = []
    # os.listdir() order is arbitrary; sort so the row order is reproducible.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(".pdf"):
            continue
        file_path = os.path.join(folder_path, filename)
        with pymupdf.open(file_path) as pdf_file:
            page_limit = min(max_pages, pdf_file.page_count)
            text = "".join(
                pdf_file[page_num].get_text() for page_num in range(page_limit)
            )
        abstracts.append(_abstract_from_text(text))
        filenames.append(filename)

    return filenames, abstracts


# Function to convert abstracts to embeddings and store in DataFrame using ollamaembeddings
def convert_abstracts_to_embeddings(folder_path):
    """Build a table of PDF filenames, extracted abstracts and their embeddings.

    Args:
        folder_path: Directory passed through to ``extract_abstract_from_pdfs``.

    Returns:
        A DataFrame with columns ``Filename``, ``Abstract`` and ``Embedding``
        (one row per PDF; each ``Embedding`` cell holds the vector returned by
        the embedder).  When the folder yields no abstracts the frame is empty
        but still has those three columns.

    Raises:
        ValueError: If the embedder returns a different number of vectors
            than the number of abstracts it was given.

    Uses the module-level ``embedder`` object.
    """
    column_names = ['Filename', 'Abstract', 'Embedding']
    filenames, abstracts = extract_abstract_from_pdfs(folder_path)

    if not abstracts:
        # Nothing to embed: skip the embedding call and hand back an empty,
        # correctly-shaped frame so downstream column lookups still work.
        return pd.DataFrame(columns=column_names)

    # Get embeddings from ollamaembeddings
    embeddings = embedder.embed_documents(abstracts)  # Embedding for documents

    if len(embeddings) != len(abstracts):
        raise ValueError(
            f"Embedder returned {len(embeddings)} vectors for {len(abstracts)} abstracts."
        )

    # Create DataFrame
    return pd.DataFrame({
        'Filename': filenames,
        'Abstract': abstracts,
        'Embedding': embeddings,  # Embeddings stored as list
    })

# Usage
# Folder that holds the PDFs; a relative path, so it is resolved against the
# kernel's current working directory.
folder_path = "Published Conferences"
# One row per PDF with its filename, extracted abstract and embedding vector.
df = convert_abstracts_to_embeddings(folder_path)
# Preview the first rows only (the embedding column is long).
print(df.head())


                                            Filename  \
0  C1-Design and Control of Growth Adaptable Arti...   
1  C10-Integrating structure and control design u...   
2  C11-Joint Optimization of Plant, Controller, a...   
3      C12-Gyroscopic Tensegrity System Dynamics.pdf   
4  C13-Model and Data Based Approaches to the Con...   

                                            Abstract  \
0   Design and Control of Growth Adaptable Artiﬁcial   
1  ABSTRACT:\nThis paper provides a novel approac...   
2  Abstract— This\npaper\npresents\na\nLinear\nMa...   
3  Abstract— Mechanics and control of innovative ...   
4  Abstract— This paper proposes two approaches t...   

                                           Embedding  
0  [-0.0034896785, 0.004134818, 0.02365485, -0.00...  
1  [-0.017065702, -0.029780408, 0.002489877, 0.00...  
2  [-0.0074446937, -0.032236487, 0.021474421, 0.0...  
3  [-0.01603647, -0.03346196, 0.030032031, 0.0095...  
4  [-0.014418024, -0.034438383, 0.029893901, 0.00..

In [59]:
# Clustering function
def cluster_embeddings(df, eps=0.4, min_samples=2, metric='cosine'):
    """Assign a DBSCAN cluster label to every row of ``df``.

    Args:
        df: DataFrame with an ``Embedding`` column whose cells are
            equal-length numeric sequences.
        eps: DBSCAN ``eps`` parameter (defaults to the previously
            hard-coded 0.4).
        min_samples: DBSCAN ``min_samples`` parameter (default 2).
        metric: DBSCAN distance metric (default ``'cosine'``).

    Returns:
        A new DataFrame equal to ``df`` plus an integer ``Cluster`` column
        holding the values returned by ``DBSCAN.fit_predict``.  The input
        frame is left unmodified.  An empty input yields an empty frame that
        already carries the ``Cluster`` column.
    """
    if len(df) == 0:
        # The scaler cannot be fitted on zero samples; return early instead.
        return df.assign(Cluster=pd.Series(dtype='int64'))

    embeddings = df['Embedding'].tolist()

    # Standardize the embeddings
    embeddings_scaled = StandardScaler().fit_transform(embeddings)

    # Apply DBSCAN clustering (KMeans(n_clusters=5) was the alternative tried)
    cluster_model = DBSCAN(eps=eps, min_samples=min_samples, metric=metric)
    clusters = cluster_model.fit_predict(embeddings_scaled)

    # assign() builds a copy, so the caller's frame is not mutated and
    # re-running the cell always starts from the same input.
    return df.assign(Cluster=clusters)

# Apply clustering
# Label every row of the embeddings table with a cluster id.
df_with_clusters = cluster_embeddings(df)

# Print the resulting DataFrame with cluster labels
print(df_with_clusters.head())
# Save the labelled table inside the PDF folder (the file name spelling
# "embeded" is kept as-is because it is the on-disk name).
# NOTE(review): the 'Embedding' cells are Python lists, so they are presumably
# written as their string form — confirm before reloading this CSV.
df_with_clusters.to_csv("Published Conferences/embeded_data.csv",index=False)

                                            Filename  \
0  C1-Design and Control of Growth Adaptable Arti...   
1  C10-Integrating structure and control design u...   
2  C11-Joint Optimization of Plant, Controller, a...   
3      C12-Gyroscopic Tensegrity System Dynamics.pdf   
4  C13-Model and Data Based Approaches to the Con...   

                                            Abstract  \
0   Design and Control of Growth Adaptable Artiﬁcial   
1  ABSTRACT:\nThis paper provides a novel approac...   
2  Abstract— This\npaper\npresents\na\nLinear\nMa...   
3  Abstract— Mechanics and control of innovative ...   
4  Abstract— This paper proposes two approaches t...   

                                           Embedding  Cluster  
0  [-0.0034896785, 0.004134818, 0.02365485, -0.00...       -1  
1  [-0.017065702, -0.029780408, 0.002489877, 0.00...        0  
2  [-0.0074446937, -0.032236487, 0.021474421, 0.0...        1  
3  [-0.01603647, -0.03346196, 0.030032031, 0.0095...       -1  
4  [-0

In [60]:
def create_cluster_folders_and_move_files(df, base_folder):
    """Copy each listed file into a ``Cluster_<label>`` sub-folder.

    For every distinct value in ``df['Cluster']`` a folder named
    ``Cluster_<value>`` is created under ``base_folder``; each file named in
    ``df['Filename']`` is then copied from ``base_folder`` into the folder of
    its cluster.  Despite the function's name the files are copied with
    ``shutil.copy``, so the originals stay where they are.

    Args:
        df: DataFrame with ``Filename`` and ``Cluster`` columns.
        base_folder: Directory that holds the source files and receives the
            cluster sub-folders; created if missing.
    """
    # Make sure the root directory is there before anything else
    os.makedirs(base_folder, exist_ok=True)

    labels = df['Cluster']
    for label in labels.unique():
        # One destination directory per distinct label
        target_dir = os.path.join(base_folder, f'Cluster_{label}')
        os.makedirs(target_dir, exist_ok=True)

        # Names of the files that carry this label
        members = df.loc[labels == label, 'Filename']
        for name in members:
            origin = os.path.join(base_folder, name)
            destination = os.path.join(target_dir, name)
            shutil.copy(origin, destination)

# Usage example
# Copies every file listed in df_with_clusters into
# "Published Conferences/Cluster_<label>/"; the originals stay in place.
create_cluster_folders_and_move_files(df_with_clusters, "Published Conferences")
