In [1]:
!pip install pandas spacy scikit-learn
!python -m spacy download en_core_web_sm

Defaulting to user installation because normal site-packages is not writeable
/bin/bash: line 1: python: command not found


In [2]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import re

def _load_spacy_model(name):
    """
    Return the spaCy pipeline called `name`.

    If loading fails with OSError, the model package is downloaded through
    spacy.cli.download and the load is attempted once more.
    """
    try:
        return spacy.load(name)
    except OSError:
        print(f"Downloading language model for the spaCy '{name}'...")
        spacy.cli.download(name)
        return spacy.load(name)


# spaCy pipeline used for the text processing below
nlp = _load_spacy_model("en_core_web_sm")

# Where the screening results are written
output_file = "screening_results.csv"
# Excel workbook with the records to screen
input_file = "screen.xlsx"  # Replace with your file path

# Load the workbook into a DataFrame
data = pd.read_excel(input_file)

# Screen every record only when both required columns are present
if 'Title' in data.columns and 'Abstract' in data.columns:
    # One dict per screened record, collected here and turned into a DataFrame once
    results = []

    # Predefined keywords and patterns for NLP-assisted matching.
    # Keywords are compared against single tokens, so the multi-word entry
    # "coffee beans" can only match through its parts "coffee" / "beans".
    drying_keywords = ["drying", "dehydration", "moisture"]
    coffee_keywords = ["coffee", "beans", "coffee beans"]
    math_keywords = ["model", "statistical", "regression", "mathematical", "analysis", "simulation"]
    math_patterns = r"(regression|mathematical model|statistical model|simulation|equation|analysis)"

    def analyze_abstract(abstract):
        """
        Screen one abstract for relevance and extract mathematical approaches.

        The text is lower-cased and run through the spaCy pipeline `nlp`.
        Alphabetic, non-stop-word tokens are compared with the keyword lists
        both by their surface text and by their lemma, so inflected forms such
        as "models" or "simulations" count as keyword hits.

        Parameters
        ----------
        abstract : str
            Abstract text to screen.

        Returns
        -------
        tuple of (str, str, str)
            decision : "Yes" when drying, coffee and math keywords are all
                present, otherwise "No".
            comment : fixed explanation of the decision.
            approach : for "Yes" decisions, the distinct `math_patterns`
                matches in alphabetical order joined with ", ", or
                "Unspecified" when the pattern matches nothing; "" for "No".
        """
        text = abstract.lower()

        # Collect token texts and lemmas in a set for constant-time lookups
        vocabulary = set()
        for token in nlp(text):
            if token.is_alpha and not token.is_stop:
                vocabulary.add(token.text)
                vocabulary.add(token.lemma_)

        # Check for relevance using the collected vocabulary and the keywords
        is_relevant_drying = not vocabulary.isdisjoint(drying_keywords)
        is_relevant_coffee = not vocabulary.isdisjoint(coffee_keywords)
        is_relevant_math = not vocabulary.isdisjoint(math_keywords)

        # Decide inclusion
        if not (is_relevant_drying and is_relevant_coffee):
            return "No", "Not relevant to drying process of coffee beans.", ""
        if not is_relevant_math:
            return "No", "Mentions drying and coffee but lacks mathematical/statistical modeling.", ""

        # Extract mathematical approaches; sorting makes the joined string
        # reproducible (plain set iteration order varies between runs).
        matches = re.findall(math_patterns, text)
        approach = ", ".join(sorted(set(matches))) if matches else "Unspecified"
        return "Yes", "Relevant to drying and mentions moisture or mathematical modeling.", approach

    # Read both columns once. str() keeps non-text cells usable; a missing
    # cell (NaN) becomes the literal text "nan".
    titles = [str(value) for value in data['Title']]
    abstracts = [str(value) for value in data['Abstract']]

    # Process each row in the dataset
    for title, abstract in zip(titles, abstracts):
        decision, comment, approach = analyze_abstract(abstract)
        results.append({"Title": title, "Decision": decision, "Comment": comment, "Mathematical Approach": approach})

    # Create results DataFrame; explicit columns keep the layout stable even
    # when the sheet has no rows.
    results_df = pd.DataFrame(results, columns=["Title", "Decision", "Comment", "Mathematical Approach"])

    # Perform clustering on abstracts for further insight
    n_clusters = 2
    if len(abstracts) >= n_clusters:
        vectorizer = TfidfVectorizer(stop_words='english')
        X = vectorizer.fit_transform(abstracts)
        # n_init is pinned because its default differs between scikit-learn
        # versions, which would change the labels for the same random_state.
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(X)
        results_df['Cluster'] = kmeans.labels_
    else:
        # KMeans cannot form more clusters than there are samples; keep the
        # screening results and leave the cluster column empty.
        print(f"Skipping clustering: need at least {n_clusters} abstracts, found {len(abstracts)}.")
        results_df['Cluster'] = pd.NA

    # Save to CSV
    results_df.to_csv(output_file, index=False)
    print(f"Screening completed. Results saved to {output_file}.")
else:
    print("Error: The input file does not have the required 'Title' and 'Abstract' columns.")

Screening completed. Results saved to screening_results.csv.
