In [39]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import gensim.downloader as api
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

In [40]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/seoyoungyoon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/seoyoungyoon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [41]:
relevancy_scores = pd.read_csv("relevancy_scores.csv")
relevant_docs = pd.read_csv("relevant_document_list.csv")

In [42]:
relevancy_scores.head()

Unnamed: 0,internal_id,keywords,abstract,title,manual_screen_score,relevancy,predicted_screen_score
0,40941026,Hydrogen Fuel Cells|Alkaline Fuel Cells|Fuel C...,This paper presents the experimental results o...,An actively controlled fuel cell/battery hybri...,1.0,,
1,40941027,Detonation-to-Deflagration Transition,Lithium-ion batteries (LIBs) have been widely ...,Characterization of the deflagration behavior ...,-1.0,,
2,40941028,PEM Fuel Cells|Battery Management Systems|Hydr...,,Intelligent Control Strategy for Energy Manage...,-1.0,,
3,40941029,Power Management Strategy|Battery Life Optimiz...,The Energy Management Strategy (EMS) in Fuel C...,Multi-Objective Optimization-Based Health-Cons...,1.0,,
4,40941030,Battery Life Optimization|Battery Technology,"Currently, major vehicle manufacturers are wor...",The choice of a performance criterion for a hi...,1.0,,


In [43]:
relevant_docs.head()

Unnamed: 0,(internal) id,(source) id,keywords,abstract,title,journal,authors,doi
0,40941026,https://openalex.org,Hydrogen Fuel Cells|Alkaline Fuel Cells|Fuel C...,This paper presents the experimental results o...,An actively controlled fuel cell/battery hybri...,Journal of power sources,Lijun Gao|Zhenhua Jiang|Roger A. Dougal,https://doi.org/10.1016/j.jpowsour.2003.12.052
1,40941029,https://openalex.org,Power Management Strategy|Battery Life Optimiz...,The Energy Management Strategy (EMS) in Fuel C...,Multi-Objective Optimization-Based Health-Cons...,Energies,Mehdi Sellali|Alexandre Ravey|Achour Betka|Abd...,https://doi.org/10.3390/en15041318
2,40941030,https://openalex.org,Battery Life Optimization|Battery Technology,"Currently, major vehicle manufacturers are wor...",The choice of a performance criterion for a hi...,IOP conference series. Materials science and e...,I. A. Lyubimov|Р. Х. Курмаев,https://doi.org/10.1088/1757-899x/819/1/012015
3,40941032,https://openalex.org,Battery Technology,Battery electric vehicles (BEVs) and fuel cell...,A comparative assessment of battery and fuel c...,Energy,Mengyu Li|Xiongwen Zhang|Guojun Li,https://doi.org/10.1016/j.energy.2015.11.023
4,40941035,https://openalex.org,PEM Fuel Cells|Hydrogen Fuel Cells|Fuel Cell D...,This paper develops a proton exchange membrane...,Cost analyses and optimization of a PEMFC elec...,,Yu-Ting Teng|Fu-Cheng Wang,https://doi.org/10.1109/sii.2016.7844112


## Data Cleaning

In [44]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    if pd.isnull(text):
        return ""
    text = str(text).lower()
    text = re.sub(r"\?\?\?", "", text)
    text = re.sub(r"\[\]|\(\)", "", text)
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\bev\b", "electric vehicle", text)
    text = re.sub(r"\bli[\-\s]?ion\b", "lithium ion", text)
    text = re.sub(r"\bh2\b", "hydrogen", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [45]:
relevant_docs['clean_abstract'] = relevant_docs['abstract'].apply(clean_text)
relevant_docs['clean_title'] = relevant_docs['title'].apply(clean_text)

# Remove missing rows
relevant_docs.dropna(subset=['abstract', 'title'], inplace=True)

In [46]:
# DOI validity check
relevant_docs['valid_doi'] = relevant_docs['doi'].apply(lambda x: isinstance(x, str) and x.startswith('https://doi.org/10.'))
invalid_dois = relevant_docs[~relevant_docs['valid_doi']]
print(f"Invalid DOIs found: {len(invalid_dois)}")

Invalid DOIs found: 0


In [47]:
relevant_docs.reset_index(drop=True, inplace=True)

In [48]:
relevant_docs

Unnamed: 0,(internal) id,(source) id,keywords,abstract,title,journal,authors,doi,clean_abstract,clean_title,valid_doi
0,40941026,https://openalex.org,Hydrogen Fuel Cells|Alkaline Fuel Cells|Fuel C...,This paper presents the experimental results o...,An actively controlled fuel cell/battery hybri...,Journal of power sources,Lijun Gao|Zhenhua Jiang|Roger A. Dougal,https://doi.org/10.1016/j.jpowsour.2003.12.052,this paper presents the experimental results o...,an actively controlled fuel cellbattery hybrid...,True
1,40941029,https://openalex.org,Power Management Strategy|Battery Life Optimiz...,The Energy Management Strategy (EMS) in Fuel C...,Multi-Objective Optimization-Based Health-Cons...,Energies,Mehdi Sellali|Alexandre Ravey|Achour Betka|Abd...,https://doi.org/10.3390/en15041318,the energy management strategy ems in fuel cel...,multiobjective optimizationbased healthconscio...,True
2,40941030,https://openalex.org,Battery Life Optimization|Battery Technology,"Currently, major vehicle manufacturers are wor...",The choice of a performance criterion for a hi...,IOP conference series. Materials science and e...,I. A. Lyubimov|Р. Х. Курмаев,https://doi.org/10.1088/1757-899x/819/1/012015,currently major vehicle manufacturers are work...,the choice of a performance criterion for a hi...,True
3,40941032,https://openalex.org,Battery Technology,Battery electric vehicles (BEVs) and fuel cell...,A comparative assessment of battery and fuel c...,Energy,Mengyu Li|Xiongwen Zhang|Guojun Li,https://doi.org/10.1016/j.energy.2015.11.023,battery electric vehicles bevs and fuel cell e...,a comparative assessment of battery and fuel c...,True
4,40941035,https://openalex.org,PEM Fuel Cells|Hydrogen Fuel Cells|Fuel Cell D...,This paper develops a proton exchange membrane...,Cost analyses and optimization of a PEMFC elec...,,Yu-Ting Teng|Fu-Cheng Wang,https://doi.org/10.1109/sii.2016.7844112,this paper develops a proton exchange membrane...,cost analyses and optimization of a pemfc elec...,True
...,...,...,...,...,...,...,...,...,...,...,...
152,40941317,https://openalex.org,Battery Technology|Electric Vehicles|Battery M...,This chapter focuses on the battery electric v...,The Case for Battery Electric Vehicles,Elsevier eBooks,Paul Maccready,https://doi.org/10.1016/b978-012656881-3/50016-2,this chapter focuses on the battery electric v...,the case for battery electric vehicles,True
153,40941318,https://openalex.org,Fuel Cell Vehicles|Battery Life Optimization|B...,Sometimes technology and development of societ...,A new approach to battery powered electric veh...,International journal of hydrogen energy,Roberto Álvarez Fernández|Fernando Beltrán Cil...,https://doi.org/10.1016/j.ijhydene.2016.01.035,sometimes technology and development of societ...,a new approach to battery powered electric veh...,True
154,40941320,https://openalex.org,Power Management Strategy|Battery Life Optimiz...,The use of multi-stack fuel cells (FCs) is att...,Q-learning based energy management strategy fo...,Energy conversion and management,Razieh Ghaderi|Mohsen Kandidayeni|Loïc Boulon|...,https://doi.org/10.1016/j.enconman.2023.117524,the use of multistack fuel cells fcs is attrac...,qlearning based energy management strategy for...,True
155,40941321,https://openalex.org,Power Management Strategy|Battery Life Optimiz...,This paper focus on offline energy management ...,Hybrid Systems Energy Management Using Optimiz...,,Yacine Gaoua|Stéphane Caux|Pierre López|C. Rag...,https://doi.org/10.1109/vppc.2014.7007079,this paper focus on offline energy management ...,hybrid systems energy management using optimiz...,True


## TF-IDF RELEVANCE SCORING

In [49]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
doc_vectors = tfidf_vectorizer.fit_transform(relevant_docs['clean_abstract'])

query = "electric vehicle battery hydrogen lithium ion"
query_vector = tfidf_vectorizer.transform([query])

cos_sim = cosine_similarity(query_vector, doc_vectors).flatten()
relevant_docs['tfidf_score'] = cos_sim

In [50]:
relevant_docs['tfidf_score']

0      0.020184
1      0.026052
2      0.068523
3      0.019536
4      0.056160
         ...   
152    0.188996
153    0.075232
154    0.034895
155    0.037864
156    0.085464
Name: tfidf_score, Length: 157, dtype: float64

In [51]:
relevant_docs.to_csv('cleaned_relevant_docs.csv')

## Word-to-vec scoring