In [None]:
# Mount Google Drive (Colab only); every dataset path below lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
import re

# Clean FUS data

In [None]:
# Load the labelled abstracts: abstract text, fus_related flag and the is_* indication flags.
df = pd.read_csv("/content/drive/MyDrive/MSDS/Capstone/MSDS Capstone/Dataset/final_df.csv")

In [None]:
# Class balance of the raw data
df['fus_related'].value_counts()

0    441
1     48
Name: fus_related, dtype: int64

In [None]:
# Missing values per column
df.isna().sum()

abstract               32
fus_related             0
is_cardiovascular       0
is_endocrine            0
is_gastrointestinal     0
is_musculoskeletal      0
is_neurological         0
is_womenshealth         0
is_miscellaneous        0
dtype: int64

In [None]:
# Class balance of the rows that contain at least one missing value
df[df.isna().any(axis=1)]['fus_related'].value_counts()

0    28
1     4
Name: fus_related, dtype: int64

In [None]:
# Drop every row that has a missing value in any column (rebinds `df`)
df = df.dropna()

In [None]:
# Class balance after removing rows with missing values
df['fus_related'].value_counts()

0    413
1     44
Name: fus_related, dtype: int64

In [None]:
# Duplicates: tabulate how many times each abstract text occurs.
# value_counts() orders the result from most to least frequent.
abstract_counts = (
    df['abstract']
    .value_counts()
    .rename_axis('abstract')
    .reset_index(name='count')
)
abstract_counts

Unnamed: 0,abstract,count
0,Transcranial focused shockwave (FSW) is a nove...,5
1,PURPOSE: To develop an efficient MRI pulse seq...,4
2,"In recent years, veterinary medicine has expan...",4
3,Microbubble enhanced high intensity focused ul...,4
4,An orthotopically allografted mouse GL26 gliom...,4
...,...,...
325,BACKGROUND: The aim of this paper was to compa...,1
326,Sonodynamic therapy involving the non-invasive...,1
327,To improve the ultrasonic energy and realize f...,1
328,BACKGROUND: Transcranial ultrasound imaging an...,1


In [None]:
# Abstracts that occur more than once
abstract_counts[abstract_counts['count']>1]

Unnamed: 0,abstract,count
0,Transcranial focused shockwave (FSW) is a nove...,5
1,PURPOSE: To develop an efficient MRI pulse seq...,4
2,"In recent years, veterinary medicine has expan...",4
3,Microbubble enhanced high intensity focused ul...,4
4,An orthotopically allografted mouse GL26 gliom...,4
...,...,...
86,PURPOSE: In high-intensity focused ultrasound ...,2
87,Cancer is one of the diseases with high mortal...,2
88,Adoptive T-cell therapy against solid tumours ...,2
89,Focused ultrasound (FUS) peripheral neuromodul...,2


In [None]:
# Keep only the first row for each distinct abstract text; the label and
# indication flags of later duplicates are discarded.
df = df.drop_duplicates(subset='abstract', keep='first')

In [None]:
# Class balance after de-duplication
df['fus_related'].value_counts()

0    286
1     44
Name: fus_related, dtype: int64

In [None]:
# Destination on Google Drive for the cleaned data (no NaNs, no duplicate abstracts)
file_path = '/content/drive/MyDrive/MSDS/Capstone/MSDS Capstone/Dataset/ogdata_nodupes_nonas.csv'

# Write as CSV without the row index
df.to_csv(path_or_buf=file_path, index=False)

# Undersample FUS data

In [None]:
# Under-sampling: balance the two classes by randomly discarding rows of the
# larger class. The seed is fixed so the selection is reproducible.
rus = RandomUnderSampler(sampling_strategy='auto', random_state=0)

# Feature (abstract text) and target (label)
X = df['abstract']
y = df['fus_related']

# fit_resample is given the abstracts as a single-column 2-D array
X_resampled, y_resampled = rus.fit_resample(X.values.reshape(-1, 1), y)

# Rebuild a two-column frame from the resampled arrays
resampled_df = pd.DataFrame(X_resampled, columns=['abstract']).assign(fus_related=y_resampled)
resampled_df['fus_related'].value_counts()

0    44
1    44
Name: fus_related, dtype: int64

In [None]:
# Sanity check: missing values in the resampled frame
resampled_df.isna().sum()

abstract       0
fus_related    0
dtype: int64

In [None]:
# Sanity check: number of distinct abstracts in the resampled frame
resampled_df['abstract'].nunique()

88

In [None]:
# Preview the first rows of the resampled frame
resampled_df.head()

Unnamed: 0,abstract,fus_related
0,OBJECTIVES: To assess the comparative safety a...,0
1,With an ever-growing list of neurological appl...,0
2,We report a patient with tremor-dominant Parki...,0
3,"BACKGROUND: Herein, a robotic system offering ...",0
4,BACKGROUND: Focused ultrasound (FUS) is a medi...,0


In [None]:
# TODO: get rid of filler words at the start of abstracts, e.g. "BACKGROUND:" and "OBJECTIVE:"

In [None]:
# Destination on Google Drive for the under-sampled data
file_path = '/content/drive/MyDrive/MSDS/Capstone/MSDS Capstone/Dataset/resampled_data.csv'

# Write as CSV without the row index
resampled_df.to_csv(path_or_buf=file_path, index=False)

# Concatenate FUS data + Sean's data

In [None]:
# Load Sean's labelled workbook and bring it to the same two-column schema
# (abstract, fus_related) as the original data before concatenating.
sean_df = pd.read_excel("/content/drive/MyDrive/MSDS/Capstone/MSDS Capstone/Sean's Work/Book 3 (4).xlsx")
sean_df = sean_df.drop('Link', axis=1)

# mapping from string labels to the numeric labels used in `df`
mapping = {'NonFus': 0, 'Fus': 1}

# Series.map turns any label missing from `mapping` into NaN without warning,
# so fail loudly instead of letting unlabeled rows slip into the dataset.
unmapped = ~sean_df['Type'].isin(list(mapping))
if unmapped.any():
    raise ValueError(
        f"Unexpected values in 'Type': {sean_df.loc[unmapped, 'Type'].unique().tolist()}"
    )

sean_df['Type'] = sean_df['Type'].map(mapping)
sean_df = sean_df.rename(columns={'Summary': 'abstract', 'Type': 'fus_related'})
sean_df = sean_df[['abstract', 'fus_related']]

# Original data without the indication (is_*) columns. Select by name rather
# than by position so a change in column order cannot pick the wrong columns.
df = df[['abstract', 'fus_related']]

# Stack the original rows first and Sean's rows second, with a fresh index
concatenated_df = pd.concat([df, sean_df], ignore_index=True)

In [None]:
# Class balance of Sean's data
sean_df['fus_related'].value_counts()

0    49
1    41
Name: fus_related, dtype: int64

In [None]:
# Missing values in Sean's data; a NaN in fus_related would mean a label was not in `mapping`
sean_df.isna().sum()

abstract       0
fus_related    0
dtype: int64

In [None]:
sean_df['abstract'].nunique()  # equals the row count (49 + 41), so no duplicates

90

In [None]:
# Class balance of the original (cleaned) data, for comparison
df['fus_related'].value_counts()

0    286
1     44
Name: fus_related, dtype: int64

In [None]:
df['abstract'].nunique()  # equals the row count (286 + 44), so no duplicates

330

In [None]:
# Class balance of the combined data
concatenated_df['fus_related'].value_counts()

0    335
1     85
Name: fus_related, dtype: int64

In [None]:
concatenated_df['abstract'].nunique()  # fewer than the row count (335 + 85), so there are duplicates

418

In [None]:
#duplicates
# Duplicates: list every abstract that appears more than once, with its occurrence count
concatenated_df[concatenated_df.duplicated(subset=['abstract'], keep=False)]['abstract'].value_counts()

Modern transcranial magnetic resonance-guided focused ultrasound is an incisionless, ablative treatment modality for a growing number of neurologic disorders. This procedure selectively destroys a targeted volume of cerebral tissue and relies on real-time MR thermography to monitor tissue temperatures. By focusing on a submillimeter target through a hemispheric phased array of transducers, ultrasound waves pass through the skull and avoid overheating and brain damage. High-intensity focused ultrasound techniques are increasingly used to create safe and effective stereotactic ablations for medication-refractory movement and other neurologic and psychiatric disorders.                                                                                                                                                                                                                                                                                                                                      

In [None]:
# Keep the first occurrence of each abstract. Because `df` was placed before
# `sean_df` in the concat, the original-data row (and its label) wins.
concatenated_df = concatenated_df.drop_duplicates(subset='abstract', keep='first')

In [None]:
# Class balance after de-duplication
concatenated_df['fus_related'].value_counts()

0    335
1     83
Name: fus_related, dtype: int64

In [None]:
# Display the combined frame (pandas truncates the middle rows)
concatenated_df

Unnamed: 0,abstract,fus_related
0,We investigated the effect of low-intensity fo...,1
1,The blood-brain barrier (BBB) protects the bra...,1
2,OBJECTIVE: Alzheimer's disease (AD) is often a...,1
3,Background Neurodegenerative disorders (such a...,1
4,BACKGROUND: Coronary microthromboembolism afte...,1
...,...,...
415,The tremendous progress in engineering and com...,1
416,To make full use of the ability of magnetic re...,1
417,Various kinds of image-guided techniques have ...,1
418,High intensity focused ultrasound (HIFU) is ra...,1


In [None]:
# Destination on Google Drive for the combined (original + Sean) data
file_path = '/content/drive/MyDrive/MSDS/Capstone/MSDS Capstone/Dataset/concatenated_data.csv'

# Write as CSV without the row index
concatenated_df.to_csv(path_or_buf=file_path, index=False)

# Undersample FUS + Sean's data

In [None]:
# Balance the combined data by randomly discarding rows of the larger class.
# The seed is fixed so the selection is reproducible.
rus = RandomUnderSampler(sampling_strategy='auto', random_state=0)

# Feature (abstract text) and target (label)
X = concatenated_df['abstract']
y = concatenated_df['fus_related']

# fit_resample is given the abstracts as a single-column 2-D array
X_resampled, y_resampled = rus.fit_resample(X.values.reshape(-1, 1), y)

# Rebuild a two-column frame from the resampled arrays
concatenated_df2 = pd.DataFrame(X_resampled, columns=['abstract']).assign(fus_related=y_resampled)
concatenated_df2['fus_related'].value_counts()

0    83
1    83
Name: fus_related, dtype: int64

In [None]:
# Sanity check: missing values in the under-sampled combined frame
concatenated_df2.isna().sum()

abstract       0
fus_related    0
dtype: int64

In [None]:
# Sanity check: number of distinct abstracts in the under-sampled combined frame
concatenated_df2['abstract'].nunique()

166

In [None]:
# Destination on Google Drive for the under-sampled combined data
file_path = '/content/drive/MyDrive/MSDS/Capstone/MSDS Capstone/Dataset/concatenated_data_undersamp.csv'

# Write as CSV without the row index
concatenated_df2.to_csv(path_or_buf=file_path, index=False)

# Access zotero data

In [None]:
# Read the Zotero exports zotero_1.csv ... zotero_22.csv, one DataFrame per file
files = [
    pd.read_csv(f"/content/drive/MyDrive/MSDS/Capstone/MSDS Capstone/Dataset/zotero_data/zotero_{i}.csv")
    for i in range(1, 23)
]

# Stack them into a single frame with a fresh 0..n-1 index
all_files = pd.concat(files, ignore_index=True)

In [None]:
# Keep only the abstract text from the Zotero exports.
# NOTE: this rebinds `df`, which held the original FUS data in the sections above.
df = pd.DataFrame(all_files['Abstract Note'])
df

Unnamed: 0,Abstract Note
0,INTRODUCTION: Essential tremor is the most com...
1,Inhibition of asparagine endopeptidase (AEP) h...
2,Traditional cancer treatments have been associ...
3,AIM: To investigate the ovarian function and p...
4,GOAL: To develop a low-cost magnetic resonance...
...,...
1955,Microbubbles are increasingly used in several ...
1956,Acoustic cavitation has found a wide range of ...
1957,BACKGROUND: The phase correction on transcrani...
1958,


In [None]:
# Number of Zotero entries without an abstract
df.isna().sum()

Abstract Note    118
dtype: int64

In [None]:
# Drop Zotero entries that have no abstract
df = df.dropna()

In [None]:
df.isna().sum()  # confirm no missing values remain

Abstract Note    0
dtype: int64

In [None]:
# Label every Zotero abstract in this frame as FUS-related.
# `assign` returns a new frame, so this does not trigger the
# SettingWithCopyWarning that in-place column assignment raises on the
# result of the preceding dropna().
df = df.assign(fus_related=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fus_related'] = 1


In [None]:
# Use the same column name as the other datasets so the frames can be concatenated
df = df.rename(columns = {'Abstract Note':'abstract'})

In [None]:
# Display the labelled Zotero frame (pandas truncates the middle rows)
df

Unnamed: 0,abstract,fus_related
0,INTRODUCTION: Essential tremor is the most com...,1
1,Inhibition of asparagine endopeptidase (AEP) h...,1
2,Traditional cancer treatments have been associ...,1
3,AIM: To investigate the ovarian function and p...,1
4,GOAL: To develop a low-cost magnetic resonance...,1
...,...,...
1952,Cavitation is a critical parameter in various ...,1
1955,Microbubbles are increasingly used in several ...,1
1956,Acoustic cavitation has found a wide range of ...,1
1957,BACKGROUND: The phase correction on transcrani...,1


In [None]:
df['abstract'].nunique()  # number of distinct abstracts (original note: there are duplicates)

1712

In [None]:
#df[df.duplicated(subset=['abstract'], keep=False)]['abstract'].value_counts()

In [None]:
# Keep the first occurrence of each abstract, then display the result
df = df.drop_duplicates(subset='abstract', keep='first')
df

Unnamed: 0,abstract,fus_related
0,INTRODUCTION: Essential tremor is the most com...,1
1,Inhibition of asparagine endopeptidase (AEP) h...,1
2,Traditional cancer treatments have been associ...,1
3,AIM: To investigate the ovarian function and p...,1
4,GOAL: To develop a low-cost magnetic resonance...,1
...,...,...
1952,Cavitation is a critical parameter in various ...,1
1955,Microbubbles are increasingly used in several ...,1
1956,Acoustic cavitation has found a wide range of ...,1
1957,BACKGROUND: The phase correction on transcrani...,1


In [None]:
# Non-FUS Zotero data: load it, align the abstract column name with the other
# frames, and drop the 'Unnamed: 0' column that the CSV carries.
nofus_df = (
    pd.read_csv("/content/drive/MyDrive/MSDS/Capstone/MSDS Capstone/Dataset/diag_zotero.csv")
    .rename(columns={'Abstract Note': 'abstract'})
    .drop(columns=['Unnamed: 0'])
)
nofus_df

Unnamed: 0,abstract,fus_related
0,OBJECTIVE: To critically appraise and quantify...,0
1,OBJECTIVES: To determine whether there are cli...,0
2,Rationale: Delirium severity and duration are ...,0
3,Delirium severity has been associated with a h...,0
4,PURPOSE: We aimed to determine any association...,0
...,...,...
2214,The paper touches upon the specific features o...,0
2215,Musculoskeletal ultrasound involves the use of...,0
2216,OBJECTIVE: The objective was to evaluate the f...,0
2217,A 44-year-old woman was admitted with obvious ...,0


In [None]:
nofus_df.isna().sum()  # confirm there are no missing values

abstract       0
fus_related    0
dtype: int64

In [None]:
nofus_df['abstract'].nunique()  # number of distinct abstracts (original note: there are duplicates)

2183

In [None]:
#nofus_df[nofus_df.duplicated(subset=['abstract'], keep=False)]['abstract'].value_counts()

In [None]:
# Keep the first occurrence of each abstract, then display the result
nofus_df = nofus_df.drop_duplicates(subset='abstract', keep='first')
nofus_df

Unnamed: 0,abstract,fus_related
0,OBJECTIVE: To critically appraise and quantify...,0
1,OBJECTIVES: To determine whether there are cli...,0
2,Rationale: Delirium severity and duration are ...,0
3,Delirium severity has been associated with a h...,0
4,PURPOSE: We aimed to determine any association...,0
...,...,...
2214,The paper touches upon the specific features o...,0
2215,Musculoskeletal ultrasound involves the use of...,0
2216,OBJECTIVE: The objective was to evaluate the f...,0
2217,A 44-year-old woman was admitted with obvious ...,0


In [None]:
# Add the Zotero data to the previously saved combined data.
# Order matters for any later keep='first' de-duplication: FUS Zotero rows
# come first, then the saved combined data, then the non-FUS Zotero rows.
concat_df = pd.read_csv("/content/drive/MyDrive/MSDS/Capstone/MSDS Capstone/Dataset/concatenated_data.csv")
zotero_df = pd.concat(
    [df, concat_df, nofus_df],
    ignore_index=True,
)
zotero_df['fus_related'].value_counts()

0    2518
1    1795
Name: fus_related, dtype: int64

In [None]:
# Sanity check: missing values in the merged frame
zotero_df.isna().sum()

abstract       0
fus_related    0
dtype: int64

In [None]:
zotero_df['abstract'].nunique()  # fewer than the row count (2518 + 1795), so there are duplicates

4311

In [None]:
#zotero_df[zotero_df.duplicated(subset=['abstract'], keep=False)]['abstract'].value_counts()

In [None]:
# Keep the first occurrence of each abstract (exact, case-sensitive text match).
# NOTE(review): abstracts are lowercased only later in this notebook, so
# duplicates that differ only in letter case would not be caught here - confirm.
zotero_df = zotero_df.drop_duplicates(subset='abstract', keep='first')

In [None]:
# Class balance after de-duplication
zotero_df['fus_related'].value_counts()

0    2517
1    1794
Name: fus_related, dtype: int64

In [None]:
# Display the merged frame (pandas truncates the middle rows)
zotero_df

Unnamed: 0,abstract,fus_related
0,INTRODUCTION: Essential tremor is the most com...,1
1,Inhibition of asparagine endopeptidase (AEP) h...,1
2,Traditional cancer treatments have been associ...,1
3,AIM: To investigate the ovarian function and p...,1
4,GOAL: To develop a low-cost magnetic resonance...,1
...,...,...
4308,The paper touches upon the specific features o...,0
4309,Musculoskeletal ultrasound involves the use of...,0
4310,OBJECTIVE: The objective was to evaluate the f...,0
4311,A 44-year-old woman was admitted with obvious ...,0


In [None]:
# Undersample class 0 (the larger class) so both classes end up the same size.
# The seed is fixed so the selection is reproducible.
rus = RandomUnderSampler(sampling_strategy='auto', random_state=0)

# Feature (abstract text) and target (label)
X = zotero_df['abstract']
y = zotero_df['fus_related']

# fit_resample is given the abstracts as a single-column 2-D array
X_resampled, y_resampled = rus.fit_resample(X.values.reshape(-1, 1), y)

# Rebuild the frame from the resampled arrays; this replaces `zotero_df`
zotero_df = pd.DataFrame(X_resampled, columns=['abstract']).assign(fus_related=y_resampled)
zotero_df['fus_related'].value_counts()

0    1794
1    1794
Name: fus_related, dtype: int64

In [None]:
zotero_df['abstract'].nunique()  # equals the row count (1794 + 1794), so no duplicates

3588

In [None]:
# Sanity check: missing values in the balanced frame
zotero_df.isna().sum()

abstract       0
fus_related    0
dtype: int64

In [None]:
# abstracts = [x for x in zotero_df['abstract']]
# abstracts

In [None]:
# pattern = r'\b([A-Za-z]+)(?::|\s)\1\b'

# # Perform the replacement
# processed_abstract = [re.sub(pattern, '', x) for x in zotero_df['abstract']]

# # Print the result to verify
# processed_abstract

In [None]:
# Lowercase every abstract using the vectorised string accessor instead of a
# hand-written Python loop; the result is the same for a column of strings.
zotero_df['abstract'] = zotero_df['abstract'].str.lower()
zotero_df

Unnamed: 0,abstract,fus_related
0,background: the intensive care environment and...,0
1,objectives: to determine associations between ...,0
2,objective: to describe methodology used to dia...,0
3,background: caring for lightly sedated intubat...,0
4,diagnostic ultrasound activates the contact ph...,0
...,...,...
3583,high-intensity focused ultrasound (hifu) is em...,1
3584,the tremendous progress in engineering and com...,1
3585,to make full use of the ability of magnetic re...,1
3586,high intensity focused ultrasound (hifu) is ra...,1


In [None]:
# clean_abstract = [re.sub(r'^\w+:\s*,?', '',x,1) for x in lower_abstract]
# clean_abstract

In [None]:
# Destination on Google Drive for the balanced, lowercased Zotero dataset
file_path = '/content/drive/MyDrive/MSDS/Capstone/MSDS Capstone/Dataset/zotero_data.csv'

# Write as CSV without the row index
zotero_df.to_csv(path_or_buf=file_path, index=False)

# Final Sanity Check Data - Mike Reviewed Data


In [None]:
# Folder with the unzipped publication exports
pub_dir = ('/content/drive/MyDrive/MSDS/Capstone/MSDS Capstone/Dataset/'
           'Publication_data_2023-10-02 - 2024-04-01.zip (Unzipped Files)')

# Date stamp of each export file, in load order
pub_dates = [
    '2023-10-02',
    '2023-11-10',
    '2023-12-04',
    '2024-01-02',
    '2024-02-01',
    '2024-03-01',
    '2024-04-01',
]

# Read the 'One-to-one data' sheet of every export. The names file1..file7 are
# kept because the next cell concatenates them by name.
file1, file2, file3, file4, file5, file6, file7 = [
    pd.read_excel(f'{pub_dir}/Publication_data_{date}.xlsx', sheet_name='One-to-one data')
    for date in pub_dates
]

In [None]:
# Stack the seven exports into one frame with a fresh index (rebinds `df`)
df = pd.concat([file1, file2, file3, file4, file5, file6, file7], ignore_index=True)

In [None]:
# Spreadsheet header -> snake_case column name. DataFrame.rename ignores keys
# that are not present in the frame, so unused entries are harmless.
rename_columns = {'PubMed ID':'pubmed_id',
                  'DOI Link':'doi_link',
                  'PMC Link':'pmc_link',
                  'Article Title':'article_title',
                  'ind list':'ind_list',
                  'Author List':'author_list',
                  'First / Last Author':'first_last_author',
                  'Corresponding Author(s)':'corres_author',
                  'Article Reference':'article_ref',
                  'Publication Type':'pub_type',
                  'Abstract':'abstract',
                  'Website Page':'web_page',
                  'Pubmed Link':'pubmed_link'
}

# Keep only the abstract text, drop rows without one, and lowercase it.
# Built as one chain with `assign` so no column is written into the result of
# dropna() (the pattern that raises SettingWithCopyWarning elsewhere in this
# notebook), and with the vectorised .str.lower() instead of a Python loop.
df = (
    df.rename(columns=rename_columns)[['abstract']]
    .dropna()
    .assign(abstract=lambda frame: frame['abstract'].astype(str).str.lower())
)

In [None]:
# Display the cleaned abstracts (pandas truncates the middle rows)
df

Unnamed: 0,abstract
1,transcranial focused shockwave (fsw) is a nove...
2,rationale and objectives: the purpose of this ...
3,objective: we introduce a non-invasive mr-acou...
4,background: moderate-to-severe acute pain is p...
5,objective: medial thalamotomies were introduce...
...,...
497,nanocone clusters (nccs) have been developed a...
498,thrombotic occlusions of large blood vessels a...
499,in the last decades there has been progress in...
500,background: diffuse midline glioma (dmg) is a ...


In [None]:
# Duplicates: tabulate how many times each abstract occurs, then show only
# the ones that appear more than once.
abstract_counts = (
    df['abstract']
    .value_counts()
    .rename_axis('abstract')
    .reset_index(name='count')
)
abstract_counts[abstract_counts['count'] > 1]

Unnamed: 0,abstract,count
0,transcranial focused ultrasound is a novel tec...,3
1,transcranial focused shockwave (fsw) is a nove...,2
2,the blood-brain barrier (bbb) plays a critical...,2
3,objective: the aim of the study described here...,2
4,"introduction: since 1980, extracorporeal shock...",2
...,...,...
62,objective: the use of magnetic resonance-guide...,2
63,purpose: we reported preliminary outcomes of h...,2
64,background: focused ultrasound (fus) shows pro...,2
65,objective: skull density ratio (sdr) influence...,2


In [None]:
# Keep the first occurrence of each (already lowercased) abstract, then display the result
df = df.drop_duplicates(subset='abstract', keep='first')
df

Unnamed: 0,abstract
1,transcranial focused shockwave (fsw) is a nove...
2,rationale and objectives: the purpose of this ...
3,objective: we introduce a non-invasive mr-acou...
4,background: moderate-to-severe acute pain is p...
5,objective: medial thalamotomies were introduce...
...,...
497,nanocone clusters (nccs) have been developed a...
498,thrombotic occlusions of large blood vessels a...
499,in the last decades there has been progress in...
500,background: diffuse midline glioma (dmg) is a ...


In [None]:
# Destination on Google Drive for the sanity-check abstracts
file_path = '/content/drive/MyDrive/MSDS/Capstone/MSDS Capstone/Dataset/sanity_data_tim.xlsx'

# Write as an Excel workbook (not CSV) without the row index
df.to_excel(excel_writer=file_path, index=False)

# Not Reviewed April (Mike)

In [None]:
# NOTE: file1 / file2 reuse names from the previous section and now hold
# different data. Both workbooks are read from their 'One-to-one data' sheet.

# file1: the 2024-04-01 publication export
file1 = pd.read_excel(
    '/content/drive/MyDrive/MSDS/Capstone/MSDS Capstone/Dataset/Publication_data_2023-10-02 - 2024-04-01.zip (Unzipped Files)/Publication_data_2024-04-01.xlsx',
    sheet_name='One-to-one data',
)

# file2: the "Not_Review_April_Mike" workbook
file2 = pd.read_excel(
    '/content/drive/MyDrive/MSDS/Capstone/MSDS Capstone/Dataset/Not_Review_April_Mike.xlsx',
    sheet_name='One-to-one data',
)

In [None]:
# Display the 2024-04-01 export (pandas truncates the middle rows)
file1

Unnamed: 0,PubMed ID,DOI Link,PMC Link,Article Title,Author List,First / Last Author,Corresponding Author(s),Article Reference,Publication Type,Abstract,Website Page,Consolidated Web Info,Pubmed Link
0,37482962,https://doi.org/10.1002/jmri.28916,,"Editorial for ""Alteration of White Matter Conn...","Yeo, Desmond Teck Beng",Yeo DTB,,J Magn Reson Imaging. 2024 Apr;59(4):1371-1372.,Editorial,,,"Yeo DTB. Editorial for ""Alteration of White Ma...",https://pubmed.ncbi.nlm.nih.gov/37482962/
1,37491872,https://doi.org/10.1002/jmri.28896,,Alteration of White Matter Connectivity for MR...,"Wang, Xiaoyu; Lin, Jiaji; Lu, Haoxuan; Xiong, ...",Wang X / Lou X,,J Magn Reson Imaging. 2024 Apr;59(4):1358-1370.,Journal Article,BACKGROUND: Magnetic resonance-guided focused ...,,"Wang X, Lin J, Lu H, Xiong Y, Duan C, Zhang D,...",https://pubmed.ncbi.nlm.nih.gov/37491872/
2,37657095,https://doi.org/10.3171/2023.6.JNS23171,,Successful magnetic resonance-guided focused u...,"Vetkas, Artur; Boutet, Alexandre; Sarica, Can;...",Vetkas A / Lozano AM,,J Neurosurg. 2024 Mar 1;140(3):639-647.,Journal Article,OBJECTIVE: The use of magnetic resonance-guide...,,"Vetkas A, Boutet A, Sarica C, Germann J, Gwun ...",https://pubmed.ncbi.nlm.nih.gov/37657095/
3,37657096,https://doi.org/10.3171/2023.6.JNS231153,,Consistency is key: influence of skull density...,"Kyle, Kain; Maamary, Joel; Jonker, Benjamin; P...",Kyle K / Tisch S,,J Neurosurg. 2024 Mar 1;140(3):648-656.,Journal Article,OBJECTIVE: Skull density ratio (SDR) influence...,,"Kyle K, Maamary J, Jonker B, Peters J, Barnett...",https://pubmed.ncbi.nlm.nih.gov/37657096/
4,37696982,https://doi.org/10.1038/s41434-023-00421-1,,Acoustically targeted noninvasive gene therapy...,"Nouraein, Shirin; Lee, Sangsin; Saenz, Vidal A...",Nouraein S / Szablowski JO,"Szablowski, Jerzy O",Gene Ther. 2024 Mar;31(3-4):85-94.,Journal Article,Focused Ultrasound Blood-Brain Barrier Opening...,,"Nouraein S, Lee S, Saenz VA, Del Mundo HC, Yiu...",https://pubmed.ncbi.nlm.nih.gov/37696982/
...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,38551360,https://doi.org/10.1021/acs.molpharmaceut.3c01178,,Investigation of Optimum Production Conditions...,"Mustafa, Waleed; Hall, Sarah; Huynh, Laura; Ma...",Mustafa W / Yuksel Durmaz Y,,Mol Pharm. 2024 Mar 29.,Journal Article,Nanocone clusters (NCCs) have been developed a...,,"Mustafa W, Hall S, Huynh L, Mannasse R, Lulebu...",https://pubmed.ncbi.nlm.nih.gov/38551360/
80,38553135,https://doi.org/10.1016/j.ultras.2023.107223,,Inducing cavitation within hollow cylindrical ...,"Gong, Li; Wright, Alex R; Hynynen, Kullervo; G...",Gong L / Goertz DE,"Gong, Li",Ultrasonics. 2024 Mar;138:107223.,Journal Article,Thrombotic occlusions of large blood vessels a...,,"Gong L, Wright AR, Hynynen K, Goertz DE. Induc...",https://pubmed.ncbi.nlm.nih.gov/38553135/
81,38553256,https://doi.org/10.1016/j.medcli.2023.12.013,,Essential Tremor: Update of Therapeutic Strate...,"Gironell, Alexandre; Marín-Lahoz, Juan; Póveda...",Gironell A / Póveda S,"Gironell, Alexandre",Med Clin (Barc). 2024 Mar 28:S0025-7753(24)000...,Journal Article; Review,In the last decades there has been progress in...,,"Gironell A, Marín-Lahoz J, Póveda S. Essential...",https://pubmed.ncbi.nlm.nih.gov/38553256/
82,38555449,https://doi.org/10.1186/s12967-024-05096-9,PMC10981822,Focused ultrasound-mediated blood-brain barrie...,"Tazhibi, Masih; McQuillan, Nicholas; Wei, Hong...",Tazhibi M / Wu CC,"Wu, Cheng-Chia; Zacharoulis, Stergios",J Transl Med. 2024 Mar 30;22(1):320.,Journal Article,BACKGROUND: Diffuse midline glioma (DMG) is a ...,,"Tazhibi M, McQuillan N, Wei HJ, Gallitto M, Be...",https://pubmed.ncbi.nlm.nih.gov/38555449/


In [None]:
# Display the "Not_Review_April_Mike" workbook (pandas truncates the middle rows)
file2

Unnamed: 0,PubMed ID,DOI Link,PMC Link,Article Title,Author List,First / Last Author,Corresponding Author(s),Article Reference,Publication Type,Abstract,Website Page,Consolidated Web Info,Pubmed Link
0,35780953,https://doi.org/10.1016/j.jconrel.2022.06.042,,"Reply to Letter from Price et al, re: Translat...","Brighi, Caterina; Salimova, Ekaterina; de Veer...",Brighi C / Egan G,"Egan, Gary",J Control Release. 2024 Feb;366:879.,Letter,,,"Brighi C, Salimova E, de Veer M, Puttick S, Eg...",https://pubmed.ncbi.nlm.nih.gov/35780953/
1,36394665,https://doi.org/10.1007/s00404-022-06761-4,PMC10348920,Letter to the editor: Römer et al. The signifi...,"David, Matthias",David M,"David, Matthias",Arch Gynecol Obstet. 2023 Sep;308(3):1045.,Letter,,,David M. Letter to the editor: Römer et al. Th...,https://pubmed.ncbi.nlm.nih.gov/36394665/
2,36411569,https://doi.org/10.2174/1570159X21666221121094343,PMC10964101,Surgical Advances in Parkinson's Disease.,"Hvingelby, Victor S; Pavese, Nicola",Hvingelby VS / Pavese N,,Curr Neuropharmacol. 2024;22(6):1033-1046.,Review; Journal Article,While symptomatic pharmacological therapy rema...,,"Hvingelby VS, Pavese N. Surgical Advances in P...",https://pubmed.ncbi.nlm.nih.gov/36411569/
3,36533878,https://doi.org/10.1080/10717544.2022.2157068,PMC9769131,Blood-cerebrospinal fluid barrier opening by m...,"Kung, Yi; Wu, Chueh-Hung; Lin, Meng-Ting; Liao...",Kung Y / Hsiao MY,,Drug Deliv. 2023 Dec;30(1):97-107.,Journal Article,Transcranial focused shockwave (FSW) is a nove...,,"Kung Y, Wu CH, Lin MT, Liao WH, Chen WS, Hsiao...",https://pubmed.ncbi.nlm.nih.gov/36533878/
4,36575807,https://doi.org/10.1111/nicc.12871,PMC9880746,Intensive care nurse-led point of care ultraso...,"Corcoran, Eleanor; Hopkins, Phil; Fisher, Rich...",Corcoran E / Rose L,,Nurs Crit Care. 2023 Sep;28(5):781-788.,"Case Reports; Research Support, Non-U.S. Gov't",Focused ultrasound can be used to rapidly diag...,,"Corcoran E, Hopkins P, Fisher R, Wong A, Rose ...",https://pubmed.ncbi.nlm.nih.gov/36575807/
...,...,...,...,...,...,...,...,...,...,...,...,...,...
525,38585032,https://doi.org/10.12786/bn.2024.17.e5,PMC10990843,Update on Non-invasive Brain Stimulation on St...,"Kim, Sejoon; Park, Hae-Yeon",Kim S / Park HY,,Brain Neurorehabil. 2024 Jan 31;17(1):e5.,Journal Article; Review,Stroke is a leading global cause of death and ...,,"Kim S, Park HY. Update on Non-invasive Brain S...",https://pubmed.ncbi.nlm.nih.gov/38585032/
526,38585361,https://doi.org/10.3389/fneur.2024.1362712,PMC10995240,Early cortico-muscular coherence and cortical ...,"Visani, Elisa; Panzica, Ferruccio; Franceschet...",Visani E / Eleopra R,,Front Neurol. 2024 Mar 22;15:1362712.,Journal Article,INTRODUCTION: To investigate cortical network ...,,"Visani E, Panzica F, Franceschetti S, Golfrè A...",https://pubmed.ncbi.nlm.nih.gov/38585361/
527,38589407,https://doi.org/10.1038/s41597-024-03197-0,PMC11002007,A large normative connectome for exploring the...,"Elias, Gavin J B; Germann, Jürgen; Joel, Sures...",Elias GJB / Lozano AM,"Lozano, Andres M",Sci Data. 2024 Apr 8;11(1):353.,Journal Article,Diffusion-weighted MRI (dMRI) is a widely used...,,"Elias GJB, Germann J, Joel SE, Li N, Horn A, B...",https://pubmed.ncbi.nlm.nih.gov/38589407/
528,38595847,https://doi.org/10.3389/fneur.2024.1345873,PMC11002122,The evolution of ventral intermediate nucleus ...,"Jameel, Ayesha; Akgun, Sena; Yousif, Nada; Smi...",Jameel A / Gedroyc W,,Front Neurol. 2024 Mar 26;15:1345873.,Journal Article,BACKGROUND: The ventral intermediate nucleus (...,,"Jameel A, Akgun S, Yousif N, Smith J, Jones B,...",https://pubmed.ncbi.nlm.nih.gov/38595847/


In [None]:
# Label a row 1 when its PubMed ID also appears in file1, otherwise 0.
# `isin` does the membership test in one vectorised pass instead of scanning
# file1's ID array once per row inside `apply`.
file2['fus_related'] = file2['PubMed ID'].isin(file1['PubMed ID']).astype(int)

In [None]:
# Class balance of the derived labels
file2['fus_related'].value_counts()

fus_related
0    446
1     84
Name: count, dtype: int64

In [None]:
# Cleaning the labeled file.
# Spreadsheet header -> snake_case column name. DataFrame.rename ignores keys
# that are not present in the frame, so unused entries are harmless.
rename_columns = {'PubMed ID':'pubmed_id',
                  'DOI Link':'doi_link',
                  'PMC Link':'pmc_link',
                  'Article Title':'article_title',
                  'ind list':'ind_list',
                  'Author List':'author_list',
                  'First / Last Author':'first_last_author',
                  'Corresponding Author(s)':'corres_author',
                  'Article Reference':'article_ref',
                  'Publication Type':'pub_type',
                  'Abstract':'abstract',
                  'Website Page':'web_page',
                  'Pubmed Link':'pubmed_link',
}

# Keep abstract + label, drop rows with a missing value, and lowercase the text.
# Built as one chain with `assign` so no column is written into the result of
# dropna() (the pattern that raises SettingWithCopyWarning elsewhere in this
# notebook), and with the vectorised .str.lower() instead of a Python loop.
file2 = (
    file2.rename(columns=rename_columns)[['abstract', 'fus_related']]
    .dropna()
    .assign(abstract=lambda frame: frame['abstract'].astype(str).str.lower())
)

In [None]:
# Class balance after dropping rows without an abstract
file2['fus_related'].value_counts()

fus_related
0    419
1     75
Name: count, dtype: int64

In [None]:
# Duplicates: tabulate how many times each abstract occurs, then show only
# the ones that appear more than once (none here: the table is empty).
abstract_counts = (
    file2['abstract']
    .value_counts()
    .rename_axis('abstract')
    .reset_index(name='count')
)
abstract_counts[abstract_counts['count'] > 1]

Unnamed: 0,abstract,count


In [None]:
# Destination on Google Drive for the labeled abstracts
file_path = '/content/drive/MyDrive/MSDS/Capstone/MSDS Capstone/Dataset/labeled_data_tim.xlsx'

# Write as an Excel workbook (not CSV) without the row index
file2.to_excel(excel_writer=file_path, index=False)