<a href="https://colab.research.google.com/github/rabina302/StarredPaperMaster/blob/main/scispacy_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import spacy
import scispacy
from scispacy.abbreviation import AbbreviationDetector
from scispacy.umls_linking import UmlsEntityLinker
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout, Dense
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# One-time setup for a fresh runtime: uncomment to install the small scispaCy
# model (release v0.5.3) that spacy.load("en_core_sci_sm") loads below.
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_sm-0.5.3.tar.gz

In [3]:
# Optional: medium scispaCy model (release v0.5.3). No cell visible in this
# notebook loads "en_core_sci_md", so this is only needed if the model name
# passed to spacy.load is switched.
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_md-0.5.3.tar.gz

In [4]:
# Load the small scispaCy pipeline into `nlp`.
# NOTE: the next cell calls spacy.load("en_core_sci_sm") again and rebinds
# `nlp` before adding the abbreviation detector, so this load is superseded
# when the notebook is run top to bottom.
nlp = spacy.load("en_core_sci_sm")

# Disabled experiment, kept for reference only (nothing below is executed):
# # Add abbreviation detector
# abbreviation_pipe = AbbreviationDetector(nlp)
# nlp.add_pipe("abbreviation_detector")

# Disabled experiment, kept for reference only (nothing below is executed):
# # Add UMLS entity linker
# linker = UmlsEntityLinker(resolve_abbreviations=True)
# nlp.add_pipe("entity_linker")

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [5]:
import spacy

from scispacy.abbreviation import AbbreviationDetector

# Start from a freshly loaded pipeline and register the abbreviation detector
# on it; reloading first means re-running this cell never adds the pipe twice.
nlp = spacy.load("en_core_sci_sm")
nlp.add_pipe("abbreviation_detector")

# The backslash continuations keep each following line's leading spaces inside
# the string. The token offsets printed below were produced with that
# whitespace in place, so the literal is left exactly as written.
sample_text = "Spinal and bulbar muscular atrophy (SBMA) is an \
           inherited motor neuron disease caused by the expansion \
           of a polyglutamine tract within the androgen receptor (AR). \
           SBMA can be caused by this easily."
doc = nlp(sample_text)

# One row per detected short form: its text, its (start, end) token span and
# the long form the detector resolved it to.
print("Abbreviation", "\t", "Definition")
for abrv in doc._.abbreviations:
    print(f"{abrv} \t ({abrv.start}, {abrv.end}) {abrv._.long_form}")

Abbreviation 	 Definition
SBMA 	 (6, 7) Spinal and bulbar muscular atrophy
SBMA 	 (33, 34) Spinal and bulbar muscular atrophy
AR 	 (29, 30) androgen receptor


In [6]:
from google.colab import drive

# Mount Google Drive so the dataset under /content/drive/MyDrive is readable.
# (Colab-only: `google.colab` is not importable outside a Colab runtime.)
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# Load the dataset and normalise it in a single chain:
#   1. drop rows where `text` or `label` is missing (other columns may be NaN),
#   2. strip every run of characters that is neither a word character nor
#      whitespace (underscores count as word characters and are kept),
#   3. drop non-ASCII characters via an encode/decode round trip,
#   4. lower-case the labels.
df = (
    pd.read_csv('/content/drive/MyDrive/StarredPaper/Data/data.csv')
    .dropna(subset=['text', 'label'])
    .assign(
        text=lambda frame: (
            frame['text']
            .str.replace(r'[^\w\s]+', '', regex=True)
            .str.encode('ascii', 'ignore')
            .str.decode('ascii')
        ),
        label=lambda frame: frame['label'].str.lower(),
    )
)

In [8]:
# Display the cleaned DataFrame (the notebook renders a truncated preview).
df

Unnamed: 0,title,text,source,label
0,Due to the recent outbreak for the Coronavirus...,You just need to add water and the drugs and v...,coronavirusmedicalkit.com,fake
1,,Hydroxychloroquine has been shown to have a 10...,RudyGiuliani,fake
2,,Fact Hydroxychloroquine has been shown to have...,CharlieKirk,fake
3,,The Corona virus is a man made virus created i...,JoanneWrightForCongress,fake
4,,Doesnt BillGates finance research at the Wuhan...,JoanneWrightForCongress,fake
...,...,...,...,...
1159,Could the Power of the Sun Slow the Coronavirus?,A study suggests that ultraviolet rays could s...,https://www.nytimes.com/,true
1160,Key evidence for coronavirus spread is flawed ...,Last week a medical journal reported that a bu...,https://www.nytimes.com/,true
1161,Summer Heat May Not Diminish Coronavirus Strength,A new report sent to the White House science a...,https://www.nytimes.com/,true
1162,How Long Will a Vaccine Really Take?,A vaccine would be the ultimate weapon against...,https://www.nytimes.com/,true


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build one preprocessed string per row of the "text" column: the lemmas of
# every token that is alphabetic and not a stop word, joined by single spaces.
#
# nlp.pipe() processes the texts as a batched stream, which is spaCy's
# recommended way to run a pipeline over many documents; it yields the same
# Doc objects as calling nlp(text) row by row, in the same order.
features = [
    ' '.join(
        token.lemma_
        for token in parsed_doc
        if not token.is_stop and token.is_alpha
    )
    for parsed_doc in nlp.pipe(df["text"])
]


  global_matches = self.global_matcher(doc)


In [10]:
# Fit a TF-IDF vectorizer with default settings on the preprocessed documents;
# the result is a sparse document-term matrix.
tfidf_features = TfidfVectorizer().fit_transform(features)

# Dense copy of the matrix, used for inspection here and by the cells below.
tfidf_features_array = tfidf_features.toarray()

# Show the (numpy-truncated) matrix under a heading.
print("Extracted TF-IDF features:", tfidf_features_array, sep="\n")

Extracted TF-IDF features:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [11]:
# Disabled: sequence padding is intentionally left commented out.
# NOTE: `max_length` is not assigned by any executed code in this notebook, so
# this line would raise NameError if re-enabled as-is.
# features = pad_sequences(features, maxlen=max_length, padding="post")

In [12]:
# Encode the string labels as integers for the binary classifier:
# 'true' -> 0, 'fake' -> 1.
label_mapping = {'true': 0, 'fake': 1}

# Values that are not keys of the mapping pass through unchanged. That makes
# the cell safe to re-run: already-encoded 0/1 values stay as they are,
# whereas Series.map(dict) would turn them all into NaN on a second run.
encoded_labels = df['label'].map(lambda value: label_mapping.get(value, value))

# Fail fast, before touching `df`, if anything is neither a known label nor an
# already-encoded value; otherwise the problem would only surface later as
# NaN/strings inside the training targets.
unexpected_labels = set(encoded_labels.unique()) - set(label_mapping.values())
if unexpected_labels:
    raise ValueError(
        f"Unexpected label values: {sorted(map(str, unexpected_labels))}; "
        f"expected one of {sorted(label_mapping)}"
    )

df['label'] = encoded_labels

In [13]:
# Pull the encoded label column out as a NumPy array for scikit-learn/Keras.
labels = df["label"].to_numpy()

In [14]:
# Inspect the encoded label array.
labels

array([1, 1, 1, ..., 0, 0, 0])

In [15]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# shuffle identical across runs.
# NOTE(review): the TF-IDF matrix was fitted on the full corpus before this
# split, so vocabulary/IDF statistics from the test rows are already baked into
# the training features. Consider splitting the texts first and fitting the
# vectorizer on the training portion only.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_features_array,
    labels,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Build the model.
#
# The inputs are TF-IDF rows: one dense float vector per document, with one
# weight per vocabulary term. They are not sequences of integer token ids, so
# an Embedding layer (which looks up integer indices) followed by an LSTM is
# the wrong architecture here: the float weights would be treated as indices
# and the LSTM would unroll over one "timestep" per vocabulary term.
# A small feed-forward network consumes the vectors directly.
model = Sequential([
    Dense(128, activation="relu"),   # hidden layer over the TF-IDF vector
    Dropout(0.2),                    # same dropout rate the original model used
    Dense(1, activation="sigmoid"),  # probability of class 1
])
# No input shape is declared: Keras creates the weights on the first call to
# fit(), using the width of the training matrix.

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# Stop when the validation loss has not improved for 3 epochs and roll back to
# the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model.
# Early stopping selects the final weights by validation loss, so the
# validation data must not be the test set: doing so would tune the model on
# the very rows used for the reported metrics. Instead, Keras holds out the
# last 10% of the training rows (taken before per-epoch shuffling) for
# validation, and X_test/y_test stay untouched until evaluation.
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=64,
    validation_split=0.1,
    verbose=2,
    callbacks=[early_stopping],
)

Epoch 1/10


In [None]:
# Score the trained model on the held-out split; evaluate() returns the loss
# followed by the metrics configured in compile() (here: accuracy).
loss, accuracy = model.evaluate(X_test, y_test)
for metric_label, metric_value in (("Loss:", loss), ("Accuracy:", accuracy)):
    print(metric_label, metric_value)

In [None]:
# Sigmoid outputs for the held-out rows.
y_pred_probs = model.predict(X_test)

# Threshold at 0.5: strictly greater than 0.5 becomes class 1, everything else
# class 0. The array keeps the shape of the probabilities.
y_pred_classes = np.where(y_pred_probs > 0.5, 1, 0)

In [None]:


# Compute the four summary metrics on the thresholded predictions, in the same
# order they are reported.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred_classes)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(metric_label, metric_value)

# Confusion matrix: rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_test, y_pred_classes)

# Draw the matrix as an annotated heatmap on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
ax.set_title('Confusion Matrix')
plt.show()
