<a href="https://colab.research.google.com/github/sudip2k17/TextPreprocessingWithSpacy/blob/main/Preprocessing_Pipeline_with_SpaCy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Preprocessing Pipeline with SpaCy**

This notebook uses spaCy for stopword removal and lemmatization, then feeds the cleaned text to scikit-learn's `TfidfVectorizer` to extract TF-IDF n-gram features.

In [1]:
import spacy
import string
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the small English spaCy pipeline once; the resulting `nlp` object is
# the notebook-level pipeline that preprocess_text() calls.
# NOTE(review): the model package presumably has to be installed separately
# (e.g. `python -m spacy download en_core_web_sm`) — confirm for this environment.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Custom text preprocessing function
def _is_punctuation(token):
    """Return True when ``token`` consists of punctuation and should be dropped."""
    # spaCy's own flag recognises Unicode punctuation (em dashes, curly quotes)
    # and multi-character marks such as "..." that a plain membership test
    # against string.punctuation misses.
    if token.is_punct:
        return True
    # ASCII fallback, checked character by character: keeps dropping symbols
    # such as "$" or "+" that spaCy does not classify as punctuation, and
    # avoids relying on accidental substring matches for longer tokens.
    return all(char in string.punctuation for char in token.text)


def preprocess_text(text):
    """Normalize raw text into a space-separated string of lemmas.

    The text is lowercased and run through the notebook-level spaCy pipeline
    ``nlp``. Whitespace tokens, stopwords and punctuation tokens are
    discarded; the lemma of every remaining token is kept.

    Args:
        text: Input string to clean.

    Returns:
        A single string containing the surviving lemmas joined by one space.
        The string is empty when no token survives the filters.
    """
    doc = nlp(text.lower())  # Lowercase first, then tokenize with spaCy
    lemmas = [
        token.lemma_
        for token in doc
        if not (token.is_space or token.is_stop or _is_punctuation(token))
    ]
    return " ".join(lemmas)

In [7]:
# Example text: one biomedical sentence run through the preprocessing function
sample_text = "IgE sensitization to Aspergillus fumigatus and a positive sputum fungal culture result are common in patients with refractory asthma ."
processed_text = preprocess_text(sample_text)
print(processed_text)  # Output: "ige sensitization aspergillus fumigatus positive sputum fungal culture result common patient refractory asthma"

# Apply TF-IDF on processed text
# NOTE: the corpus holds a single document, so every term has the same document
# frequency and the IDF weights cannot discriminate between terms; this cell
# mainly demonstrates which uni-/bi-/tri-gram features are extracted.
corpus = [processed_text]  # One-element list: the vectorizer expects an iterable of documents
vectorizer = TfidfVectorizer(ngram_range=(1,3))  # Extract uni-, bi- and tri-grams
X_tfidf = vectorizer.fit_transform(corpus)

# Show extracted features (the learned n-gram vocabulary)
print(vectorizer.get_feature_names_out())

ige sensitization aspergillus fumigatus positive sputum fungal culture result common patient refractory asthma
['aspergillus' 'aspergillus fumigatus' 'aspergillus fumigatus positive'
 'asthma' 'common' 'common patient' 'common patient refractory' 'culture'
 'culture result' 'culture result common' 'fumigatus' 'fumigatus positive'
 'fumigatus positive sputum' 'fungal' 'fungal culture'
 'fungal culture result' 'ige' 'ige sensitization'
 'ige sensitization aspergillus' 'patient' 'patient refractory'
 'patient refractory asthma' 'positive' 'positive sputum'
 'positive sputum fungal' 'refractory' 'refractory asthma' 'result'
 'result common' 'result common patient' 'sensitization'
 'sensitization aspergillus' 'sensitization aspergillus fumigatus'
 'sputum' 'sputum fungal' 'sputum fungal culture']
