# Day 7

Use Gensim to preprocess data from a sample text file, following basic procedures such as tokenization, stemming, and lemmatization.


In [9]:
!pip install gensim nltk



In [4]:
# Download spaCy's small English pipeline; the preprocessing cell loads it
# with spacy.load("en_core_web_sm").
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [7]:
import gensim
from gensim.utils import simple_preprocess
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
import spacy
import os

import nltk
# Fetch the NLTK corpora up front; the stop-word list read below needs the
# 'stopwords' package to be present on disk.
nltk.download('stopwords')
# NOTE(review): nothing visible in this notebook reads WordNet (lemmatization
# goes through spaCy) -- this download looks unnecessary; confirm before removing.
nltk.download('wordnet')

# Shared, module-level NLP resources used by the helper functions below.
stemmer = SnowballStemmer("english")  # used by stem_tokens
nlp = spacy.load("en_core_web_sm")  # used by lemmatize_tokens
stop_words = set(stopwords.words("english"))  # a set, for fast membership tests in remove_stopwords

def tokenize(text):
    """Split raw text into tokens with Gensim's ``simple_preprocess``.

    Args:
        text: The raw input string.

    Returns:
        The list of tokens produced by ``simple_preprocess``.
    """
    # deacc=True asks Gensim to strip accent marks from the tokens
    # (e.g. "café" -> "cafe"); it is not what removes punctuation.
    tokens = simple_preprocess(text, deacc=True)
    return tokens

def stem_tokens(tokens):
    """Reduce every token to its stem using the module-level ``stemmer``.

    Args:
        tokens: An iterable of token strings.

    Returns:
        A new list holding the stem of each token, in the original order.
    """
    return list(map(stemmer.stem, tokens))

def lemmatize_tokens(tokens):
    """Lemmatize tokens by running them through the module-level spaCy pipeline.

    The tokens are joined with single spaces into one string, parsed by
    ``nlp``, and the lemma of each resulting spaCy token is collected.

    Args:
        tokens: An iterable of token strings.

    Returns:
        A list of lemma strings, excluding any lemma equal to "-PRON-".
    """
    joined_text = " ".join(tokens)
    lemmas = []
    for word in nlp(joined_text):
        lemma = word.lemma_
        # NOTE(review): "-PRON-" appears to be the spaCy v2 pronoun placeholder;
        # with a v3 model this filter may never match -- confirm.
        if lemma == "-PRON-":
            continue
        lemmas.append(lemma)
    return lemmas

def remove_stopwords(tokens):
    """Drop every token that appears in the module-level ``stop_words`` set.

    Args:
        tokens: An iterable of token strings.

    Returns:
        A new list of the tokens that are not stop words, in the original order.
    """
    kept = []
    for word in tokens:
        if word in stop_words:
            continue
        kept.append(word)
    return kept

def preprocess_text(text):
    """Complete preprocessing pipeline: tokenize, remove stopwords, lemmatize, and stem.

    Lemmatization runs before stemming so that the lemmatizer receives real
    words; stemming first would hand it truncated stems (e.g. "lazi",
    "sentenc") that it cannot resolve to a dictionary form.

    Args:
        text: The raw input string.

    Returns:
        The list of processed tokens.
    """
    tokens = tokenize(text)
    tokens = remove_stopwords(tokens)
    tokens = lemmatize_tokens(tokens)
    tokens = stem_tokens(tokens)
    return tokens

def preprocess_file(file_path):
    """Read a UTF-8 text file and run its contents through ``preprocess_text``.

    Args:
        file_path: Path of the text file to process.

    Returns:
        The list of processed tokens returned by ``preprocess_text``.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    if os.path.exists(file_path):
        with open(file_path, "r", encoding="utf-8") as handle:
            raw_text = handle.read()
        return preprocess_text(raw_text)

    raise FileNotFoundError(f"The file {file_path} does not exist.")

# Demo entry point: preprocess the sample file and print the resulting tokens.
if __name__ == "__main__":
    # Relative path, so it is resolved against the current working directory.
    sample_file = "sample_text.txt"  

    try:
        processed_tokens = preprocess_file(sample_file)
        print("Processed Tokens:", processed_tokens)
    except Exception as e:
        # Any failure (including a missing file) is reported as a one-line
        # message instead of a traceback.
        print(f"Error: {e}")
        
# The file sample_text.txt contains: The quick brown fox jumps over the lazy dog. This sentence contains every letter of the English alphabet. It's often used to test typewriters and keyboards. Let's preprocess this text!


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\akash\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\akash\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Processed Tokens: ['quick', 'brown', 'fox', 'jump', 'lazi', 'dog', 'sentenc', 'contain', 'everi', 'letter', 'english', 'alphabet', 'often', 'use', 'test', 'typewrit', 'keyboard', 'let', 'preprocess', 'text']
