### To install

In [None]:
!pip install spacy
!pip install nltk 
!python3 -m spacy download en_core_web_sm

In [2]:
import spacy
from nltk.stem import PorterStemmer
from spacy import displacy
import os
from pathlib import Path

# Load the English model
nlp = spacy.load("en_core_web_sm")

2023-09-10 18:57:17.494845: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [65]:
Path("output").mkdir(parents=True, exist_ok=True)

In [39]:
def process_document(sentence, remove_stopwords=True, stemming=False, lemmatization=False):    
    doc = nlp(sentence)
    print(type(doc))

    # Removing stop words from doc
    if remove_stopwords:
        doc__no_stop = [token for token in doc if not token.is_stop]
    
    # Tokenization
    tokens = [token.text for token in doc if not token.is_stop]
    print("Tokens:", tokens)
    print("Number of tokens:", len(tokens))

    # Initialize stemmer and Stemming of tokens
    stemmer = PorterStemmer()
    stems = [stemmer.stem(word) for word in tokens]
    print("Stems:", stems)
    print("Number of Stems:", len(stems))

    # Lemmas
    lemmas = [token.lemma_ for token in doc if not token.is_stop]
    print("Lemmas:", lemmas)
    print("Number of Lemmas:", len(lemmas))

    # Stop words
    stop_words = [token.text for token in doc if not token.is_stop]
    print("Checking for Stop Words:", stop_words)

    return doc

### Process an example of the txt file in the dataset
##### spacey visualization results below
The entity visualizer, ent, highlights named entities and their labels in a text.

In [40]:
# read file inside local folder test/neg/0_2.txt
text = None
with open('aclImdb/test/neg/0_2.txt', 'r') as file:
    text = file.read().replace('\n', '')

    # process the data raw 
    print("Data With stop words removed\n")
    process_document(text, remove_stopwords=False)
    print("\n\n")
    print("Data With stop words removed\n")
    # process the data with stop words removed
    doc = process_document(text, remove_stopwords=True)
    displacy.render(doc, style="ent", jupyter=True)

Data With stop words removed

<class 'spacy.tokens.doc.Doc'>
Tokens: ['Mr.', 'Costner', 'dragged', 'movie', 'far', 'longer', 'necessary', '.', 'Aside', 'terrific', 'sea', 'rescue', 'sequences', ',', 'care', 'characters', '.', 'ghosts', 'closet', ',', 'Costner', 'character', 'realized', 'early', ',', 'forgotten', 'later', ',', 'time', 'care', '.', 'character', 'care', 'cocky', ',', 'overconfident', 'Ashton', 'Kutcher', '.', 'problem', 'comes', 'kid', 'thinks', 'better', 'shows', 'signs', 'cluttered', 'closet', '.', 'obstacle', 'appears', 'winning', 'Costner', '.', 'Finally', 'past', 'half', 'way', 'point', 'stinker', ',', 'Costner', 'tells', 'Kutcher', 'ghosts', '.', 'told', 'Kutcher', 'driven', 'best', 'prior', 'inkling', 'foreshadowing', '.', 'magic', ',', 'turning', 'hour', '.']
Number of tokens: 79
Stems: ['mr.', 'costner', 'drag', 'movi', 'far', 'longer', 'necessari', '.', 'asid', 'terrif', 'sea', 'rescu', 'sequenc', ',', 'care', 'charact', '.', 'ghost', 'closet', ',', 'costner', '

### Results with stop words removed 
tokens  / stems / lemmas = 187

###
tokens  / stems / lemmas = 79

### Generating svg's of depenency parse
The dependency visualizer, dep, shows part-of-speech tags and syntactic dependencies.



In [32]:

res = []
dir_path = "aclImdb/test/neg"
output_path = "image/"
# Iterate directory
for file_path in os.listdir(dir_path):
    # check if current file_path is a file 
    if os.path.isfile(os.path.join(dir_path, file_path)):
        # add filename to list
        res.append(file_path)

        doc = nlp(text)
        svg = displacy.render(doc, style="dep",minify=True,jupyter=False, options={'distance': 60})
        file_name = file_path.split(".")[0]
        output_path = Path("image/" + dir_path + file_name + "-dep.svg")
        output_path.open("w", encoding="utf-8").write(svg)

    

#### Printing readable pdf

In [35]:
!jupyter nbconvert --to pdf SteveA-NLP-1.1.ipynb

[NbConvertApp] Converting notebook SteveA-NLP-1.1.ipynb to pdf
Your version must be at least (1.12.1) but less than (3.0.0).
Refer to https://pandoc.org/installing.html.
Continuing with doubts...
  check_pandoc_version()
[NbConvertApp] Writing 36361 bytes to notebook.tex
[NbConvertApp] Building PDF
[NbConvertApp] Running xelatex 3 times: ['xelatex', 'notebook.tex', '-quiet']
[NbConvertApp] Running bibtex 1 time: ['bibtex', 'notebook']
[NbConvertApp] PDF successfully created
[NbConvertApp] Writing 37850 bytes to SteveA-NLP-1.1.pdf


In [59]:
!git push origin main

Everything up-to-date
