### To install

In [None]:
!pip install spacy
!pip install nltk 
!python3 -m spacy download en_core_web_sm

# Download the dataset (the `aclImdb` folder) to the root of the project, in the same directory as this notebook

In [3]:
import spacy
from nltk.stem import PorterStemmer
from spacy import displacy
import os
from pathlib import Path
from collections import Counter

# Load the small English spaCy pipeline once; the cells below reuse this
# `nlp` object for every document they process.
nlp = spacy.load("en_core_web_sm")

2023-09-11 16:29:49.406155: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


#### Main custom function for data analysis

In [4]:
def process_document(sentence, remove_stopwords=True, top_n=5):
    """Run the spaCy pipeline on a text and print basic token statistics.

    Prints, in order: the tokens, their Porter stems, their spaCy lemmas
    (each with a count), any stop words still present among the analysed
    tokens, and the most frequent words. Word frequencies are
    case-insensitive and skip punctuation and stop words.

    Args:
        sentence: Raw text to analyse.
        remove_stopwords: When True, tokens that spaCy flags as stop words
            are dropped before any statistic is computed.
        top_n: Number of most frequent words to report (default 5).

    Returns:
        The spaCy doc produced for ``sentence``. It is the full parse:
        stop-word removal only affects the printed statistics.
    """
    doc = nlp(sentence)

    # Removing stop words from doc (the statistics below use doc_tokens,
    # the returned doc is left untouched)
    if remove_stopwords:
        doc_tokens = [token for token in doc if not token.is_stop]
    else:
        doc_tokens = doc

    # Tokenization
    tokens = [token.text for token in doc_tokens]
    print("Tokens:", tokens)
    print("Number of tokens:", len(tokens))

    # Initialize stemmer and Stemming of tokens
    stemmer = PorterStemmer()
    stems = [stemmer.stem(word) for word in tokens]
    print("Stems:", stems)
    print("Number of Stems:", len(stems))

    # Lemmas
    lemmas = [token.lemma_ for token in doc_tokens]
    print("Lemmas:", lemmas)
    print("Number of Lemmas:", len(lemmas))

    # Stop words: sanity check, this list is empty whenever
    # remove_stopwords=True because they were filtered out above
    stop_words = [token.text for token in doc_tokens if token.is_stop]
    print("Checking for Stop Words:", stop_words)

    # Counting the frequency of words, lower-cased so "The" and "the" merge
    word_freq = Counter(
        token.text.lower()
        for token in doc_tokens
        if not token.is_punct and not token.is_stop
    )

    # Get the most common words
    common_words = word_freq.most_common(top_n)
    print("Most common words:", common_words)

    return doc

#### Results of sample file aclImdb/test/neg/0_2.txt
Raw movie review 
tokens, stems, and lemmas all equaled 187 tokens

After stop words removed 
tokens, stems, and lemmas = 79 tokens

Most common words with counter: 
[('costner', 4), ('care', 3), ('kutcher', 3), ('ghosts', 2), ('closet', 2)]

An image of the results produced by the named entity visualizer can be viewed in Figure 1 of the report. 

In [7]:
# Analyse the sample review aclImdb/test/neg/0_2.txt and save its
# named-entity visualisation as an SVG under img_res/test/neg/.
file_path = '0_2.txt'
input_path = Path('aclImdb/test/neg') / file_path

# Only the read needs the file handle, so the processing happens after the
# `with` block has closed it. The encoding is explicit so the result does not
# depend on the platform's default.
with open(input_path, 'r', encoding='utf-8') as file:
    text = file.read().replace('\n', '')

# process the data raw
print("Data Raw\n")
process_document(text, remove_stopwords=False)
print("\n\n")
print("Data With stop words removed\n")
# process the data with stop words removed
doc = process_document(text, remove_stopwords=True)

svg = displacy.render(doc, style="ent", minify=True, jupyter=False)
file_name = file_path.split(".")[0]
output_path = Path("img_res/test/neg/" + file_name + "_ent.svg")
# Create the output folder on first run instead of failing with FileNotFoundError
output_path.parent.mkdir(parents=True, exist_ok=True)
# write_text opens, writes and closes the file (the old open(...).write(...)
# never closed its handle)
output_path.write_text(svg, encoding="utf-8");

Data Raw

Tokens: ['Once', 'again', 'Mr.', 'Costner', 'has', 'dragged', 'out', 'a', 'movie', 'for', 'far', 'longer', 'than', 'necessary', '.', 'Aside', 'from', 'the', 'terrific', 'sea', 'rescue', 'sequences', ',', 'of', 'which', 'there', 'are', 'very', 'few', 'I', 'just', 'did', 'not', 'care', 'about', 'any', 'of', 'the', 'characters', '.', 'Most', 'of', 'us', 'have', 'ghosts', 'in', 'the', 'closet', ',', 'and', 'Costner', "'s", 'character', 'are', 'realized', 'early', 'on', ',', 'and', 'then', 'forgotten', 'until', 'much', 'later', ',', 'by', 'which', 'time', 'I', 'did', 'not', 'care', '.', 'The', 'character', 'we', 'should', 'really', 'care', 'about', 'is', 'a', 'very', 'cocky', ',', 'overconfident', 'Ashton', 'Kutcher', '.', 'The', 'problem', 'is', 'he', 'comes', 'off', 'as', 'kid', 'who', 'thinks', 'he', "'s", 'better', 'than', 'anyone', 'else', 'around', 'him', 'and', 'shows', 'no', 'signs', 'of', 'a', 'cluttered', 'closet', '.', 'His', 'only', 'obstacle', 'appears', 'to', 'be', '

#### Code for running on a directory

This cell works and produced the most insight, because it shows the techniques used in the `process_document` function applied across the various reviews.  

Ideally I'd want to make separate output files for this data, the entity recognizer and the dependency visualizer. 

In [None]:
dir_path = "aclImdb/test/neg"
output_path = 'img_res/test/neg/'
# Off by default so the cell only prints statistics for each review.
# Set to True to also write one named-entity SVG per review into output_path.
SAVE_ENTITY_SVG = False

# Iterate directory in sorted order so repeated runs print the reviews in
# the same sequence (os.listdir order is arbitrary)
for file_name in sorted(os.listdir(dir_path)):
    # Create the full file path by joining the directory path and filename
    file_path = os.path.join(dir_path, file_name)

    # Skip anything that is not a regular file (e.g. sub-directories)
    if not os.path.isfile(file_path):
        continue

    # Read the text from the file; explicit encoding avoids depending on the
    # platform default
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # process the data raw
    print("Data Raw\n")
    process_document(text, remove_stopwords=False)
    print("\n\n")
    print("Data With stop words removed\n")
    # process the data with stop words removed
    doc = process_document(text, remove_stopwords=True)

    if SAVE_ENTITY_SVG:
        # Creating the entity Recognizer
        svg = displacy.render(doc, style="ent", minify=True, jupyter=False)
        # Build the name from the bare file name: splitting file_path would
        # drag the input directory into the output path
        svg_path = Path(output_path) / (Path(file_name).stem + "_ent.svg")
        svg_path.parent.mkdir(parents=True, exist_ok=True)
        svg_path.write_text(svg, encoding="utf-8")

### Generating SVGs of the dependency parse
The dependency visualizer, dep, shows part-of-speech tags and syntactic dependencies.   
Creates a custom-colored one in compact mode, which draws the connections as squarer, right-angled arrows.   
Images can be seen in Figure 2 of the report. 


In [47]:

res = []  # not used in this cell; kept in case another cell reads it
dir_path = "aclImdb/test/neg"
output_path = 'img_res/test/neg/'

# Rendering options are the same for every file, so build them once
options = {"compact": True, "bg": "#09a3d5", "color": "white", "font": "Source Sans Pro"}

# Make sure the output folder exists before the first write
output_dir = Path(output_path)
output_dir.mkdir(parents=True, exist_ok=True)

# Iterate directory in sorted order so runs are reproducible
for file_name in sorted(os.listdir(dir_path)):
    file_path = os.path.join(dir_path, file_name)
    # check if current file_path is a file
    if not os.path.isfile(file_path):
        continue

    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    doc = nlp(text)

    # Creating the dependency visualizer
    svg = displacy.render(doc, style="dep", minify=True, jupyter=False, options=options)

    # Name the SVG after the bare file name. The previous
    # file_path.split(".")[0] kept the input directory, producing
    # img_res/test/neg/aclImdb/test/neg/<name>_dep.svg instead of
    # img_res/test/neg/<name>_dep.svg.
    svg_path = output_dir / (Path(file_name).stem + "_dep.svg")
    # write_text closes the file after writing
    svg_path.write_text(svg, encoding="utf-8")

#### Printing readable pdf

In [10]:
# Export this notebook to PDF; the recorded output shows nbconvert checking
# the pandoc version and building the PDF with xelatex.
!jupyter nbconvert --to pdf SteveA-NLP-1.1.ipynb

[NbConvertApp] Converting notebook SteveA-NLP-1.1.ipynb to pdf
Your version must be at least (1.12.1) but less than (3.0.0).
Refer to https://pandoc.org/installing.html.
Continuing with doubts...
  check_pandoc_version()
[NbConvertApp] Writing 43236 bytes to notebook.tex
[NbConvertApp] Building PDF
[NbConvertApp] Running xelatex 3 times: ['xelatex', 'notebook.tex', '-quiet']
[NbConvertApp] Running bibtex 1 time: ['bibtex', 'notebook']
[NbConvertApp] PDF successfully created
[NbConvertApp] Writing 52482 bytes to SteveA-NLP-1.1.pdf


In [14]:
!git 

On branch main
Your branch is up to date with 'origin/main'.

Changes to be committed:
  (use "git restore --staged <file>..." to unstage)
	[32mmodified:   .gitignore[m
	[32mnew file:   Module_2_Lab_Session.ipynb[m
	[32mmodified:   SteveA-NLP-1.1.ipynb[m
	[32mnew file:   SteveA-NLP-1.1.pdf[m

