# Lexos Visualizations

This script does the following:

1. Creates spaCy docs from a list of text files.
2. Converts the tokens to lower case and filters them to remove digits, punctuation, and whitespace.
3. Tests that the loader is working properly.
4. Store data in a document term matrix.
5. Generates a dendogram from the dtm.

## Configuration

Configure a list of file paths, the labels you wish to use for each document, and the language model you wish to use to parse the texts.

Note that converting long texts to spaCy docs can take a long time.

In [1]:
# Replace with your own data
data = [
    r"C:\Users\jack\OneDrive\Documents\School\summer22\LexosRepo\lexos\tests\test_data\txt\Austen_Pride.txt",
    r"C:\Users\jack\OneDrive\Documents\School\summer22\LexosRepo\lexos\tests\test_data\txt\Austen_Sense.txt"
]
labels = ["Pride", "Sense"]
model = "en_core_web_sm"

## Import Lexos API

In [2]:
# Set local import path
import os
import sys
LEXOS_PATH = "lexos"
if "NOTEBOOK_INITIATED_FLAG" not in globals():
    NOTEBOOK_INITIATED_FLAG = True
    try:
        module_path = os.path.join(os.path.dirname(__file__), os.pardir)
    except:
        module_path = os.path.abspath(os.path.join(LEXOS_PATH))
        %cd lexos
        %pwd
    if module_path not in sys.path:
        sys.path.append(module_path)
        
# Import Lexos API modules
from lexos.io.basic import Loader
from lexos import tokenizer
from lexos.dtm import DTM
from lexos.cutter import Ginsu
try:
    from lexos.cluster.dendrogram import Dendrogram
except ImportError:
    print("Dendogram not imported.")

C:\Users\jack\OneDrive\Documents\School\summer22\LexosRepo\lexos\lexos


## Load Texts and Convert to spaCy Docs

In [3]:
# Create the loader and load the data
loader = Loader()
loader.load(data)

# Make the docs -- currently takes a long time with full novels
docs = tokenizer.make_docs(loader.texts, model=model)

## Ensure Loader is working correctly

In [4]:
for i, text in enumerate(docs):
    print(text[0:50])
    print("\n")

 Pride and Prejudice
by Jane Austen
Chapter 1
It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.
However little known the feelings or views of such a man


SENSE AND SENSIBILITY
by Jane Austen
(1811)
CHAPTER 1
The family of Dashwood had long been settled in Sussex. Their estate
was large, and their residence was at Norland Park, in the centre of
their property, where,




## Vectorizer

In [5]:
from lexos.tokenizer.lexosdoc import LexosDoc

lexos_doc = LexosDoc(docs)
tokens = lexos_doc.get_tokens()
#print (tokens)

In [6]:
from textacy.representations.vectorizers import Vectorizer
from lexos.tokenizer.lexosdoc import LexosDoc

vectorizer = Vectorizer(
    tf_type="linear",
    idf_type=None,
    norm=None
)

tokenised_docs = (LexosDoc(doc).get_tokens() for doc in docs)
doc_term_matrix = vectorizer.fit_transform(tokenised_docs)

In [7]:
doc_term_matrix = vectorizer.fit_transform(tokens)
print (doc_term_matrix)

  (0, 2)	119323
  (0, 3)	499
  (0, 4)	3508
  (0, 10)	747
  (0, 20)	18
  (0, 21)	18
  (0, 22)	30
  (0, 23)	9117
  (0, 24)	1424
  (0, 29)	6171
  (0, 31)	20
  (0, 46)	19
  (0, 59)	16
  (0, 70)	16
  (0, 81)	18
  (0, 92)	10
  (0, 95)	6
  (0, 97)	7
  (0, 98)	6
  (0, 99)	133
  (0, 100)	1538
  (0, 101)	462
  (0, 102)	446
  (0, 411)	145
  (0, 544)	2555
  :	:
  (1, 22)	3
  (1, 23)	9900
  (1, 24)	2725
  (1, 29)	4918
  (1, 31)	19
  (1, 46)	16
  (1, 59)	15
  (1, 70)	15
  (1, 81)	6
  (1, 92)	5
  (1, 95)	6
  (1, 97)	6
  (1, 98)	5
  (1, 99)	66
  (1, 100)	1572
  (1, 101)	452
  (1, 102)	554
  (1, 411)	254
  (1, 544)	2438
  (1, 602)	431
  (1, 749)	263
  (1, 1022)	601
  (1, 1108)	2
  (1, 1109)	2
  (1, 1111)	39892


## Cut texts

In [8]:
cutter = Ginsu()
list_of_segmented_docs = cutter.splitn(docs, n=3)

print(f"Number of docs: {len(list_of_segmented_docs)}\n")
for i, segmented_doc in enumerate(list_of_segmented_docs):
    print(f"Doc {i+1}:\n")
    for j, segment in enumerate(segmented_doc):
        print(f"Segment {j+1}:\n")
        print(segment.text[0:25])
        print()

Number of docs: 2

Doc 1:

Segment 1:

 Pride and Prejudice
by 

Segment 2:

report which shortly prev

Segment 3:

that the authority of a s

Doc 2:

Segment 1:

SENSE AND SENSIBILITY
by

Segment 2:

many years date. He was u

Segment 3:

other, I forget
who. So 



## Initiate DTM

In [9]:
from lexos.dtm import DTM
labels = ["Pride_and_Prejudice", "Sense_and_Sensibility"]
dtm = DTM(docs, labels)

labels = ["Pride1", "Pride2", "Pride3", "Sense1", "Sense2", "Sense3"]
#result = [doc.text for doc in result]
#print (result[0])
dtm2 = DTM(list_of_segmented_docs, labels)

## Create Dendogram

In [10]:
dendrogram = Dendrogram(dtm2, show=True)

ValueError: 7 columns passed, passed data had 3 columns

## Create WordCloud

In [None]:
from lexos.visualization.cloud.wordcloud import multicloud
labels = dtm.get_table().columns.tolist()[1:]
multicloud(dtm, docs=None, opts=None, ncols=3, title=None, labels=None, show=True, figure_opts=None, round=None, filename=None)

## Create BubbleViz

In [None]:
from lexos.visualization.bubbleviz import bubbleviz
bubbleviz(dtm)