# Basic text analysis without a training dataset

## Reading in a text file

In [34]:
import spacy
nlp = spacy.load('en_core_web_sm')
nlp.max_length = 1_200_000
from collections import Counter
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [20]:
fn = "/home/peter/Documents/gh/sc_bash/1260-0.txt"
text = open(fn, 'r').read()

## Tokenization

In [21]:
text_doc = nlp(text)

In [24]:
sentences = list(text_doc.sents)
print(sentences[10])

Folds of scarlet drapery shut in my view to the right hand; to the left
were the clear panes of glass, protecting, but not separating me from
the drear November day.


## Stopwords removal

In [27]:
normalized_text = [token for token in text_doc if not token.is_stop]
print(normalized_text[0:65])

[CHAPTER, 


, possibility, taking, walk, day, ., 
, wandering, ,, ,, leafless, shrubbery, hour, morning, ;, 
, dinner, (, Mrs., Reed, ,, company, ,, dined, early, ), 
, cold, winter, wind, brought, clouds, sombre, ,, rain, 
, penetrating, ,, outdoor, exercise, 
, question, ., 

, glad, :, liked, long, walks, ,, especially, chilly, 
, afternoons, :, dreadful, coming, home, raw, twilight, ,, 
, nipped, fingers]


## Punctuation removal

In [28]:
clean_text = [token for token in normalized_text if not token.is_punct]
print(clean_text[0:65])

[CHAPTER, 


, possibility, taking, walk, day, 
, wandering, leafless, shrubbery, hour, morning, 
, dinner, Mrs., Reed, company, dined, early, 
, cold, winter, wind, brought, clouds, sombre, rain, 
, penetrating, outdoor, exercise, 
, question, 

, glad, liked, long, walks, especially, chilly, 
, afternoons, dreadful, coming, home, raw, twilight, 
, nipped, fingers, toes, heart, saddened, chidings, 
, Bessie, nurse, humbled, consciousness, physical, 
, inferiority, Eliza, John, Georgiana]


## Lemmatization

In [29]:
for token in clean_text:
    print(token, token.lemma_)

CHAPTER chapter



 



possibility possibility
taking take
walk walk
day day

 

wandering wandering
leafless leafless
shrubbery shrubbery
hour hour
morning morning

 

dinner dinner
Mrs. Mrs.
Reed Reed
company company
dined dine
early early

 

cold cold
winter winter
wind wind
brought bring
clouds cloud
sombre sombre
rain rain

 

penetrating penetrate
outdoor outdoor
exercise exercise

 

question question


 


glad glad
liked like
long long
walks walk
especially especially
chilly chilly

 

afternoons afternoon
dreadful dreadful
coming come
home home
raw raw
twilight twilight

 

nipped nip
fingers finger
toes toe
heart heart
saddened sadden
chidings chiding

 

Bessie Bessie
nurse nurse
humbled humble
consciousness consciousness
physical physical

 

inferiority inferiority
Eliza Eliza
John John
Georgiana Georgiana
Reed Reed


 


said say
Eliza Eliza
John John
Georgiana Georgiana
clustered cluster
round round
mama mama

 

drawing drawing
room room
lay lie
reclined recline
sofa

## Word Frequency Count

In [33]:
words = [token.text for token in clean_text if not token.is_stop and not token.is_punct and not token.is_sent_end]
word_freq = Counter(words)
common_words = word_freq.most_common(25)
print(common_words)

[('\n', 12223), ('\n\n', 2376), ('said', 577), ('Mr.', 543), ('like', 392), ('Rochester', 364), ('little', 332), ('Jane', 330), ('Miss', 303), ('sir', 286), ('room', 260), ('thought', 255), ('Mrs.', 252), ('long', 243), ('time', 242), ('day', 233), ('shall', 230), ('know', 221), ('looked', 220), ('night', 220), ('think', 213), ('good', 205), ('John', 200), ('come', 194), ('face', 190)]


## Sentiment Analysis

In [35]:
analyzer = SentimentIntensityAnalyzer()
vs = analyzer.polarity_scores(text)