# Подключение библиотек

In [173]:
import requests
import collections
import numpy as np
import pandas as pd

# Обработка текста
import re
import string
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Алгоритмы для подсчёта слов
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Загрузка данных

In [179]:
data_url = requests.get('https://www.gutenberg.org/files/11/11-0.txt')
text = data_url.content.decode('utf-8')
text[:1000]

'\ufeffThe Project Gutenberg eBook of Alice’s Adventures in Wonderland, by Lewis Carroll\r\n\r\nThis eBook is for the use of anyone anywhere in the United States and\r\nmost other parts of the world at no cost and with almost no restrictions\r\nwhatsoever. You may copy it, give it away or re-use it under the terms\r\nof the Project Gutenberg License included with this eBook or online at\r\nwww.gutenberg.org. If you are not located in the United States, you\r\nwill have to check the laws of the country where you are located before\r\nusing this eBook.\r\n\r\nTitle: Alice’s Adventures in Wonderland\r\n\r\nAuthor: Lewis Carroll\r\n\r\nRelease Date: January, 1991 [eBook #11]\r\n[Most recently updated: October 12, 2020]\r\n\r\nLanguage: English\r\n\r\nCharacter set encoding: UTF-8\r\n\r\nProduced by: Arthur DiBianca and David Widger\r\n\r\n*** START OF THE PROJECT GUTENBERG EBOOK ALICE’S ADVENTURES IN WONDERLAND ***\r\n\r\n[Illustration]\r\n\r\n\r\n\r\n\r\nAlice’s Adventures in Wonderland\r

# Обработка названий глав

In [144]:
chapter_titles = [
    'CHAPTER I.     Down the Rabbit-Hole',
    'CHAPTER II.    The Pool of Tears',
    'CHAPTER III.   A Caucus-Race and a Long Tale',
    'CHAPTER IV.    The Rabbit Sends in a Little Bill',
    'CHAPTER V.     Advice from a Caterpillar',
    'CHAPTER VI.    Pig and Pepper',
    'CHAPTER VII.   A Mad Tea-Party',
    'CHAPTER VIII.  The Queen’s Croquet-Ground',
    'CHAPTER IX.    The Mock Turtle’s Story',
    'CHAPTER X.     The Lobster Quadrille',
    'CHAPTER XI.    Who Stole the Tarts?',
    'CHAPTER XII.   Alice’s Evidence']

for i, title in enumerate(chapter_titles):
  title = title.lower()
  title = "".join([char for char in title if char not in string.punctuation+'”“’—'])
  chapter_titles[i] = re.sub(' +', ' ', title)
  
chapter_titles

['chapter i down the rabbithole',
 'chapter ii the pool of tears',
 'chapter iii a caucusrace and a long tale',
 'chapter iv the rabbit sends in a little bill',
 'chapter v advice from a caterpillar',
 'chapter vi pig and pepper',
 'chapter vii a mad teaparty',
 'chapter viii the queens croquetground',
 'chapter ix the mock turtles story',
 'chapter x the lobster quadrille',
 'chapter xi who stole the tarts',
 'chapter xii alices evidence']

# Обработка текста: избавление от лишней информации и от знаков пунктуации

In [178]:
text = text.replace('\r', ' ')
text = text.replace('\n', ' ')
text = text.lower()
text = "".join([char for char in text if char not in string.punctuation+'”‘“’—'])
text = re.sub(' +', ' ', text)

text = text.rpartition(chapter_titles[0])[1]+text.rpartition(chapter_titles[0])[2].rpartition('the end')[0]
text_to_split = text

text[:1000]

'chapter i down the rabbithole alice was beginning to get very tired of sitting by her sister on the bank and of having nothing to do once or twice she had peeped into the book her sister was reading but it had no pictures or conversations in it and what is the use of a book thought alice without pictures or conversations so she was considering in her own mind as well as she could for the hot day made her feel very sleepy and stupid whether the pleasure of making a daisychain would be worth the trouble of getting up and picking the daisies when suddenly a white rabbit with pink eyes ran close by her there was nothing so very remarkable in that nor did alice think it so very much out of the way to hear the rabbit say to itself oh dear oh dear i shall be late when she thought it over afterwards it occurred to her that she ought to have wondered at this but at the time it all seemed quite natural but when the rabbit actually took a watch out of its waistcoatpocket and looked at it and the

# Разделение текста на главы

In [146]:
chapters = collections.deque([])

for i in range(len(chapter_titles)-1, -1, -1):
  head, chapter = text.partition(chapter_titles[i])[0], text.split(chapter_titles[i])[-1]
  text = head
  chapters.appendleft(chapter.strip())

chapters = list(chapters)

# Обработка глав: исключение стоп слов и применение стемминга

In [147]:
for i, chapter in enumerate(chapters):
  words = word_tokenize(chapter)
  stop_words = stopwords.words('english')

  filtered_words = [word for word in words if word not in stop_words]

  porter = PorterStemmer()
  stemmed = [porter.stem(word) for word in filtered_words]

  chapters[i] = ' '.join(stemmed)

# Поиск топ-20 часто встречающихся слов в главах через TF-IDF

In [148]:
chapters_tfidfs = []

for chapter in chapters:
  tfidfv = TfidfVectorizer()
  tfidf = tfidfv.fit_transform([chapter]).toarray()
  features = tfidfv.get_feature_names_out()
  chapters_tfidfs.append((features, np.squeeze(tfidf)))

In [149]:
for i, chapter in enumerate(chapters_tfidfs):
  features, tfidf = chapter
  df = pd.DataFrame({'TF-IDF': tfidf}, index=features)
  df.index.name = 'Слово'
  df = df.sort_values('TF-IDF', ascending=False)
  print(f'Глава {i+1}:\n', df.head(20))
  print('\n')

Глава 1:
            TF-IDF
Слово            
alic     0.418097
littl    0.223980
like     0.179184
way      0.164252
think    0.164252
get      0.149320
see      0.149320
door     0.134388
one      0.134388
tri      0.119456
eat      0.119456
could    0.119456
said     0.119456
say      0.119456
thought  0.119456
time     0.119456
go       0.119456
wonder   0.119456
thing    0.104524
went     0.104524


Глава 2:
            TF-IDF
Слово            
alic     0.373840
littl    0.244434
mous     0.230055
im       0.186920
said     0.172541
go       0.158163
dear     0.158163
cri      0.143784
thing    0.143784
like     0.143784
thought  0.129406
must     0.129406
went     0.129406
oh       0.129406
way      0.115028
feet     0.115028
time     0.115028
one      0.115028
cat      0.115028
come     0.100649


Глава 3:
          TF-IDF
Слово          
said   0.519584
alic   0.351483
mous   0.305638
know   0.183383
dodo   0.183383
one    0.122255
soon   0.106973
prize  0.091691
thing  0.09169

# Поиск топ-20 часто встречающихся слов через LDA

In [172]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(chapters)

lda = LatentDirichletAllocation(n_components=len(chapters))
lda.fit(X)

feature_names = vectorizer.get_feature_names_out()

for i, topic in enumerate(lda.components_):
    print(f"Глава {i+1}:")
    print("\n".join([feature_names[i] for i in topic.argsort()[:-21:-1]]))
    print()

Глава 1:
said
alic
king
gryphon
turtl
mock
would
look
white
one
voic
hatter
queen
court
could
littl
know
rabbit
juri
danc

Глава 2:
splash
late
whose
green
tureen
tide
retir
adventuresbegin
pretti
pennyworth
theyyouv
bound
yell
mouthsand
button
seashor
faintli
gay
eyelid
disobey

Глава 3:
said
alic
caterpillar
im
pigeon
serpent
littl
well
tri
minut
ive
think
chang
size
last
know
one
bit
your
use

Глава 4:
splash
late
whose
green
tureen
tide
retir
adventuresbegin
pretti
pennyworth
theyyouv
bound
yell
mouthsand
button
seashor
faintli
gay
eyelid
disobey

Глава 5:
alic
said
queen
look
littl
one
head
rabbit
came
king
like
heard
get
began
thought
voic
went
go
grow
three

Глава 6:
said
alic
mous
know
dodo
one
soon
bird
dri
call
round
lori
get
say
thing
would
look
prize
ill
long

Глава 7:
splash
late
whose
green
tureen
tide
retir
adventuresbegin
pretti
pennyworth
theyyouv
bound
yell
mouthsand
button
seashor
faintli
gay
eyelid
disobey

Глава 8:
splash
late
whose
green
tureen
tide
retir
adventur