Там надо во первых 
* перевести субтитры в текст, 
* затем разместить эти текстовые документы по папкам с категориями, чтобы склерн мог их взять как X,y, 
* затем преобразовывать X (т.е. текст сам), 
* потом еще попробовать вытащить какую-то мета инфу, типа распределение длинны предложений, распределение длительности появления слов на субтитрах и т.д.

1. Создаем датафрейм с названием фильма, уровнем и названием файла с субтитрами
    * Удаляем лишние столбцы
    * Применяем `.strip()` к названию фильмов (там были переносы строк)
    * Заменяем `/` на `_` в названии уровней
    * Собираем список файлов с субтитрами с помощью `os.listdir()`
    * С помощью `difflib.get_close_matches()` создаем столбец с названиями файлов субтитров
2. Создадим из каждого субтитра plain text (.txt) с помощью `pysubs2` в отдельной папке
3. Распределим txt по подпапкам с уровнем сложности для корректной работы `sklearn.datasets.load_files`

## Создаем датафрейм с названием фильма, уровнем и названием файла с субтитрами

In [11]:

import os
import shutil
import pathlib

import difflib

import pandas as pd

import pysubs2

import re



In [12]:
def labels_preproc(path):
    labels = pd.read_csv(path)
    # labels.columns
    labels = labels.drop(columns=['Kinopoisk ','Subtitles'])
    labels = labels.sort_values('Movie').reset_index(drop=True)

    labels['Movie'] = labels['Movie'].str.strip()

    # with pd.option_context("display.max_rows", 300):
    #     display(labels['Movie'])
    # labels['Movie'].unique()
    # labels['Level'].value_counts()

    labels.loc[labels['Level'] == 'A2/A2+','Level'] = 'A2_A2+'
    # labels.loc[labels['Level'] == 'B1, B2','Level'] = 'A2_A2+'
    labels.loc[labels['Level'] == 'A2/A2+, B1','Level'] = 'A2_A2+, B1'
    return labels

labels = labels_preproc('data/labels.csv')
# labels.head()

In [13]:
def add_subs_file_names(df,path_to_raw_subs):
    # Список уже имеющихся субтитров
    subs_file_list = os.listdir(path_to_raw_subs)
    # subs_file_list = [x.lower() for x in subs_file_list]
    # subs_file_list[:5]
    # labels.head(1)

    # Присваиваем название субтитра по наибольшему совпадению
    for movie in df['Movie']:
        df.loc[df['Movie'] == movie,'sub_file_name'] = difflib.get_close_matches(movie, subs_file_list,cutoff=0)[0]
        
    return df
        
labels = add_subs_file_names(labels,'data/Subtitles_raw/')
labels = labels.drop(index=[39,61]) # Удаляем сериалы
labels.loc[labels['Movie'] == 'Harry Potter (1)', 'sub_file_name'] = 'Harry_Potter_and_the_philosophers_stone(2001).srt'
labels = labels.reset_index(drop=True)

In [14]:

with pd.option_context("display.max_rows", 300):
    display(labels)

Unnamed: 0,Movie,Level,sub_file_name
0,10 Cloverfield Lane,B1,10_Cloverfield_lane(2016).srt
1,10 things I hate about you,B1,10_things_I_hate_about_you(1999).srt
2,A knight’s tale,B2,A_knights_tale(2001).srt
3,A star is born,B2,A_star_is_born(2018).srt
4,Aladdin,A2_A2+,Aladdin(1992).srt
5,All dogs go to heaven,A2_A2+,All_dogs_go_to_heaven(1989).srt
6,An American tail,A2_A2+,An_American_tail(1986).srt
7,Babe,A2_A2+,Babe(1995).srt
8,Back to the future,A2_A2+,Back_to_the_future(1985).srt
9,Batman begins,A2_A2+,Batman_begins(2005).srt


## Создадим из каждого субтитра plain text (.txt) с помощью `pysubs2` в отдельной папке

Удалить HTML теги просто
https://stackoverflow.com/questions/753052/strip-html-from-strings-in-python#:~:text=If%20you%20need%20to%20strip%20HTML

In [15]:
def create_plain_text_str(sub_file,delete_html_tags=True):
    plain_text = ''
    for line in sub_file:  
        plain_text += line.plaintext + ' '
        # plain_text += line.text + ' '
        
    if delete_html_tags:
        plain_text = re.sub('<[^<]+?>', ' ', plain_text)
        
    return plain_text


# subs = pysubs2.load('data/Subtitles_raw/10_Cloverfield_lane(2016).srt',
#                     # keep_html_tags=False,
#                     keep_unknown_html_tags=True
#                    )

# test = create_plain_text_str(subs)
# test

In [16]:
def create_plain_text_files(df, dir_with_subs,dir_with_txt):
    pwd = os.path.abspath(os.getcwd())
    path_from = os.path.join(pwd, dir_with_subs)
    path_to = os.path.join(pwd, dir_with_txt)
    
    try:
        os.mkdir(path_to)
    except:
        print(dir_with_txt, ' already exists. But that\'s fine')
        
    for i in range(len(labels)):
        
        
        sub_name = labels.iloc[i]['sub_file_name']
        
        path_to_sub_file = os.path.join(path_from,sub_name)
      
        sub_obj = pysubs2.load(path_to_sub_file)
        
        plain_text = create_plain_text_str(sub_obj)
        
        plain_text_name = sub_name + '.txt'
        # print(plain_text_name)
        path_to_plain_text_file = os.path.join(path_to,plain_text_name)
        
        
        with open(path_to_plain_text_file, "w",encoding="utf-8") as file:
            # Writing data to a file
            file.write(plain_text)
    
create_plain_text_files(labels, 'data\Subtitles_raw','data\Subtitles_plain_text')

data\Subtitles_plain_text  already exists. But that's fine


## Распределим txt по подпапкам с уровнем сложности для корректной работы `sklearn.datasets.load_files`

In [19]:
# Это мультклассовая разбивка
# Еще стоит разделить на бинарные задачи

def distribute_into_level_subfolders(labels,dir_with_txt_files,dir_with_level_subfolders):
    pwd = os.path.abspath(os.getcwd())
    path_from = os.path.join(pwd, dir_with_txt_files)
    path_to = os.path.join(pwd, dir_with_level_subfolders)
    # print(path_from)
    # print(path_to)
    
    try:
        os.mkdir(path_to)
    except:
        print(dir_with_level_subfolders, ' already exists. But that\'s fine')
        
    for i in range(len(labels)):
        level = labels.iloc[i]['Level']
        txt_name = labels.iloc[i]['sub_file_name'] + '.txt'
        
        # print(level,txt_name)
        
        path_from_ = os.path.join(path_from,txt_name)
        path_to_level = os.path.join(path_to,level)
        path_to_file = os.path.join(path_to_level,txt_name)

        # print(path_from_)
        # print(path_to_)
        try:
            shutil.copy(path_from_, path_to_file)
        except:
            os.mkdir(path_to_level)
            shutil.copy(path_from_, path_to_file)
    
distribute_into_level_subfolders(labels,'data\Subtitles_plain_text','data\Subtitles_multiclass')

In [20]:
# Это мульти бинарная разбивка
# Тут столько задач классификации, сколько уровней
# По принципу определения А1 уровень и не А1 уровень
# И так по каждому уровню

def distribute_into_multi_binary_subfolders(labels,dir_with_txt_files,dir_with_multi_binary_subfolders):
    pwd = os.path.abspath(os.getcwd())
    path_from = os.path.join(pwd, dir_with_txt_files)
    path_to = os.path.join(pwd, dir_with_multi_binary_subfolders)
    # print(path_from)
    # print(path_to)
    
#     Создаем папку где будут лежать каждая из классификаций
    try:
        os.mkdir(path_to)
    except:
        print(dir_with_multi_binary_subfolders, ' already exists. But that\'s fine')
    
    for level in labels['Level'].unique():
        
#         Берем уровень

#         Берем субитры этого уровня
        the_level = labels.loc[labels['Level'] == level,'sub_file_name']
        the_level = the_level + '.txt'
        # display(the_level)
        
        # Берем путь до папки где будут располагаться категории
        path_to_level = os.path.join(path_to,level)
        # print(path_to_level)
        
        # Размещаться они будут так:
        #     B1/B1
        #     B1/not_B1
        
        # Пробуем создать папку с категорией контретного уровня 
        try:
            os.mkdir(path_to_level)
        except:
            print(path_to_level, ' already exists. But that\'s fine')        
        
        
        for txt in the_level:
            # Каждый субтитр этого уровня переносим в папку вида B1/B1
            path_from_file = os.path.join(path_from,txt)
            # print(path_from_file)
            path_to_level_level = os.path.join(path_to_level,level)
            path_to_file = os.path.join(path_to_level_level,txt)
            # print(path_to_file)
            try:
                shutil.copy(path_from_file, path_to_file)
            except:
                os.mkdir(path_to_level_level)
                shutil.copy(path_from_file, path_to_file)
            
            # try:
            #     os.mkdir
        
        # Берем субтитры не этого уровня
        not_level = labels.loc[labels['Level'] != level,'sub_file_name']
        not_level = not_level + '.txt'
        
        for txt in not_level:
            # Каждый этот субтитр переносим в папку вида B1/not_B1
            path_from_file = os.path.join(path_from,txt)
            # print(path_from_file)
            path_to_level_level = os.path.join(path_to_level,'not_' + level)
            path_to_file = os.path.join(path_to_level_level,txt)
            # print(path_to_file)    
            
            try:
                shutil.copy(path_from_file, path_to_file)
            except:
                os.mkdir(path_to_level_level)
                shutil.copy(path_from_file, path_to_file)
        
        

    
distribute_into_multi_binary_subfolders(labels,'data\Subtitles_plain_text','data\Subtitles_multi_binary')

## Словарь с уровнем

In [None]:

# vocab = pd.read_json('vocab/worddata.json')
## vocab = pd.json_normalize('vocab/worddata.json')

vocab = pd.read_json('vocab/englishprofile.json')

# vocab

# vocab[vocab['baseword'] == 'and']

# Что такое `'guideword'` и как обрабатывать, что одно слово принадлежит разным уровням?

# vocab['level'].value_counts()

# Нам не нужны уровни выше B2, поэтому обрежем.

# vocab['level'].sort_values().unique().tolist()

levels = ['A1', 'A2', 'B1', 'B2']

# vocab.loc[vocab['level'].isin(levels)].count()

vocab = vocab.loc[vocab['level'].isin(levels)]

# vocab

# test = vocab.groupby('baseword')['level'].unique()

# pd.DataFrame(test).head(50)

# with pd.option_context("display.max_rows", 300):
#     display(pd.DataFrame(test))

# stopwords.words('english')

## Тест векторизации текста, IT/IDF, логистической регресии [RAW]

In [138]:
import numpy as np
import re
import nltk
from sklearn.datasets import load_files
nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
import pickle
from nltk.corpus import stopwords

from nltk.stem import WordNetLemmatizer

from sklearn.linear_model import LogisticRegression

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from sklearn.model_selection import cross_val_score

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SnakeRZR\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [154]:
def load_data_(path_to_files):
    movie_data = load_files(path_to_files)
    X, y = movie_data.data, movie_data.target

    return X,y

X,y = load_data_('data/Subtitles_multi_binary/B1/')
# print(len(X))
# del X,y

86

In [130]:
def stem(X):
    documents = []

    stemmer = WordNetLemmatizer()

    for sen in range(0, len(X)):
    #     # Remove all the special characters
        document = re.sub(r'\W', ' ', str(X[sen]))

        # remove all single characters
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    #     # Remove single characters from the start
    #     document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 

        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', document, flags=re.I)

        # Removing prefixed 'b'
        document = re.sub(r'^b\s+', '', document)

        # Converting to Lowercase
        document = document.lower()

        # Lemmatization
        document = document.split()

        document = [stemmer.lemmatize(word) for word in document]
        document = ' '.join(document)

        documents.append(document)
    return documents

# len(stem(X))

86

In [152]:
def count_vectorize_(X):
    vectorizer = CountVectorizer(
                            # max_features=1500, 
                             # min_df=5, 
                             # max_df=0.7, 
                             stop_words=stopwords.words('english'),
                            ngram_range = (1,5)
                            )
    X = vectorizer.fit_transform(X).toarray()
    return X
# count_vectorize_(X).shape

In [110]:
def tfidf_transform_(X):
    tfidfconverter = TfidfTransformer()
    X = tfidfconverter.fit_transform(X).toarray()
    return X

In [153]:
def pipeline_(path_to_files):
    X, y = load_data_(path_to_files)
    
    
    # X = stem(X)
        
    X = count_vectorize_(X)
    
    
    # X = tfidf_transform_(X)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, 
                                                    stratify=y
                                                   )
    print(X_train.shape)
    print(X_test.shape)
    logreg = LogisticRegression()
    
    # cv = cross_val_score(logreg,X,y,cv=5,scoring='roc_auc')
    # print(cv)
    
    logreg.fit(X_train,y_train)
    y_pred = logreg.predict(X_test)

    print(confusion_matrix(y_test,y_pred))
    print()
    print(classification_report(y_test,y_pred))
    print()
    print(accuracy_score(y_test, y_pred))
    
pipeline_('data/Subtitles_multi_binary/B1//')    # binary
# pipeline_('data/Subtitles_multiclass/')    # multiclass

(68, 11395)
(18, 11395)
[0.5 0.5 0.5 0.5 0.5]
[[ 0  6]
 [ 0 12]]

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         6
           1       0.67      1.00      0.80        12

    accuracy                           0.67        18
   macro avg       0.33      0.50      0.40        18
weighted avg       0.44      0.67      0.53        18


0.6666666666666666


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Исследование стеммера

In [181]:
test_str = X[0][:1000]

In [182]:
repr(test_str)

'b" Fixed & Synced by bozxphd. Enjoy The Flick  (CLANGING) (DRAWER CLOSES) (INAUDIBLE) (CELL PHONE RINGING) BEN ON PHONE: Michelle, please don\'t hang up. Just talk to me, okay? I can\'t believe you just left. Michelle. Come back. Please say something. Michelle, talk to me. Look, we had an argument. Couples fight. That is no reason to just leave everything behind. Running away isn\'t gonna help it any. Michelle, please... (DIALTONE) NEWSCASTER: More details on that. Elsewhere today, power has still not been restored to many cities on the southern seaboard in the wake of this afternoon\'s widespread blackout. While there had been some inclement weather in the region, the problem seems linked to what authorities are calling a catastrophic power surge that has crippled traffic in the area. - (LOUD CRASH) - (GRUNTS) - (TIRES SCREECHING) - (SCREAMS) - (GLASS SHATTERING) - (GASPING) (GROANS) (HORN HONKING) (INHALES DEEPLY) (SNIFFS) (SIGHS) (GASPING) (CHAINS RATTLING) (BREATHING HEAVILY) (GRU

In [178]:
test_str = str(test_str)
test_str

'b" Fixed & Synced by bozxphd. Enjoy The Flick  (CLANGING) (DRAWER CLOSES) (INAUDIBLE) (CELL PHONE RINGING) BEN ON PHONE: Michelle, please don\'t hang up. Just talk to me, okay? I can\'t believe you just left. Michelle. Come back. Please say something. Michelle, talk to me. Look, we had an argument. Couples fight. That is no reason to just leave everything behind. Running away isn\'t gonna help it any. Michelle, please... (DIALTONE) NEWSCASTER: More details on that. Elsewhere today, power has still not been restored to many cities on the southern seaboard in the wake of this afternoon\'s widespread blackout. While there had been some inclement weather in the region, the problem seems linked to what authorities are calling a catastrophic power surge that has crippled traffic in the area. - (LOUD CRASH) - (GRUNTS) - (TIRES SCREECHING) - (SCREAMS) - (GLASS SHATTERING) - (GASPING) (GROANS) (HORN HONKING) (INHALES DEEPLY) (SNIFFS) (SIGHS) (GASPING) (CHAINS RATTLING) (BREATHING HEAVILY) (GRU

In [170]:
test_str = re.sub(r'\W', ' ', test_str)
test_str

'b  Fixed   Synced by bozxphd  Enjoy The Flick   CLANGING   DRAWER CLOSES   INAUDIBLE   CELL PHONE RINGING  BEN ON PHONE  Michelle  please don t hang up  Just talk to me  okay  I can t believe you just left  Michelle  Come back  Please say something  Michelle  talk to me  Look  we had an argument  Couples fight  That is no reason to just leave everything behind  Running away isn t gonna help it any  Michelle  please     DIALTONE  NEWSCASTER  More details on that  Elsewhere today  power has still not been restored to many cities on the southern seaboard in the wake of this afternoon s widespread blackout  While there had been some inclement weather in the region  the problem seems linked to what authorities are calling a catastrophic power surge that has crippled traffic in the area     LOUD CRASH     GRUNTS     TIRES SCREECHING     SCREAMS     GLASS SHATTERING     GASPING   GROANS   HORN HONKING   INHALES DEEPLY   SNIFFS   SIGHS   GASPING   CHAINS RATTLING   BREATHING HEAVILY   GRUNTIN

In [179]:
import spacy

# Initialize spacy 'en' model, keeping only tagger component needed for lemmatization
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# sentence = "The striped bats are hanging on their feet for best"

# Parse the sentence using the loaded 'en' model object `nlp`
doc = nlp(test_str)

# Extract the lemma for each token and join
" ".join([token.lemma_ for token in doc])
#> 'the strip bat be hang on -PRON- foot for good'

'b " Fixed & sync by bozxphd . enjoy the Flick   ( clanging ) ( drawer close ) ( INAUDIBLE ) ( cell phone RINGING ) BEN on phone : Michelle , please do not hang up . just talk to I , okay ? I can not believe you just leave . Michelle . come back . please say something . Michelle , talk to I . look , we have an argument . couple fight . that be no reason to just leave everything behind . run away be not go to help it any . Michelle , please ... ( dialtone ) newscaster : More detail on that . elsewhere today , power have still not be restore to many city on the southern seaboard in the wake of this afternoon \'s widespread blackout . while there have be some inclement weather in the region , the problem seem link to what authority be call a catastrophic power surge that have cripple traffic in the area . - ( LOUD crash ) - ( grunt ) - ( TIRES SCREECHING ) - ( SCREAMS ) - ( GLASS SHATTERING ) - ( GASPING ) ( GROANS ) ( HORN HONKING ) ( INHALES DEEPLY ) ( SNIFFS ) ( SIGHS ) ( GASPING ) ( c

AttributeError: 'spacy.tokens.doc.Doc' object has no attribute 'lemma_'

In [161]:
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer 

nltk.download('punkt')

In [171]:
test_str = str(test_str)
lemmatizer = WordNetLemmatizer()
word_list = nltk.word_tokenize(test_str)
word_list


['b',
 'Fixed',
 'Synced',
 'by',
 'bozxphd',
 'Enjoy',
 'The',
 'Flick',
 'CLANGING',
 'DRAWER',
 'CLOSES',
 'INAUDIBLE',
 'CELL',
 'PHONE',
 'RINGING',
 'BEN',
 'ON',
 'PHONE',
 'Michelle',
 'please',
 'don',
 't',
 'hang',
 'up',
 'Just',
 'talk',
 'to',
 'me',
 'okay',
 'I',
 'can',
 't',
 'believe',
 'you',
 'just',
 'left',
 'Michelle',
 'Come',
 'back',
 'Please',
 'say',
 'something',
 'Michelle',
 'talk',
 'to',
 'me',
 'Look',
 'we',
 'had',
 'an',
 'argument',
 'Couples',
 'fight',
 'That',
 'is',
 'no',
 'reason',
 'to',
 'just',
 'leave',
 'everything',
 'behind',
 'Running',
 'away',
 'isn',
 't',
 'gon',
 'na',
 'help',
 'it',
 'any',
 'Michelle',
 'please',
 'DIALTONE',
 'NEWSCASTER',
 'More',
 'details',
 'on',
 'that',
 'Elsewhere',
 'today',
 'power',
 'has',
 'still',
 'not',
 'been',
 'restored',
 'to',
 'many',
 'cities',
 'on',
 'the',
 'southern',
 'seaboard',
 'in',
 'the',
 'wake',
 'of',
 'this',
 'afternoon',
 's',
 'widespread',
 'blackout',
 'While',
 'the

## merge countvectorizer + vocab

1. Берем словарь
2. Лемматизируем его (basewords)
3. Берем субтитры (одного фильма для теста)
4. Обрабатываем (убираем дичь)
5. Лемматизируем той же штукой, что и словарь
6. Мерджим по слову

In [190]:
vocab = pd.read_json('vocab/englishprofile.json')

levels = ['A1', 'A2', 'B1', 'B2']

vocab = vocab.loc[vocab['level'].isin(levels)]

vocab = vocab.groupby('baseword')['level'].unique()

vocab = pd.DataFrame(vocab)
# vocab

In [198]:
import spacy

# Initialize spacy 'en' model, keeping only tagger component needed for lemmatization
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# sentence = "The striped bats are hanging on their feet for best"

# Parse the sentence using the loaded 'en' model object `nlp`
doc = nlp(test_str)

# Extract the lemma for each token and join
" ".join([token.lemma_ for token in doc])
#> 'the strip bat be hang on -PRON- foot for good'

def spacy_lemmatize(x,nlp):
    str_ = npl(x)
    result = " ".join([token.lemma_ for token in str_])
    return result

vocab = vocab.reset_index()
vocab['lemmas'] = vocab['basewords'].apply(spacy_lemmatize,npl)
vocab
    

ValueError: [E866] Expected a string or 'Doc' as input, but got: <class 'bytes'>.

In [203]:
# vocab = vocab.reset_index()
vocab

Unnamed: 0,baseword,level
0,all along,[B2]
1,all in all,[B2]
2,all of a sudden,[B2]
3,be used to sb/sth/doing sth,[B1]
4,bound to do sth,[B2]
...,...,...
6167,zebra,[B2]
6168,zero,"[A2, B1]"
6169,zip,[B2]
6170,zone,[B1]


In [214]:
def str_clean(x):

#     # Remove all the special characters
    x = re.sub(r'\W', ' ', x)

    # remove all single characters
    x = re.sub(r'\s+[a-zA-Z]\s+', ' ', x)

    # Remove single characters from the start
    x = re.sub(r'\^[a-zA-Z]\s+', ' ', x) 

    # Substituting multiple spaces with single space
    x = re.sub(r'\s+', ' ', x, flags=re.I)

    # Removing prefixed 'b'
    x = re.sub(r'^b\s+', '', x)

    # Converting to Lowercase
    x = x.lower()

#     # Lemmatization
#     document = document.split()

#     document = [stemmer.lemmatize(word) for word in document]
#     document = ' '.join(document)

#     documents.append(document)
    return x

# len(stem(X))

In [223]:
import nltk

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    result = [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]
    result = " ".join(result)
    return result

# df = pd.DataFrame(['this was cheesy', 'she likes these books', 'wow this is great'], columns=['text'])
vocab['text_lemmatized'] = vocab['baseword'].apply(str_clean).apply(lemmatize_text)

In [227]:
vocab_string = ' '.join(vocab['baseword'].apply(str_clean).apply(lemmatize_text).unique().tolist())

In [233]:
cv = CountVectorizer()
vocab_cv = cv.fit_transform([vocab_string])
# vocab_string

In [236]:
cv.get_feature_names_out()

array(['10', '20', 'abandon', ..., 'zip', 'zone', 'zoo'], dtype=object)

In [239]:
vocab_df = pd.DataFrame(vocab_cv.toarray(),columns = cv.get_feature_names_out()).T

In [241]:
vocab_df

Unnamed: 0,0
10,2
20,1
abandon,1
abandoned,1
ability,1
...,...
zebra,1
zero,1
zip,1
zone,1


In [244]:
vocab_df.sort_values(0,ascending=False).head(20)

Unnamed: 0,0
sth,614
sb,281
etc,190
in,170
or,169
up,156
the,153
to,136
of,119
be,118
