In [175]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
import MeCab
import numpy as np
import pandas as pd
import glob
import seaborn as sns
import matplotlib.pyplot as plt

In [166]:
class MorphologicaAnalysis:
    """TF-IDF vectors over MeCab-tokenised documents, with cosine-similarity ranking.

    The whole corpus is tokenised and vectorised once, in the constructor;
    ``calcurate_cosine_similarity_dataframe`` then ranks every document
    against a chosen one.
    """

    # Parts of speech whose tokens are replaced by their dictionary (base) form.
    _LEMMATIZED_POS = frozenset({'動詞', '形容詞'})
    # Position of the base form in the comma-separated feature string.
    # NOTE(review): index 6 presumably matches the IPAdic feature layout; other
    # MeCab dictionaries may order their fields differently -- confirm.
    _BASE_FORM_INDEX = 6

    def __init__(self, document_list: list[str]) -> None:
        """Tokenise and vectorise ``document_list``.

        Args:
            document_list: Raw documents. A document's position in this list
                is the ``document_id`` accepted by
                ``calcurate_cosine_similarity_dataframe``.
        """
        # Tagger used by _run_mecab to walk the morpheme nodes of a text.
        self.mecab: MeCab.Tagger = MeCab.Tagger("-Owakati")
        # Original documents, kept so results can show the text next to its score.
        self.document_list: list[str] = document_list
        # Dense TF-IDF matrix; row i corresponds to document_list[i].
        self.vectors: np.ndarray = self._run_tfid(self.document_list)

    def _run_mecab(self, text: str) -> list[str]:
        """Split ``text`` into tokens, normalising verbs and adjectives.

        Verbs and adjectives are replaced by their base form so inflected
        variants collapse into one term; every other token keeps its surface
        form. When no usable base form is available (the feature string is too
        short, or the field is empty or ``*``) the surface form is used instead.

        Args:
            text: Text to tokenise.

        Returns:
            The tokens in order of appearance, without the first and last node
            of the chain returned by ``parseToNode``.
        """
        node = self.mecab.parseToNode(text)
        word_list: list[str] = []

        while node:
            # Split once per node; the original re-split the same string per branch.
            features = node.feature.split(',')
            token = node.surface
            if features[0] in self._LEMMATIZED_POS:
                has_base_form = len(features) > self._BASE_FORM_INDEX
                base_form = features[self._BASE_FORM_INDEX] if has_base_form else ''
                if base_form and base_form != '*':
                    token = base_form
            word_list.append(token)
            node = node.next

        # The first and last nodes are dropped -- presumably MeCab's BOS/EOS
        # sentinels, which carry no text.
        return word_list[1:-1]

    def _run_tfid(self, document_list: list[str]) -> np.ndarray:
        """Fit a TF-IDF model on ``document_list`` and return the dense matrix.

        Args:
            document_list: Documents to vectorise; each is tokenised with
                ``_run_mecab``.

        Returns:
            Array with one row per document, in input order.
        """
        vectorizer = TfidfVectorizer(analyzer=self._run_mecab)
        vectors = vectorizer.fit_transform(document_list)
        return vectors.toarray()

    def calcurate_cosine_similarity_dataframe(self, document_id: int) -> pd.DataFrame:
        """Rank every document by cosine similarity to one reference document.

        Args:
            document_id: Position of the reference document in
                ``document_list``.

        Returns:
            DataFrame with columns ``文書`` (document text) and ``類似度``
            (similarity as float), sorted by similarity in descending order.
            The index holds each document's original position. Documents with
            equal scores keep their original relative order.
        """
        similarities = cosine_similarity([self.vectors[document_id]], self.vectors)[0]
        # Building from columns (not rows + transpose) keeps the score column
        # numeric instead of degrading both columns to object dtype.
        ranking = pd.DataFrame({'文書': self.document_list, '類似度': similarities})
        # A stable sort makes the order of tied scores deterministic.
        return ranking.sort_values('類似度', ascending=False, kind='stable')


# Small hand-written corpus for a quick check of the class: four sentences
# that differ only in their first word or in the sentiment word.
m = MorphologicaAnalysis(
    [
        "pythonが大好きです",
        "goが大好きです",
        "pythonが好きです",
        "pythonが嫌いです",
    ]
)

In [167]:
# Rank all sample sentences by similarity to the first one (document id 0).
m.calcurate_cosine_similarity_dataframe(document_id=0)

Unnamed: 0,文書,類似度
0,pythonが大好きです,1.0
1,goが大好きです,0.631654
2,pythonが好きです,0.543201
3,pythonが嫌いです,0.543201


In [168]:
def load_headlines(pattern: str, line_index: int = 2, encoding: str = 'utf-8') -> list[str]:
    """Collect one line from every file matching ``pattern``.

    Args:
        pattern: Glob pattern selecting the files to read.
        line_index: Zero-based, non-negative index of the line to take from
            each file. The default of 2 (third line) is presumably where the
            corpus stores the article title -- confirm against the data.
        encoding: Text encoding used to read the files. Given explicitly so
            the result does not depend on the platform's default encoding.
            NOTE(review): assumes the corpus is UTF-8 -- confirm.

    Returns:
        The selected lines without their trailing newline, ordered by file
        path. Files that have no line at ``line_index`` are skipped.
    """
    collected = []
    # glob returns paths in filesystem order; sorting makes the position of
    # each headline (and so its document id downstream) reproducible.
    for text_path in sorted(glob.glob(pattern)):
        wanted = None
        with open(text_path, 'r', encoding=encoding) as f:
            # Stop at the wanted line instead of loading the whole file.
            for position, line in enumerate(f):
                if position == line_index:
                    wanted = line
                    break
        if wanted is None:
            # File is shorter than expected; skip it rather than abort the load.
            # NOTE(review): any non-article .txt file matched by the pattern
            # that is long enough still contributes a line -- verify the directory.
            continue
        # Strip only the newline: slicing off the last character would eat real
        # text when the line is the last one in a file without a final newline.
        collected.append(wanted.rstrip('\n'))
    return collected


headlines = load_headlines('./text/topic-news/*.txt')

In [169]:
# Fit the model on the loaded headlines; the TF-IDF vectors are computed
# inside the constructor.
headline_model = MorphologicaAnalysis(document_list=headlines)
# To list the ten headlines most similar to document 1, run:
# headline_model.calcurate_cosine_similarity_dataframe(1).head(10)