In [1]:
import pandas as pd
import numpy as np
import nltk
from os import path
import re
import libs as ft
from sklearn.metrics.pairwise import cosine_similarity
#from pyfasttext import FastText
#from gensim.models.wrappers import FastText
import fasttext as ft


import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Si\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Si\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from matplotlib import pyplot as plt

from mpl_toolkits.mplot3d import Axes3D
from pathlib import Path

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from pathlib import Path
from sklearn.metrics import (roc_curve, auc, accuracy_score)
from sklearn.preprocessing import StandardScaler

import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters


In [3]:
def cleansing(x, drop_tag, tag_pos, lemmatizer):
    """
    いらない品詞を除外し，レンマ化して返す．apply関数内で使用，

    Args:
        x (Series): apply関数で呼び出されるSeries
        drop_tag (list): いらない品詞リスト(nltk)
        tag_pos (dict): key -> tag, value -> pos. レンマ化の精度向上に使用．
        lemmatizer (nltk.stem.WordNetLemmatizer): lemmatizer

    Returns:
        (str): output sentence
    """
    words = [word for word in x['headline_text'].split(' ') if word != '']  # 空文字入るとエラーになる
    tags = nltk.pos_tag(words)  # 品詞を取得
    try:
        words = [(word, tag_pos[tag]) for word, tag in tags if tag not in drop_tag]  # いらない品詞を除外
    except KeyError:
        print('drop_tag:', len(words), words)
    
    #escape termination from undentified words and output the potential point
    words_escaped_list = []
    try:
        for word, pos in words:
            print('word, pos:', word, pos)
            words = [lemmatizer.lemmatize(word, pos=pos)]
    except ValueError:
        print('lemmatization value error escaped!')
        print('words:', len(words), words)
        words_escaped_list.append(words)
    
    #words = [lemmatizer.lemmatize(word, pos=pos) for word, pos in words]
    sentence = ' '.join(words)  # 連結
    return sentence

In [4]:
def preprocess(data):
    """
    前処理の関数．

    Args:
        data (DataFrame): input dataset

    Retruns:
        (DataFrame): output dataset
    """
    # まずは，いらない品詞を落とし，レンマ化する．
    # その後，階層クラスタリングのときに使う用のcsvファイルとモデル学習用のtxtファイルを出力する．
    lemmatizer = nltk.stem.WordNetLemmatizer()
    # いらない品詞
    drop_tag = ['$', 'CC', 'CD', 'DT', 'IN', 'MD', 'POS', 'PRP', 'PRP$', 'RP', 'TO' , 'WP', 'WRB','WDT','PDT','EX','WP$','In']
    # 品詞とpos(lemma用)の変換辞書
    tag_pos = {'FW': 'n', 'JJ': 'a', 'JJR': 'a', 'JJS': 'a', 'NN': 'n', 'NNP': 'n', 'NNS': 'n', 'RB': 'r', 'RBR': 'r', 'VB': 'v',
               'VBD': 'v', 'VBG': 'v', 'VBN': 'v', 'VBP': 'v', 'VBZ': 'v', 'RBS': 'r',}

    data = data.assign(preprocessed=data.apply(func=cleansing, axis=1, args=(drop_tag, tag_pos, lemmatizer)))

    print('after drop and lemmatization:')
    print(data)
    data.to_csv('data.csv', sep='\t', index=False)
    data['headline_text'].to_csv('text.txt', index=False)
    
    '''
    s = data['headline_text'].values.tolist()[0]
    print(s)
    path_w = 'text.txt'
    txt = ['headline_text\n', s]
    print(txt)
    with open(path_w, mode='w') as f:
        f.writelines(txt)
    '''
    
    
    #data.to_csv('text.txt', index=False)
    return data

In [5]:
def get_word_vector(data_name='text.txt', model_name='./pretrained_model/model.bin'):
    """
    fasttextベースで分散表現を取得する関数．これも見てわかると思うので引数は省略．

    Returns:
        (list of list): 単語リストのリスト．[['word_0_0', 'word_0_1'], ['word_1_0', 'word_1_1', 'word_1_2'], ...]みたいな
        (array): 分散表現 次元=(文章数×分散表現の次元数)
    """
    sentences = []
    with open(data_name, mode='r',encoding="utf-8") as f:
        for line in f.readlines():
            line = re.sub('\n', '', line)
            sentences.append(line.split(' '))

    # modelが12GBくらいメモリを食うので終わったら開放する．
    vec_name =  'sentences_vc.npy'
    #if not path.exists(vec_name):
        #model = FastText.load_fasttext_format(model_name)
    model = ft.load_model(model_name)
    dim = model.get_dimension()
    sentences_vec = np.zeros((dim,))

    for words in sentences:
        vec = np.zeros((dim,))
        for word in words:
            if model.get_word_id(word) == -1:
                print('this word does not exists in corpus: %s at %s' % (word, words))
            vec = np.vstack((vec, model.get_word_vector(word)))
        vec = vec[1:, :].mean(axis=0)
        sentences_vec = np.vstack((sentences_vec, vec))
    sentences_vec = sentences_vec[1:, :]
    del model

    np.save(vec_name, sentences_vec)
    #else:
        #sentences_vec = np.load(vec_name)
    return sentences, sentences_vec

In [6]:
import re
from contextlib import redirect_stdout
from io import StringIO

example = 'Mary had a little lamb, Jack went up the hill, Jill followed suit, i woke up suddenly, it was a really bad dream...'

def token_to_sentence(str):
    f = StringIO()
    with redirect_stdout(f):
        regex_of_sentence = re.findall('([\w\s]{0,})[^\w\s]', str)
        regex_of_sentence = [x for x in regex_of_sentence if x is not '']
        for i in regex_of_sentence:
            print(i)
        first_step_to_sentence = (f.getvalue()).split('\n')
    g = StringIO()
    with redirect_stdout(g):
        for i in first_step_to_sentence:
            try:
                regex_to_clear_sentence = re.search('\s([\w\s]{0,})', i)
                print(regex_to_clear_sentence.group(1))
            except:
                print(i)
        sentence = (g.getvalue()).split('\n')
    return sentence

def token_to_words(str):
    f = StringIO()
    with redirect_stdout(f):
        for i in str:
            regex_of_word = re.findall('([\w]{0,})', i)
            regex_of_word = [x for x in regex_of_word if x is not '']
            for word in regex_of_word:
                print(regex_of_word)
        words = (f.getvalue()).split('\n')

In [7]:
#homePC
#path_0 = Path(r'C:\Users\SI\Python_ML\python_NLTK\NLTK_')
#officePC
path_0 = Path(r'C:\Users\Si\Desktop\python_all\python_NLTK\NLTK_')


author = ['Poe','Twain']
type_ = ['letter', 'story']
test = ['test']

choice = [0,1]

index_i = 0
index_j = 1
path_file = str(path_0) + str('/') + str(author[choice[index_i]]) + str('-') + str(type_[choice[index_j]])
#path = str(path_0) + str('/') + str(test[0])
print(path_file)

C:\Users\Si\Desktop\python_all\python_NLTK\NLTK_/Poe-story


In [8]:
path_file = Path(path_file)
path_file
path_file.glob('*.txt')
list(path_file.glob('*.txt'))
print('lists:', path_file.glob('*.txt'))
#list path
path_list = list(path_file.glob('*.txt'))
path_list

lists: <generator object Path.glob at 0x000001BFF32D6BC8>


[WindowsPath('C:/Users/Si/Desktop/python_all/python_NLTK/NLTK_/Poe-story/Poe-story-001.txt'),
 WindowsPath('C:/Users/Si/Desktop/python_all/python_NLTK/NLTK_/Poe-story/Poe-story-002.txt'),
 WindowsPath('C:/Users/Si/Desktop/python_all/python_NLTK/NLTK_/Poe-story/Poe-story-003.txt'),
 WindowsPath('C:/Users/Si/Desktop/python_all/python_NLTK/NLTK_/Poe-story/Poe-story-004.txt'),
 WindowsPath('C:/Users/Si/Desktop/python_all/python_NLTK/NLTK_/Poe-story/Poe-story-005.txt'),
 WindowsPath('C:/Users/Si/Desktop/python_all/python_NLTK/NLTK_/Poe-story/Poe-story-006.txt'),
 WindowsPath('C:/Users/Si/Desktop/python_all/python_NLTK/NLTK_/Poe-story/Poe-story-007.txt'),
 WindowsPath('C:/Users/Si/Desktop/python_all/python_NLTK/NLTK_/Poe-story/Poe-story-008.txt'),
 WindowsPath('C:/Users/Si/Desktop/python_all/python_NLTK/NLTK_/Poe-story/Poe-story-009.txt'),
 WindowsPath('C:/Users/Si/Desktop/python_all/python_NLTK/NLTK_/Poe-story/Poe-story-010.txt'),
 WindowsPath('C:/Users/Si/Desktop/python_all/python_NLTK/NLT

In [9]:
#new_dir_path_csv = 'csv/' + str(test[choice[index_i]]) + str('_') 
#new_dir_path_png = 'png/' + str(test[choice[index_i]]) + str('_') 

#author_type = str(test[choice[index_i]]) + str('_')

new_dir_path_csv = 'csv/' + str(author[choice[index_i]]) + str('-') + str(type_[choice[index_j]])
new_dir_path_png = 'png/' + str(author[choice[index_i]]) + str('-') + str(type_[choice[index_j]])

author_type = str(author[choice[index_i]]) + str('-') + str(type_[choice[index_j]])

new_dir_path_png_top = 'png/' + author_type + '/top_ranking'
new_dir_path_png_top_no_punc = 'png/' + author_type + '/top_ranking_no_punc'
new_dir_path_png_top_no_punc_no_stopwords = 'png/' + author_type + '/top_ranking_no_punc_no_stopwords'
new_dir_path_png_dispersion_plot = 'png/' + author_type + '/dispersion_plot'
new_dir_path_png_biagram_plot  = 'png/' + author_type + '/biagram_plot'

def my_makedirs(path):
    if not os.path.isdir(path):
        os.makedirs(path)

my_makedirs(new_dir_path_csv)
my_makedirs(new_dir_path_png)
my_makedirs(new_dir_path_png_top)
my_makedirs(new_dir_path_png_top_no_punc)
my_makedirs(new_dir_path_png_top_no_punc_no_stopwords)

my_makedirs(new_dir_path_png_dispersion_plot)
my_makedirs(new_dir_path_png_biagram_plot)

new_dir_path_csv, new_dir_path_png, new_dir_path_png_top, new_dir_path_png_top_no_punc, new_dir_path_png_top_no_punc_no_stopwords,new_dir_path_png_dispersion_plot, new_dir_path_png_biagram_plot

('csv/Poe-story',
 'png/Poe-story',
 'png/Poe-story/top_ranking',
 'png/Poe-story/top_ranking_no_punc',
 'png/Poe-story/top_ranking_no_punc_no_stopwords',
 'png/Poe-story/dispersion_plot',
 'png/Poe-story/biagram_plot')

In [10]:
# read multi-stories and get the feature vectors

In [11]:
#for index in range(len(path_list)):
for index in range(1):
    print(index)
    index = 1
    data = pd.read_table(path_list[index], encoding="utf-8")
    print(data)
    data_list = data[data.columns[0]].values.tolist()
    print('len(data_list)',len(data_list))
    
    vec_list = []
    sentence_list = []
    for idx, sentence in enumerate(data_list):
        print(idx, sentence)
        para = sentence

        punkt_params = PunktParameters()
        punkt_params.abbrev_types = set(['Mr', 'Mrs', 'LLC', 'Miss'])
        tokenizer = PunktSentenceTokenizer(punkt_params)
        tokens = tokenizer.tokenize(para)

        for t in tokens:
            print (t, "\n")

        df_sentence = pd.DataFrame(tokens)
        df_sentence = df_sentence.rename(columns={0: 'headline_text'})
        print(idx, df_sentence)

        col = pd.DataFrame(df_sentence).columns[0]
        series = pd.DataFrame(df_sentence)['headline_text']
        print(col, series, type(series), series.shape)

        df_sentence = preprocess(pd.DataFrame(series))
        sentences, vec = get_word_vector()

        if len(sentences) == 0 or len(vec)==0:
            continue
        elif (len(sentences) == 1 and sentences[0][0] == 'headline_text'):
            continue
        elif (len(sentences) > 1 and sentences[0][0] == 'headline_text'):
            for j in range(vec.shape[0]):
                if j == 0:
                    continue
                else:
                    sentence_list.append(sentences[j])
                    vec_list.append(vec[j])
        else:
            for j in range(vec.shape[0]):
                sentence_list.append(sentences[j])
                vec_list.append(vec[j])

        #print(vec.shape, sentences[1], vec_list[1])
        print('idx:',idx, '\n',vec.shape, sentences)
        #sentence_list.append(sentences[1])
        #vec_list.append(vec[1])
        
        # preserve the results into csv by authers
        df_vec_list = []
        for i in range(len(vec_list)):
            vec_series = pd.Series(vec_list[i])
            vec_series
            df_vec = pd.DataFrame(vec_series)
            print(df_vec)
            df_vec_list.append(df_vec)
            
        df_vec_concat = pd.concat([df_vec_list[j] for j in range(len(vec_list))], axis=1)
        df_vec_concat
        
        df_vec_concat.to_csv(new_dir_path_csv +'\df_vec_'+str(index)+'.csv')
        df_vec2 = pd.read_csv(new_dir_path_csv +'\df_vec_'+str(index)+'.csv')
        df_vec2
        
        df_sentence_list = []
        for i in range(len(sentence_list)):
            sentence_series = pd.Series(sentence_list[i])
            sentence_series
            df_sentence = pd.DataFrame(sentence_series)
            print(df_sentence)
            df_sentence_list.append(df_sentence)

        df_sentence_concat = pd.concat([df_sentence_list[j] for j in range(len(sentence_list))], axis=1)
        df_sentence_concat

        df_sentence_concat.to_csv(new_dir_path_csv +'\df_sentence_'+str(index)+'.csv')
        df_sentence2 = pd.read_csv(new_dir_path_csv +'\df_sentence_'+str(index)+'.csv')
        df_sentence2


0
                             BYRON AND MISS CHAWORTH
0  Les anges says Madame Dudevant, a woman who in...
1  The angels are not more pure than the heart of...
2  The hyperbole is scarcely less than true. It w...
3  The boyish poet-love is indisputably that one ...
4  In every allusion made by the author of “Child...
5  That his attachment for this “Mary” (in whose ...
6  In view of a passion thus engendered, Miss Cha...
7  In absence, the bard bore easily with him all ...
8  She to him was the Egeria of his dreams  the V...
len(data_list) 9
0 Les anges says Madame Dudevant, a woman who intersperses many an admirable sentiment amid a chaos of the most shameless and altogether objectionable fiction.
Les anges says Madame Dudevant, a woman who intersperses many an admirable sentiment amid a chaos of the most shameless and altogether objectionable fiction. 

0                                        headline_text
0  Les anges says Madame Dudevant, a woman who in...
headline_text 0    Les 

this word does not exists in corpus: headline_text at ['headline_text']
this word does not exists in corpus: The at ['The', 'boyish', 'poet-love', 'is', 'indisputably', 'that', 'one', 'of', 'the', 'human', 'sentiments', 'which', 'most', 'nearly', 'realizes', 'our', 'dreams', 'of', 'the', 'chastened', 'voluptuousness', 'of', 'heaven.']
this word does not exists in corpus: poet-love at ['The', 'boyish', 'poet-love', 'is', 'indisputably', 'that', 'one', 'of', 'the', 'human', 'sentiments', 'which', 'most', 'nearly', 'realizes', 'our', 'dreams', 'of', 'the', 'chastened', 'voluptuousness', 'of', 'heaven.']
this word does not exists in corpus: heaven. at ['The', 'boyish', 'poet-love', 'is', 'indisputably', 'that', 'one', 'of', 'the', 'human', 'sentiments', 'which', 'most', 'nearly', 'realizes', 'our', 'dreams', 'of', 'the', 'chastened', 'voluptuousness', 'of', 'heaven.']
idx: 3 
 (2, 300) [['headline_text'], ['The', 'boyish', 'poet-love', 'is', 'indisputably', 'that', 'one', 'of', 'the', 'hum

idx: 4 
 (4, 300) [['headline_text'], ['"In', 'every', 'allusion', 'made', 'by', 'the', 'author', 'of', '“Childe', 'Harold”', 'to', 'his', 'passion', 'for', 'Mary', 'Chaworth,', 'there', 'runs', 'a', 'vein', 'of', 'almost', 'spiritual', 'tenderness', 'and', 'purity,', 'strongly', 'in', 'contrast', 'with', 'the', 'gross', 'earthliness', 'pervading', 'and', 'disfiguring', 'his', 'ordinary', 'love-poems."'], ['"The', 'Dream,', 'in', 'which', 'the', 'incidents', 'of', 'his', 'parting', 'with', 'her', 'when', 'about', 'to', 'travel,', 'are', 'said', 'to', 'be', 'delineated,', 'or', 'at', 'least', 'paralleled,', 'has', 'never', 'been', 'excelled', '(certainly', 'never', 'excelled', 'by', 'him)', 'in', 'the', 'blended', 'fervor,', 'delicacy,', 'truthfulness', 'and', 'ethereality', 'which', 'sublimate', 'and', 'adorn', 'it."'], ['"For', 'this', 'reason,', 'it', 'may', 'well', 'be', 'doubted', 'if', 'he', 'has', 'written', 'anything', 'so', 'universally', 'popular."']]
            0
0    0.0039

this word does not exists in corpus: headline_text at ['headline_text']
this word does not exists in corpus: "That at ['"That', 'his', 'attachment', 'for', 'this', '“Mary”', '(in', 'whose', 'very', 'name', 'there', 'indeed', 'seemed', 'to', 'exist', 'for', 'him', 'an', '“enchantment”)', 'was', 'earnest,', 'and', 'long-abiding,', 'we', 'have', 'every', 'reason', 'to', 'believe."']
this word does not exists in corpus: “Mary” at ['"That', 'his', 'attachment', 'for', 'this', '“Mary”', '(in', 'whose', 'very', 'name', 'there', 'indeed', 'seemed', 'to', 'exist', 'for', 'him', 'an', '“enchantment”)', 'was', 'earnest,', 'and', 'long-abiding,', 'we', 'have', 'every', 'reason', 'to', 'believe."']
this word does not exists in corpus: (in at ['"That', 'his', 'attachment', 'for', 'this', '“Mary”', '(in', 'whose', 'very', 'name', 'there', 'indeed', 'seemed', 'to', 'exist', 'for', 'him', 'an', '“enchantment”)', 'was', 'earnest,', 'and', 'long-abiding,', 'we', 'have', 'every', 'reason', 'to', 'believe.

idx: 5 
 (10, 300) [['headline_text'], ['"That', 'his', 'attachment', 'for', 'this', '“Mary”', '(in', 'whose', 'very', 'name', 'there', 'indeed', 'seemed', 'to', 'exist', 'for', 'him', 'an', '“enchantment”)', 'was', 'earnest,', 'and', 'long-abiding,', 'we', 'have', 'every', 'reason', 'to', 'believe."'], ['"There', 'are', 'a', 'hundred', 'evidences', 'of', 'this', 'fact,', 'scattered', 'not', 'only', 'through', 'his', 'own', 'poems', 'and', 'letters,', 'but', 'in', 'the', 'memoirs', 'of', 'his', 'relatives,', 'and', 'cotemporaries', 'in', 'general."'], ['"But', 'that', 'it', 'was', 'thus', 'earnest', 'and', 'enduring,', 'does', 'not', 'controvert,', 'in', 'any', 'degree,', 'the', 'opinion', 'that', 'it', 'was', 'a', 'passion', '(if', 'passion', 'it', 'can', 'properly', 'be', 'termed)', 'of', 'the', 'most', 'thoroughly', 'romantic', 'shadowy', 'and', 'imaginative', 'character."'], ['"It', 'was', 'born', 'of', 'the', 'hour,', 'and', 'of', 'the', 'youthful', 'necessity', 'to', 'love,', 'wh

this word does not exists in corpus: headline_text at ['headline_text']
this word does not exists in corpus: "In at ['"In', 'view', 'of', 'a', 'passion', 'thus', 'engendered,', 'Miss', 'Chaworth,', '(who', 'is', 'represented', 'as', 'possessed', 'of', 'no', 'little', 'personal', 'beauty', 'and', 'some', 'accomplishments,)', 'could', 'not', 'have', 'failed', 'to', 'serve', 'sufficiently', 'well', 'as', 'the', 'incarnation', 'of', 'the', 'ideal', 'that', 'haunted', 'the', 'fancy', 'of', 'the', 'poet."']
this word does not exists in corpus: engendered, at ['"In', 'view', 'of', 'a', 'passion', 'thus', 'engendered,', 'Miss', 'Chaworth,', '(who', 'is', 'represented', 'as', 'possessed', 'of', 'no', 'little', 'personal', 'beauty', 'and', 'some', 'accomplishments,)', 'could', 'not', 'have', 'failed', 'to', 'serve', 'sufficiently', 'well', 'as', 'the', 'incarnation', 'of', 'the', 'ideal', 'that', 'haunted', 'the', 'fancy', 'of', 'the', 'poet."']
this word does not exists in corpus: Miss at ['"In

this word does not exists in corpus: headline_text at ['headline_text']
this word does not exists in corpus: "In at ['"In', 'absence,', 'the', 'bard', 'bore', 'easily', 'with', 'him', 'all', 'the', 'fancies', 'which', 'were', 'the', 'basis', 'of', 'his', 'flame', '', 'a', 'flame', 'which', 'absence', 'itself', 'but', 'served', 'to', 'keep', 'in', 'vigor', '', 'while', 'the', 'less', 'ideal', 'but', 'at', 'the', 'same', 'time', 'the', 'less', 'really', 'substantial', 'affection', 'of', 'his', 'ladye-love,', 'perished', 'utterly', 'and', 'forthwith,', 'through', 'simple', 'lack', 'of', 'the', 'element', 'which', 'had', 'fanned', 'it', 'into', 'being."']
this word does not exists in corpus: absence, at ['"In', 'absence,', 'the', 'bard', 'bore', 'easily', 'with', 'him', 'all', 'the', 'fancies', 'which', 'were', 'the', 'basis', 'of', 'his', 'flame', '', 'a', 'flame', 'which', 'absence', 'itself', 'but', 'served', 'to', 'keep', 'in', 'vigor', '', 'while', 'the', 'less', 'ideal', 'but', 'at',

PermissionError: [Errno 13] Permission denied: 'csv/Poe-story\\df_sentence_1.csv'

In [None]:
for i in range(len(vec_list)):
    print(i, len(vec_list[i]), vec_list[i])

In [None]:
print(vec.shape, len(vec_list), len(vec_list[0]), type(vec_list[0]))

In [None]:
for i in range(len(sentence_list)):
    print(i, len(sentence_list[i]), sentence_list[i])

In [None]:
# save the feature vectors into csv

In [None]:
print(vec.shape, len(vec_list), len(vec_list[0]), type(vec_list[0]))

In [None]:
df_vec_list = []
for i in range(len(vec_list)):
    vec_series = pd.Series(vec_list[i])
    vec_series
    df_vec = pd.DataFrame(vec_series)
    print(df_vec)
    df_vec_list.append(df_vec)

In [None]:
df_vec_concat = pd.concat([df_vec_list[j] for j in range(len(vec_list))], axis=1)
df_vec_concat

In [None]:
df_vec_concat.to_csv(new_dir_path_csv +'\df_vec_test.csv')
df_vec2 = pd.read_csv(new_dir_path_csv +'\df_vec_test.csv')
df_vec2

In [None]:
# save the original sentences into csv

In [None]:
df_sentence_list = []
for i in range(len(sentence_list)):
    sentence_series = pd.Series(sentence_list[i])
    sentence_series
    df_sentence = pd.DataFrame(sentence_series)
    print(df_sentence)
    df_sentence_list.append(df_sentence)

df_sentence_concat = pd.concat([df_sentence_list[j] for j in range(len(sentence_list))], axis=1)
df_sentence_concat

df_sentence_concat.to_csv(new_dir_path_csv +'\df_sentence_test.csv')
df_sentence2 = pd.read_csv(new_dir_path_csv +'\df_sentence_test.csv')
df_sentence2