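In this notebook, I convert the .docx transcript files into a single pandas DataFrame and remove unnecessary information (export headers, page markers, and annotations in parentheses or brackets). Each row corresponds to one speaker turn: everything a speaker says before another speaker starts talking.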

In [None]:
# Mount Google Drive at /content/drive so the transcripts stored there can be read
# (google.colab is only available when running on Colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Pinned to the version this notebook was originally run with, so that re-runs
# are reproducible. %pip installs into the environment of the running kernel.
%pip install docx2txt==0.8

Collecting docx2txt
  Downloading docx2txt-0.8.tar.gz (2.8 kB)
Building wheels for collected packages: docx2txt
  Building wheel for docx2txt (setup.py) ... [?25l[?25hdone
  Created wheel for docx2txt: filename=docx2txt-0.8-py3-none-any.whl size=3980 sha256=2f1d6a982f11e8d915628f75a5d9fe2ff071504a2aef8dda95c4b9e72a37cafc
  Stored in directory: /root/.cache/pip/wheels/b7/20/b2/473e3aea9a0c0d3e7b2f7bd81d06d0794fec12752733d1f3a8
Successfully built docx2txt
Installing collected packages: docx2txt
Successfully installed docx2txt-0.8


In [None]:
# Pinned to the version this notebook was originally run with, so that re-runs
# are reproducible. %pip installs into the environment of the running kernel.
%pip install python-docx==0.8.11

Collecting python-docx
  Downloading python-docx-0.8.11.tar.gz (5.6 MB)
[K     |████████████████████████████████| 5.6 MB 5.3 MB/s 
Building wheels for collected packages: python-docx
  Building wheel for python-docx (setup.py) ... [?25l[?25hdone
  Created wheel for python-docx: filename=python_docx-0.8.11-py3-none-any.whl size=184507 sha256=729e3e0f7ee37ac09c547d1b79d3fd2d40f7b9fddbe47a83cd81f7c5caf32370
  Stored in directory: /root/.cache/pip/wheels/f6/6f/b9/d798122a8b55b74ad30b5f52b01482169b445fbb84a11797a6
Successfully built python-docx
Installing collected packages: python-docx
Successfully installed python-docx-0.8.11


In [None]:
# data access and processing
import pandas as pd
import numpy as np

# File helpers
import glob
import warnings

# python helpers
import os.path
import re

# docx helpers
import docx
import docx2txt

In [None]:
# Folder on the mounted Google Drive that is searched for the *.docx transcripts.
base_prefix = '/content/drive/MyDrive/Capstone Design/data/final'

In [None]:
#export
def getText(filename):
    """
    Read a .docx file and return its text as one string.

    Parameters
    ----------
    filename : str
        Path of the document to read.

    Returns
    -------
    str
        The text of every paragraph in the document, joined with newlines.
    """
    paragraphs = docx.Document(filename).paragraphs
    return '\n'.join(paragraph.text for paragraph in paragraphs)

In [None]:
# Collect the transcript paths in sorted order: glob.glob() returns matches in
# an arbitrary, filesystem-dependent order, which would make the row order of
# everything built from this list vary between runs.
filenames = sorted(glob.glob(base_prefix + '/*.docx'))

# File name (last "/"-separated path component) and full text of each
# transcript, both index-aligned with `filenames`.
file_id = [path.split("/")[-1] for path in filenames]
file_contents = [getText(path) for path in filenames]

# One row per transcript file
file_df = pd.DataFrame({'file_name': file_id, 'text': file_contents})
file_df.head()

Unnamed: 0,file_name,text
0,2039_parent_11.04.2021.docx,"Speaker 1:\nHas been involved in, I would love..."
1,2033_VocIndex-and-SA_07.09.2020.docx,"Speaker 1:\n... wash, that's probably one of t..."
2,2035_GAS-and-VocIndex_06.18.2020.docx,Speaker 1:\nNo problem.\nSpeaker 2:\nGreat. He...
3,2040_VocIndex_06.19.2020.docx,"Speaker 1:\nOkay. I'm guessing not as well, be..."
4,2034_GAS_Voc_SA_05.18.2020.docx,Speaker 1:\nThat maybe you're not getting now....


In [None]:
def docx_to_df(file_path):
    """
    Convert one transcript into a dataframe with one row per non-empty line.

    The text is not read from disk here: it is looked up by file name in the
    module-level ``file_df`` dataframe, which must already exist.

    Parameters
    ----------
    file_path : str
        Path of the document. Its last "/"-separated component must appear
        exactly once in ``file_df['file_name']``.

    Returns
    -------
    dataframe
        speech | file_name | participant | transcript_filepath | year | month | day

        ``participant`` is the first four characters of the file name.
        ``year``, ``month`` and ``day`` are strings taken from the
        two-digit / two-digit / four-digit pattern in the file name
        (e.g. ``11.04.2021`` gives month '11', day '04', year '2021'); they are
        None, and a warning is issued, when the file name has no such pattern.
        Empty lines are dropped; the remaining rows keep their original line
        position as index.

    Raises
    ------
    ValueError
        If ``file_df`` does not hold exactly one row for this file name.
    """
    # Same "/"-split that was used to build file_df['file_name'], so the lookup matches.
    file_name = file_path.split("/")[-1]
    matches = file_df.loc[file_df['file_name'] == file_name, 'text']
    if len(matches) != 1:
        raise ValueError(
            'Expected exactly one entry for {0} in file_df, found {1}'.format(file_name, len(matches))
        )
    text = matches.iloc[0]

    # One row per line of the transcript
    df = pd.DataFrame(text.split('\n'), columns=["speech"])
    df['file_name'] = file_name
    df['participant'] = file_name[:4]
    df["transcript_filepath"] = file_path

    # Search the file name only, so date-like directory names cannot be picked
    # up by mistake. Raw string: "\d" is an invalid escape in a normal literal.
    # The unescaped "." deliberately accepts any separator character.
    extract = re.search(r'(\d{2}).(\d{2}).(\d{4})', file_name)
    if extract is not None:
        month, day, year = extract.groups()
    else:
        month = day = year = None
        warnings.warn('File {0} seems to have the wrong title format for extracting the date'.format(file_path))
    df['year'] = year
    df['month'] = month
    df['day'] = day

    # Drop empty lines last so the index still reflects the original line numbers.
    return df[df["speech"] != '']

In [None]:
# Turn every transcript into its own dataframe, then stack them into one.
dfs_list = list(map(docx_to_df, filenames))
megadata = pd.concat(dfs_list)

In [None]:
# Preview the first rows of the combined dataframe.
megadata.head()

Unnamed: 0,speech,file_name,participant,transcript_filepath,year,month,day
0,Speaker 1:,2039_parent_11.04.2021.docx,2039,/content/drive/MyDrive/Capstone Design/data/fi...,2021,11,4
1,"Has been involved in, I would love to hear abo...",2039_parent_11.04.2021.docx,2039,/content/drive/MyDrive/Capstone Design/data/fi...,2021,11,4
2,Speaker 2:,2039_parent_11.04.2021.docx,2039,/content/drive/MyDrive/Capstone Design/data/fi...,2021,11,4
3,"Correct. No, he's post high school.",2039_parent_11.04.2021.docx,2039,/content/drive/MyDrive/Capstone Design/data/fi...,2021,11,4
4,Speaker 1:,2039_parent_11.04.2021.docx,2039,/content/drive/MyDrive/Capstone Design/data/fi...,2021,11,4


In [None]:
# Keep only the rows whose speech text is not the empty string.
megadata = megadata.loc[megadata["speech"].ne('')]

In [None]:
# Preview the first rows again after dropping empty lines.
megadata.head()

Unnamed: 0,speech,file_name,participant,transcript_filepath,year,month,day
0,Speaker 1:,2039_parent_11.04.2021.docx,2039,/content/drive/MyDrive/Capstone Design/data/fi...,2021,11,4
1,"Has been involved in, I would love to hear abo...",2039_parent_11.04.2021.docx,2039,/content/drive/MyDrive/Capstone Design/data/fi...,2021,11,4
2,Speaker 2:,2039_parent_11.04.2021.docx,2039,/content/drive/MyDrive/Capstone Design/data/fi...,2021,11,4
3,"Correct. No, he's post high school.",2039_parent_11.04.2021.docx,2039,/content/drive/MyDrive/Capstone Design/data/fi...,2021,11,4
4,Speaker 1:,2039_parent_11.04.2021.docx,2039,/content/drive/MyDrive/Capstone Design/data/fi...,2021,11,4


In [None]:
# Distinct participant IDs, in order of first appearance.
participant = megadata['participant'].drop_duplicates().tolist()

In [None]:
# Lines that are transcript boilerplate rather than speech. Each entry is used
# as a regular expression (so the "." in "Rev.com" matches any character).
# NOTE(review): "Page" drops every line that contains that word anywhere,
# which may also remove genuine speech — confirm against the page-marker format.
boilerplate_patterns = [
    "This transcript was exported on",
    "Transcript by Rev.com",
    "Page",
    r"\(Completed ",  # raw string: "\(" is an invalid escape in a normal string literal
    "This is the vocational index for",
]
# Lines that contain a participant ID followed by "_"; the ID is escaped so it
# is always matched literally.
boilerplate_patterns += [re.escape(str(pid) + '_') for pid in participant]

# Drop the rows matching any pattern in a single pass. na=True also drops rows
# with missing speech, which is what the `== False` comparison did.
is_boilerplate = megadata["speech"].str.contains(
    '|'.join('(?:{0})'.format(pattern) for pattern in boilerplate_patterns),
    na=True,
)
megadata = megadata[~is_boilerplate]

In [None]:
# Remove annotations written in parentheses or square brackets from the speech.
# regex=True is required: since pandas 2.0, str.replace() treats the pattern as
# a literal string by default, which would silently leave the annotations in.
# Each pattern stops at the first closing bracket, so speech between two
# annotations on the same line is kept (a greedy ".*" would delete it).
# assign() returns a new dataframe instead of writing into a filtered slice.
megadata = megadata.assign(
    speech=megadata['speech']
    .str.replace(r"\([^)]*\)", "", regex=True)
    .str.replace(r"\[[^\]]*\]", "", regex=True)
)

  
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
# Renumber the rows 0..n-1: the frames concatenated earlier each kept their own
# line-based index, so labels repeat across files until this reset.
megadata = megadata.reset_index(drop = True)

In [None]:
# After cleaning, rows are expected to alternate: a speaker label on the even
# positions, followed by that speaker's speech on the odd positions.
megadata_speaker = megadata.iloc[::2]
# .copy() makes this an independent dataframe; adding a column to a plain slice
# of `megadata` raises SettingWithCopyWarning.
megadata_speech = megadata.iloc[1::2].copy()

speaker = megadata_speaker['speech'].values.tolist()
# An unequal count means the label/speech alternation is broken somewhere.
if len(speaker) != len(megadata_speech):
    raise ValueError(
        'Found {0} speaker rows but {1} speech rows; rows do not alternate as expected'.format(
            len(speaker), len(megadata_speech)
        )
    )
megadata_speech['speaker'] = speaker

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [None]:
# List the files whose rows have a missing year; an empty array means no row lacks one.
megadata_speech[megadata_speech['year'].isna()]['file_name'].unique()

array([], dtype=object)

In [None]:
# Convert the three date parts to integers in one step. astype() returns a new
# dataframe, which avoids the SettingWithCopyWarning raised when columns are
# assigned one by one on a slice.
megadata_speech = megadata_speech.astype({'year': int, 'month': int, 'day': int})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
# Inspect the speech rows together with their speaker labels.
megadata_speech

Unnamed: 0,speech,file_name,participant,transcript_filepath,year,month,day,speaker
1,"Has been involved in, I would love to hear abo...",2039_parent_11.04.2021.docx,2039,/content/drive/MyDrive/Capstone Design/data/fi...,2021,11,4,Speaker 1:
3,"Correct. No, he's post high school.",2039_parent_11.04.2021.docx,2039,/content/drive/MyDrive/Capstone Design/data/fi...,2021,11,4,Speaker 2:
5,Just like to confirm. So did Adrian exit the s...,2039_parent_11.04.2021.docx,2039,/content/drive/MyDrive/Capstone Design/data/fi...,2021,11,4,Speaker 1:
7,"No, he was already out.",2039_parent_11.04.2021.docx,2039,/content/drive/MyDrive/Capstone Design/data/fi...,2021,11,4,Speaker 2:
9,"Okay. So for this section, I'm going to ask yo...",2039_parent_11.04.2021.docx,2039,/content/drive/MyDrive/Capstone Design/data/fi...,2021,11,4,Speaker 1:
...,...,...,...,...,...,...,...,...
33073,Hope I didn't say anything wrong.,3002_T2-Service-Access-and-VocIndex_04.15.2021...,3002,/content/drive/MyDrive/Capstone Design/data/fi...,2021,4,15,Speaker 2 :
33075,Nope. It was great. You did a great job.,3002_T2-Service-Access-and-VocIndex_04.15.2021...,3002,/content/drive/MyDrive/Capstone Design/data/fi...,2021,4,15,Speaker 1 :
33077,Okay. All right. Take care.,3002_T2-Service-Access-and-VocIndex_04.15.2021...,3002,/content/drive/MyDrive/Capstone Design/data/fi...,2021,4,15,Speaker 2 :
33079,Bye-bye.,3002_T2-Service-Access-and-VocIndex_04.15.2021...,3002,/content/drive/MyDrive/Capstone Design/data/fi...,2021,4,15,Speaker 1 :


In [None]:
cols = ["year", "month", "day"]
# Build the "year-month-day" string column-wise instead of with a row-by-row
# apply(). The parts are joined exactly as they are (no zero padding), so the
# resulting strings are the same as before. assign() returns a new dataframe
# rather than writing into a slice.
date_parts = megadata_speech[cols].astype(str)
megadata_speech = megadata_speech.assign(
    date=date_parts["year"] + '-' + date_parts["month"] + '-' + date_parts["day"]
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
# Inspect the dataframe again, now including the date column.
megadata_speech

Unnamed: 0,speech,file_name,participant,transcript_filepath,year,month,day,speaker,date
1,"Has been involved in, I would love to hear abo...",2039_parent_11.04.2021.docx,2039,/content/drive/MyDrive/Capstone Design/data/fi...,2021,11,4,Speaker 1:,2021-11-4
3,"Correct. No, he's post high school.",2039_parent_11.04.2021.docx,2039,/content/drive/MyDrive/Capstone Design/data/fi...,2021,11,4,Speaker 2:,2021-11-4
5,Just like to confirm. So did Adrian exit the s...,2039_parent_11.04.2021.docx,2039,/content/drive/MyDrive/Capstone Design/data/fi...,2021,11,4,Speaker 1:,2021-11-4
7,"No, he was already out.",2039_parent_11.04.2021.docx,2039,/content/drive/MyDrive/Capstone Design/data/fi...,2021,11,4,Speaker 2:,2021-11-4
9,"Okay. So for this section, I'm going to ask yo...",2039_parent_11.04.2021.docx,2039,/content/drive/MyDrive/Capstone Design/data/fi...,2021,11,4,Speaker 1:,2021-11-4
...,...,...,...,...,...,...,...,...,...
33073,Hope I didn't say anything wrong.,3002_T2-Service-Access-and-VocIndex_04.15.2021...,3002,/content/drive/MyDrive/Capstone Design/data/fi...,2021,4,15,Speaker 2 :,2021-4-15
33075,Nope. It was great. You did a great job.,3002_T2-Service-Access-and-VocIndex_04.15.2021...,3002,/content/drive/MyDrive/Capstone Design/data/fi...,2021,4,15,Speaker 1 :,2021-4-15
33077,Okay. All right. Take care.,3002_T2-Service-Access-and-VocIndex_04.15.2021...,3002,/content/drive/MyDrive/Capstone Design/data/fi...,2021,4,15,Speaker 2 :,2021-4-15
33079,Bye-bye.,3002_T2-Service-Access-and-VocIndex_04.15.2021...,3002,/content/drive/MyDrive/Capstone Design/data/fi...,2021,4,15,Speaker 1 :,2021-4-15


In [None]:
# Select the output columns, identifying columns first.
df = megadata_speech.loc[:, [
    'participant', 'file_name', 'speaker', 'speech', 'date',
    'transcript_filepath', 'year', 'month', 'day',
]]

In [None]:
# Renumber the rows 0..n-1, discarding the old index labels.
df = df.reset_index(drop = True)

In [None]:
# Preview the final table before exporting it.
df.head()

Unnamed: 0,participant,file_name,speaker,speech,date,transcript_filepath,year,month,day
0,2039,2039_parent_11.04.2021.docx,Speaker 1:,"Has been involved in, I would love to hear abo...",2021-11-4,/content/drive/MyDrive/Capstone Design/data/fi...,2021,11,4
1,2039,2039_parent_11.04.2021.docx,Speaker 2:,"Correct. No, he's post high school.",2021-11-4,/content/drive/MyDrive/Capstone Design/data/fi...,2021,11,4
2,2039,2039_parent_11.04.2021.docx,Speaker 1:,Just like to confirm. So did Adrian exit the s...,2021-11-4,/content/drive/MyDrive/Capstone Design/data/fi...,2021,11,4
3,2039,2039_parent_11.04.2021.docx,Speaker 2:,"No, he was already out.",2021-11-4,/content/drive/MyDrive/Capstone Design/data/fi...,2021,11,4
4,2039,2039_parent_11.04.2021.docx,Speaker 1:,"Okay. So for this section, I'm going to ask yo...",2021-11-4,/content/drive/MyDrive/Capstone Design/data/fi...,2021,11,4


In [None]:
# Save the processed table as CSV. The relative path resolves against the
# kernel's current working directory, not against base_prefix.
# NOTE(review): no index argument is passed, so the row index is presumably
# written as an extra first column (pandas default) — confirm readers expect it.
df.to_csv('processed_data.csv')