In [1]:
# Dependancies 
from transformers import pipeline
from docx import Document
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Cleaning Text Functions

def prepare(input_text_path, output_text_path, start_string, end_string):
    doc = Document(input_text_path)
    cleaned_text = []
    
    # Regular expression to match timestamps
    timestamp_regex = re.compile(r'\b\d{1,2}:\d{2}:\d{2}\b')
    
    # Flags for controlling text recording
    start_recording = False
    end_recording = False
    
    for para in doc.paragraphs:
        if start_string in para.text:
            start_recording = True  # Signal to start recording text from the next paragraph
            print("found the start string")
            continue  # Skip appending the starting string
        if end_string in para.text:
            end_recording = True  # Signal to stop recording text
            print("found the end string")
            break  # Exit the loop since the end string has been encountered
        
        if start_recording and not end_recording:
            text = para.text
            
            # Remove timestamps from the text
            text = timestamp_regex.sub("", text)
            
            # Conditionally remove the "R:" prefix if present, after removing timestamps
            if text.startswith("R:"):
                text = text[2:].strip()
            
            # Skip paragraphs that start with "I:"
            if not text.startswith("I:"):
                cleaned_text.append(text.strip())  
    
    final_text = "\n".join(cleaned_text)
    
    # Write the cleaned text to the output file
    with open(output_text_path, 'w', encoding='utf-8') as file:
        file.write(final_text)


In [None]:
def prepare(input_text_path, output_text_path):
    doc = Document(input_text_path)
    cleaned_text = []
    
    # Regular expression to match timestamps
    timestamp_regex = re.compile(r'\b\d{1,2}:\d{2}:\d{2}\b')
    
    for para in doc.paragraphs:
        text = para.text
        
        # Remove timestamps from the text
        text = timestamp_regex.sub("", text)
        
        # Conditionally remove the "R:" prefix if present, after removing timestamps
        if text.startswith("R:"):
            text = text[2:].strip()
        
        # Skip paragraphs that start with "I:"
        if not text.startswith("I:"):
            cleaned_text.append(text.strip())  
    
    final_text = "\n".join(cleaned_text)
    
    # Write the cleaned text to the output file
    with open(output_text_path, 'w', encoding='utf-8') as file:
        file.write(final_text)


In [3]:
# Conditional 
inputtext_conditional = "/Users/sunmoon/Desktop/nlp/joint.docx"
outputtext_conditional = "/Users/sunmoon/Desktop/nlp/joint_cleaned.txt"

prepare(inputtext_conditional,outputtext_conditional, "Intra household discussion and Bargaining", "General norms around land")


found the start string
found the end string


In [4]:
# Unconditional (wife not on deed)

inputtext_unconditional = "/Users/sunmoon/Desktop/nlp/single.docx"
outputtext_unconditional = "/Users/sunmoon/Desktop/nlp/single_cleaned.txt"

prepare(inputtext_unconditional,outputtext_unconditional, "Intra-household discussion and bargaining", "Gender norms around land")


found the start string
found the end string


In [5]:
#classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True)
classifier = pipeline(task="text-classification", model="SamLowe/roberta-base-go_emotions", top_k=None)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [6]:
# Conditional 
results_conditional = classifier(outputtext_conditional)
print(results_conditional)

[[{'label': 'neutral', 'score': 0.9689276218414307}, {'label': 'approval', 'score': 0.010621764697134495}, {'label': 'annoyance', 'score': 0.0072591654025018215}, {'label': 'realization', 'score': 0.004994337912648916}, {'label': 'disapproval', 'score': 0.003861496224999428}, {'label': 'disappointment', 'score': 0.0035657351836562157}, {'label': 'admiration', 'score': 0.003407483221963048}, {'label': 'confusion', 'score': 0.0033609310630708933}, {'label': 'anger', 'score': 0.002968629589304328}, {'label': 'disgust', 'score': 0.0029220380820333958}, {'label': 'sadness', 'score': 0.0027078292332589626}, {'label': 'amusement', 'score': 0.002326824702322483}, {'label': 'excitement', 'score': 0.002281916094943881}, {'label': 'joy', 'score': 0.002070820890367031}, {'label': 'curiosity', 'score': 0.001999541651457548}, {'label': 'fear', 'score': 0.001791447284631431}, {'label': 'love', 'score': 0.0017111704219132662}, {'label': 'optimism', 'score': 0.0016817477298900485}, {'label': 'desire', 

In [7]:
# Unconditional 
results_unconditional = classifier(outputtext_unconditional)
print(results_unconditional)

[[{'label': 'neutral', 'score': 0.9686564207077026}, {'label': 'approval', 'score': 0.010021964088082314}, {'label': 'annoyance', 'score': 0.0074686771258711815}, {'label': 'realization', 'score': 0.004718221258372068}, {'label': 'disapproval', 'score': 0.003920832183212042}, {'label': 'confusion', 'score': 0.0036984344478696585}, {'label': 'disappointment', 'score': 0.0033637257292866707}, {'label': 'admiration', 'score': 0.003210442140698433}, {'label': 'anger', 'score': 0.00317251100204885}, {'label': 'disgust', 'score': 0.0028084740042686462}, {'label': 'sadness', 'score': 0.0024821481201797724}, {'label': 'amusement', 'score': 0.00232497020624578}, {'label': 'excitement', 'score': 0.0022132357116788626}, {'label': 'curiosity', 'score': 0.0021854734513908625}, {'label': 'joy', 'score': 0.0019707013852894306}, {'label': 'fear', 'score': 0.001706388546153903}, {'label': 'optimism', 'score': 0.001658217515796423}, {'label': 'love', 'score': 0.0015884240856394172}, {'label': 'desire', 