# Tokenizing Comments and POS Tagging

## Importing Packages

In [1]:
import pandas as pd
import re
import spacy
from spacy.tokenizer import Tokenizer

In [2]:
# spacy pipeline 
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe('sentencizer')
tokenizer = Tokenizer(nlp.vocab)

  from .autonotebook import tqdm as notebook_tqdm


## Reading and Cleaning Data

In [3]:
# load in EDITED metaphor annotation json and save it to a pandas dataframe
file_path = 'project-2-at-2025-05-20-edit.json'
met_df = pd.read_json(file_path)

# cleaning file names
met_df['filename'] = met_df.apply(lambda row: re.sub(r"^[^_]*-", '', row.file_upload), axis=1)
met_df['filename'] = met_df.apply(lambda row: re.sub(r"_fixed", '', row.filename[:-4]), axis=1)

## Tokenizing and POS Tagging

In [4]:
# creating a new dataframe based on the metaphor dataframe, with fewer columns
df_analysis=met_df[['filename', 'data', 'annotations']]
# extracting comment text from data column 
df_analysis['text'] = df_analysis.apply(lambda row: row.data['text'], axis=1)
# creating a column of text split into sentences
df_analysis['sentences'] = df_analysis.apply(lambda row: [sent.text.strip() for sent in nlp(row.text).sents], axis=1)
# creating a column of text split into tokens
df_analysis['tokens'] = df_analysis.apply(lambda row: tokenizer(row.text), axis=1)
# creating a column with POS tagging
df_analysis['pos'] = df_analysis.apply(lambda row: [(token.text, token.pos_) for token in nlp(row.text)], axis=1)
# creating a column with POS tagging per sentence
df_analysis['pos_sentence'] = df_analysis.apply(lambda row: [[(token.text, token.pos_) for token in nlp(sent)] for sent in row.sentences], axis=1)
# creating a column with metaphor tokens
df_analysis['metaphor_tokens']=df_analysis.apply(lambda row: [[str(token) for token in tokenizer(metaphor['value']['text'])] for metaphor in row.annotations[0]['result']], axis=1)
# dropping data column
df_analysis = df_analysis.drop(columns=['data', 'annotations'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_analysis['text'] = df_analysis.apply(lambda row: row.data['text'], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_analysis['sentences'] = df_analysis.apply(lambda row: [sent.text.strip() for sent in nlp(row.text).sents], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_analys

In [5]:
df_analysis.head()

Unnamed: 0,filename,text,sentences,tokens,pos,pos_sentence,metaphor_tokens
0,hillary_1,' The problem is bigger than Mr. Trump.'Extend...,"[' The problem is bigger than Mr. Trump., 'Ext...","(', The, problem, is, bigger, than, Mr., Trump...","[(', PUNCT), (The, DET), (problem, NOUN), (is,...","[[(', PUNCT), (The, DET), (problem, NOUN), (is...","[[blue, collar]]"
1,hillary_2,"'If she was a man, she would be president-elec...","['If she was a man, she would be president-ele...","('If, she, was, a, man,, she, would, be, presi...","[(', PUNCT), (If, SCONJ), (she, PRON), (was, A...","[[(', PUNCT), (If, SCONJ), (she, PRON), (was, ...",[[genitals]]
2,hillary_3,'What are you going to tell your daughters?'Te...,['What are you going to tell your daughters?'T...,"('What, are, you, going, to, tell, your, daugh...","[(', PUNCT), (What, PRON), (are, AUX), (you, P...","[[(', PUNCT), (What, PRON), (are, AUX), (you, ...",[[engage]]
3,hillary_4,'What are you going to tell your daughters?'We...,['What are you going to tell your daughters?'W...,"('What, are, you, going, to, tell, your, daugh...","[(', PUNCT), (What, PRON), (are, AUX), (you, P...","[[(', PUNCT), (What, PRON), (are, AUX), (you, ...","[[top], [third, world]]"
4,hillary_5,A PR piece designed to reverse all the damage ...,[A PR piece designed to reverse all the damage...,"(A, PR, piece, designed, to, reverse, all, the...","[(A, DET), (PR, NOUN), (piece, NOUN), (designe...","[[(A, DET), (PR, NOUN), (piece, NOUN), (design...",[]


In [6]:
df_analysis.to_csv('tokenized_comments.csv')