# CMT Analysis

## Import Packages

In [1]:
import pandas as pd
import re
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
import nltk
from nltk.corpus import stopwords 
nlp = English()
tokenizer = Tokenizer(nlp.vocab)

  from .autonotebook import tqdm as notebook_tqdm


## Reading and Cleaning Data

### Metaphor Annotations

In [2]:
# load in CMT annotation json and save it to a pandas dataframe
file_path = 'CMT_July17.json'
met_df = pd.read_json(file_path)

# extracting labels from the annotations column
met_df['labels'] = met_df.apply(lambda row: row.annotations[0]['result'], axis=1)

# cleaning up file names 
met_df['filename'] = met_df.apply(lambda row: re.sub(r"^[^_]*-", '', row.file_upload), axis=1)

In [3]:
# creating a dictionary of metaphor labels, where each key is a filename
met_labels = {}
for name in met_df['filename'].unique():
    # creating a new dataframe only containing labels corresponding to one file
    new_df = met_df[met_df['filename']==name][['filename', 'labels']].reset_index()
    # creating a list to save the labels in 
    labels_dic = {}
    for el in new_df['labels'][0]:
        # adding labels to the label list 
        if el['from_name']=='source':
            start = el['value']['start']
            end = el['value']['end']
            for source in el['value']['taxonomy'][0]:
                try: 
                    labels_dic[source].append([start,end])
                except KeyError: 
                    labels_dic[source] = [[start,end]]
    # saving the dic of labels to the dictionary of filenames
    met_labels[name] = labels_dic

## Corpus Analysis

In [4]:
# creating a new dataframe based on the metaphor dataframe, with fewer columns
df_analysis=met_df[['filename', 'data']]
# creating a column that measures the length of the text associated with each filename
df_analysis['length'] = df_analysis.apply(lambda row: len(tokenizer(row.data['text'])), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_analysis['length'] = df_analysis.apply(lambda row: len(tokenizer(row.data['text'])), axis=1)


In [5]:
# shortest comments
df_analysis.loc[(df_analysis.length == min(df_analysis.length))]

Unnamed: 0,filename,data,length
40,china_49.txt,"{'text': 'Racist claptrap, MSL.'}",3
70,property_73.txt,{'text': 'Smear campaign IMO'},3
81,refugees_22.txt,{'text': 'Don't think so'},3
132,watch_40.txt,{'text': 'Great piece! Thanks.'},3
141,watch_64.txt,{'text': 'Loved this article!'},3


In [6]:
# longest
df_analysis.loc[(df_analysis.length == max(df_analysis.length))]

Unnamed: 0,filename,data,length
139,watch_57.txt,{'text': 'Information overload is real. And ve...,441


In [7]:
# statistics
print('There are {} comments in the corpus. The comments range between {} and {} in number of tokens, with an average of {}. \
The total number of tokens in the corpus is {}.'.format(len(df_analysis), min(df_analysis.length), 
                                                        max(df_analysis.length), sum(df_analysis.length)/len(df_analysis), 
                                                        sum(df_analysis.length)))

There are 150 comments in the corpus. The comments range between 3 and 441 in number of tokens, with an average of 68.39333333333333. The total number of tokens in the corpus is 10259.


## Results

### Common Source Domains

In [8]:
# creating a list of all the potential sources
source_list = []
for file in met_labels.keys():
    source_list.extend(list(met_labels[file]))
# creating a dictionary where every potential source is a key and setting the values to zero
source_dic = {}
for source in set(source_list):
    source_dic[source]=0

In [9]:
# populating the dictionary by counting the number of occurences for each source
for file in met_labels.keys():
    # going through each source in the file's labels 
    for source in met_labels[file].keys():
        # updating the count in the source dictionary
        source_dic[source] = source_dic[source] + len(met_labels[file][source])

In [10]:
sorted(source_dic.items(), key=lambda x: x[1], reverse=True)

[('Other', 27),
 ('Adversary', 23),
 ('Container(s)', 17),
 ('Game', 12),
 ('Transferring an Object', 11),
 ('Up', 10),
 ('Journey', 10),
 ('Control', 9),
 ('Body', 9),
 ('War', 9),
 ('Cover', 8),
 ('Object(s)', 8),
 ('Problem', 8),
 ('Money', 8),
 ('Location', 8),
 ('Possession(s)', 8),
 ('Upward Movement', 7),
 ('Education', 7),
 ('Plant', 7),
 ('Preventing movement/motion', 7),
 ('Obligations, Duties, Responsibilities', 6),
 ('Physical Properties', 6),
 ('Motion/Movement', 6),
 ('Color', 5),
 ('Commercial Transaction', 5),
 ('Moving Object', 5),
 ('Communication', 5),
 ('Lacking Possession', 5),
 ('Machine/Mechanism', 5),
 ('Motion', 5),
 ('Comparison', 5),
 ('Constraint', 4),
 ('Vision', 4),
 ('Physical Forces', 4),
 ('Growth/Rise', 4),
 ('Time', 4),
 ('Water', 4),
 ('Following', 4),
 ('Down', 4),
 ('Sound', 4),
 ('Object', 4),
 ('Lacking a Needed Possession', 4),
 ('Forward Movement', 4),
 ('Adversity', 4),
 ('Possessing/Having/Possessions', 3),
 ('Liquid', 3),
 ('Building', 3),
 

### Keywords

In [11]:
# extracting and combining all the highlighted metaphor texts
text = ''
for row in met_df['labels']:
    for label in row:
        text = text + ' ' + label['value']['text']

In [12]:
# create a list of the words 
tokenizer = nltk.tokenize.RegexpTokenizer('\w+')
tokens = tokenizer.tokenize(text)

# create a list where all the words are in lowercase
words = []
for word in tokens:
    words.append(word.lower())

# create a new list without stop words
sw = stopwords.words('english')
new_words = []
for word in words:
    if word not in sw:
        new_words.append(word)

# create and return the frequency distribution        
freqdist = nltk.FreqDist(new_words)
freqdist

FreqDist({'china': 17, 'start': 8, 'time': 8, 'real': 8, 'us': 8, 'money': 8, 'like': 7, 'point': 6, 'take': 6, 'one': 6, ...})