# CMT Analysis

## Import Packages

In [1]:
import pandas as pd
import re
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
import nltk
from nltk.corpus import stopwords 
nlp = English()
tokenizer = Tokenizer(nlp.vocab)

  from .autonotebook import tqdm as notebook_tqdm


## Reading and Cleaning Data

### Metaphor Annotations

In [2]:
# load in CMT annotation json and save it to a pandas dataframe
file_path = 'CMT_July30.json'
met_df = pd.read_json(file_path)

# extracting labels from the annotations column
met_df['labels'] = met_df.apply(lambda row: row.annotations[0]['result'], axis=1)

# cleaning up file names 
met_df['filename'] = met_df.apply(lambda row: re.sub(r"^[^_]*-", '', row.file_upload), axis=1)
met_df['filename'] = met_df.apply(lambda row: re.sub(r"_fixed", '', row.filename[:-4].lower()), axis=1)

In [3]:
# creating a dictionary of metaphor labels, where each key is a filename
met_labels = {}
for name in met_df['filename'].unique():
    # creating a new dataframe only containing labels corresponding to one file
    new_df = met_df[met_df['filename']==name][['filename', 'labels']].reset_index()
    # creating a list to save the labels in 
    labels_dic = {}
    for el in new_df['labels'][0]:
        # adding labels to the label list 
        if el['from_name']=='source':
            start = el['value']['start']
            end = el['value']['end']
            for source in el['value']['taxonomy'][0]:
                try: 
                    labels_dic[source].append([start,end])
                except KeyError: 
                    labels_dic[source] = [[start,end]]
    # saving the dic of labels to the dictionary of filenames
    met_labels[name] = labels_dic

## Corpus Analysis

In [5]:
# creating a new dataframe based on the metaphor dataframe, with fewer columns
df_analysis=met_df[['filename', 'data']]
# creating a column that measures the length of the text associated with each filename
df_analysis['length'] = df_analysis.apply(lambda row: len(tokenizer(row.data['text'])), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_analysis['length'] = df_analysis.apply(lambda row: len(tokenizer(row.data['text'])), axis=1)


In [6]:
# shortest comments
df_analysis.loc[(df_analysis.length == min(df_analysis.length))]

Unnamed: 0,filename,data,length
40,china_49,"{'text': 'Racist claptrap, MSL.'}",3
70,property_73,{'text': 'Smear campaign IMO'},3
81,refugees_22,{'text': 'Don't think so'},3
132,watch_40,{'text': 'Great piece! Thanks.'},3
141,watch_64,{'text': 'Loved this article!'},3


In [7]:
# longest
df_analysis.loc[(df_analysis.length == max(df_analysis.length))]

Unnamed: 0,filename,data,length
139,watch_57,{'text': 'Information overload is real. And ve...,441


In [8]:
# statistics
print('There are {} comments in the corpus. The comments range between {} and {} in number of tokens, with an average of {}. \
The total number of tokens in the corpus is {}.'.format(len(df_analysis), min(df_analysis.length), 
                                                        max(df_analysis.length), sum(df_analysis.length)/len(df_analysis), 
                                                        sum(df_analysis.length)))

There are 150 comments in the corpus. The comments range between 3 and 441 in number of tokens, with an average of 68.39333333333333. The total number of tokens in the corpus is 10259.


In [9]:
# creating a list to contain the length of each label
labels_diff = []
for all_labels in met_labels.values():
    for label in all_labels.values():
        for indices in label:
            # calculating the difference in indices
            length=indices[1]-indices[0]
            # appending the length to the list
            labels_diff.append(length)
print('There are {} labels in the CMT annotation layer, and the average span of each label is {} characters.'.format(len(labels_diff),
                                                                                                                    sum(labels_diff)/len(labels_diff)))

There are 830 labels in the CMT annotation layer, and the average span of each label is 22.643373493975904 characters.


## Results

### Common Source Domains

In [10]:
# creating a list of all the potential sources
source_list = []
for file in met_labels.keys():
    source_list.extend(list(met_labels[file]))

In [11]:
def top_sources(labels_dic):
    '''
    takes a dictionary of labels as input and returns a sorted list of labels based on their frequency
    '''
    # creating a dictionary where every potential source is a key and setting the values to zero
    source_dic = {}
    for source in set(source_list):
        source_dic[source]=0
    # populating the dictionary by counting the number of occurences for each source
    for file in labels_dic.keys():
        # going through each source in the file's labels 
        for source in labels_dic[file].keys():
            # updating the count in the source dictionary
            source_dic[source] = source_dic[source] + len(labels_dic[file][source])
    return sorted(source_dic.items(), key=lambda x: x[1], reverse=True)

In [12]:
# top sources overall
top_sources(met_labels)

[('Container(s)', 50),
 ('Transferring an Object', 29),
 ('Object(s)', 23),
 ('Motion', 22),
 ('Possession(s)', 21),
 ('Adversary', 21),
 ('Body', 21),
 ('Location', 19),
 ('Other', 17),
 ('Game', 16),
 ('Journey', 16),
 ('Physical Properties', 15),
 ('Control', 15),
 ('Comparison', 15),
 ('Substance', 14),
 ('Lacking Possession', 14),
 ('Down', 14),
 ('Moving Object', 13),
 ('Cover', 12),
 ('War', 11),
 ('Sound', 11),
 ('Machine/Mechanism', 11),
 ('Up', 11),
 ('Path(s)/Pathway', 10),
 ('Constructed Object', 10),
 ('Education', 9),
 ('Problem Solving', 9),
 ('Money', 8),
 ('Obligations, Duties, Responsibilities', 8),
 ('Time', 8),
 ('Plant', 8),
 ('Direction', 8),
 ('Constraint', 7),
 ('Building', 7),
 ('Preventing movement/motion', 7),
 ('Commercial Transaction', 7),
 ('Vision', 7),
 ('Upward Movement', 7),
 ('Growth', 6),
 ('Barrier/Obstacle', 6),
 ('Maintaining Position', 6),
 ('Forward Movement', 6),
 ('Object', 6),
 ('Following', 6),
 ('Communication', 6),
 ('Commodities, Goods, V

In [13]:
# creating dictionaries of labels for each of the 6 topics
aboriginal_labels = dict(filter(lambda item: 'aboriginal' in item[0], met_labels.items()))
china_labels = dict(filter(lambda item: 'china' in item[0], met_labels.items()))
property_labels = dict(filter(lambda item: 'property' in item[0], met_labels.items()))
refugees_labels = dict(filter(lambda item: 'refugees' in item[0], met_labels.items()))
uber_labels = dict(filter(lambda item: 'uber' in item[0], met_labels.items()))
watch_labels = dict(filter(lambda item: 'watch' in item[0], met_labels.items()))

In [14]:
top_sources(aboriginal_labels)

[('Container(s)', 9),
 ('Object', 6),
 ('Location', 6),
 ('Possession(s)', 5),
 ('Transferring an Object', 5),
 ('Control', 5),
 ('Plant', 5),
 ('Machine/Mechanism', 4),
 ('Journey', 4),
 ('Growth/Rise', 4),
 ('Problem Solving', 3),
 ('Moving Object', 3),
 ('Adversary', 3),
 ('Linked Objects', 3),
 ('Comparison', 3),
 ('Other', 3),
 ('Game', 2),
 ('Sound', 2),
 ('Backward Movement', 2),
 ('Body Of Water', 2),
 ('Maintaining Position', 2),
 ('Money', 2),
 ('Crime', 2),
 ('Vehicle', 2),
 ('Up', 2),
 ('Brittle Object', 1),
 ('Motion', 1),
 ('Cause, Causation', 1),
 ('Disposables/Garbage', 1),
 ('Object(s)', 1),
 ('Destruction', 1),
 ('Goal', 1),
 ('Barrier/Obstacle', 1),
 ('Cover', 1),
 ('1-On-1 Physical Aggression, Fight', 1),
 ('Struggle', 1),
 ('Cause, Causation, Change', 1),
 ('Food', 1),
 ('Circular movement', 1),
 ('Foreward Movement/Motion', 1),
 ('Electrification/Electricity', 1),
 ('People', 1),
 ('Field of Vision', 1),
 ('Violence', 1),
 ('Senses (seeing/hearing/touching/etc.)',

In [15]:
top_sources(china_labels)

[('Container(s)', 14),
 ('Object(s)', 7),
 ('Location', 6),
 ('Body', 6),
 ('Lacking Possession', 5),
 ('Down', 4),
 ('Physical Properties', 4),
 ('Communication', 4),
 ('Possessing/Having/Possessions', 3),
 ('War', 3),
 ('Sound', 3),
 ('Constraint', 3),
 ('Moving Object', 3),
 ('Adversary', 3),
 ('Transferring an Object', 3),
 ('Control', 3),
 ('Comparison', 3),
 ('1-Or-1 Physical Aggression, Fight', 2),
 ('Cover', 2),
 ('Machine/Mechanism', 2),
 ('Tool', 2),
 ('Cause, Causation, Change', 2),
 ('Building', 2),
 ('Journey', 2),
 ('Up', 2),
 ('Following', 2),
 ('Confinement', 2),
 ('Commercial Transaction', 2),
 ('Direction', 2),
 ('Brittle Object', 1),
 ('Light', 1),
 ('Visibility', 1),
 ('Motion', 1),
 ('Substance', 1),
 ('Size', 1),
 ('Game', 1),
 ('Disposables/Garbage', 1),
 ('Possessor', 1),
 ('Goal', 1),
 ('Power', 1),
 ('Possessing', 1),
 ('Making Visible', 1),
 ('Solving a Puzzle', 1),
 ('Job/Career', 1),
 ('Clothing', 1),
 ('Maintaining Position', 1),
 ('Child', 1),
 ('Region i

In [16]:
top_sources(property_labels)

[('Substance', 11),
 ('Body', 11),
 ('Motion', 9),
 ('Game', 9),
 ('Container(s)', 9),
 ('Education', 7),
 ('War', 5),
 ('Colour', 5),
 ('Money', 4),
 ('Transferring an Object', 4),
 ('Vision', 4),
 ('Belief(s)', 3),
 ('Barrier/Obstacle', 3),
 ('Cover', 3),
 ('Machine/Mechanism', 3),
 ('Tool', 3),
 ('Journey', 3),
 ('Path(s)/Pathway', 3),
 ('Comparison', 3),
 ('Other', 3),
 ('Commodities, Goods, Value', 2),
 ('Visibility', 2),
 ('Sound', 2),
 ('Destruction', 2),
 ('Moral(s)/Morality', 2),
 ('Constraint', 2),
 ('Maintaining Position', 2),
 ('Downward Movement', 2),
 ('Possession(s)', 2),
 ('Immoral/Immorality', 2),
 ('Feeling', 2),
 ('Physical Properties', 2),
 ('Up', 2),
 ('Animal', 2),
 ('Time', 2),
 ('An External Event Exerting Force On', 2),
 ('Direction', 2),
 ('Constructed Object', 2),
 ('Fabric', 1),
 ('Weapons', 1),
 ('Growth', 1),
 ('Harm', 1),
 ('Making', 1),
 ('Job/Career', 1),
 ('Physical Forces', 1),
 ('Building', 1),
 ('Food', 1),
 ('Opportunities', 1),
 ('Logical/Logic', 

In [17]:
top_sources(refugees_labels)

[('Object(s)', 10),
 ('Lacking Possession', 9),
 ('Motion', 7),
 ('Obligations, Duties, Responsibilities', 7),
 ('Problem Solving', 6),
 ('Possession(s)', 5),
 ('Down', 5),
 ('Time', 5),
 ('Container(s)', 4),
 ('Water', 4),
 ('Transferring an Object', 4),
 ('Caregiver', 4),
 ('Adversity', 4),
 ('Rule Enforcer', 3),
 ('Up', 3),
 ('Adversary', 3),
 ('Comparison', 3),
 ('Sound', 2),
 ('Journey', 2),
 ('Crime', 2),
 ('An External Event Exerting Force On', 2),
 ('Control', 2),
 ('Path(s)/Pathway', 2),
 ('Entities', 2),
 ('Other', 2),
 ('Cause, Causation', 1),
 ('Possessor', 1),
 ('Perception(s)', 1),
 ('War', 1),
 ('Harm', 1),
 ('Power', 1),
 ('Moral(s)/Morality', 1),
 ('Job/Career', 1),
 ('Cover', 1),
 ('Religion', 1),
 ('Body Of Water', 1),
 ('Past Events Time', 1),
 ('Forward Movement', 1),
 ('Education', 1),
 ('Disgust', 1),
 ('Money', 1),
 ('Food', 1),
 ('Importance', 1),
 ('Problem', 1),
 ('Attributes', 1),
 ('Aligned/Alignment', 1),
 ('Puzzle', 1),
 ('Vertical Scale', 1),
 ('Change',

In [18]:
top_sources(uber_labels)

[('Transferring an Object', 8),
 ('Possession(s)', 7),
 ('Adversary', 7),
 ('Container(s)', 5),
 ('Growth', 4),
 ('Cover', 4),
 ('Journey', 4),
 ('Game', 3),
 ('Forward Movement', 3),
 ('Moving Object', 3),
 ('Location', 3),
 ('Commercial Transaction', 3),
 ('Constructed Object', 3),
 ('Light', 2),
 ('Commodities, Goods, Value', 2),
 ('Motion', 2),
 ('Object(s)', 2),
 ('Belief(s)', 2),
 ('Making Visible', 2),
 ('Down', 2),
 ('Physical Properties', 2),
 ('Up', 2),
 ('Following', 2),
 ('Body', 2),
 ('Preventing movement/motion', 2),
 ('Path(s)/Pathway', 2),
 ('Comparison', 2),
 ('Liquid', 2),
 ('Other', 2),
 ('Brittle Object', 1),
 ('Visibility', 1),
 ('Substance', 1),
 ('Cause, Causation', 1),
 ('Possessor', 1),
 ('Sound', 1),
 ('Harm', 1),
 ('Barrier/Obstacle', 1),
 ('Constraint', 1),
 ('Machine/Mechanism', 1),
 ('Maintaining Position', 1),
 ('Story', 1),
 ('Building', 1),
 ('Crime', 1),
 ('Forceful Extraction', 1),
 ('Farm/Domestic Animals', 1),
 ('Vehicle', 1),
 ('Function(ing)/Funct

In [19]:
top_sources(watch_labels)

[('Container(s)', 9),
 ('Physical Properties', 7),
 ('Other', 6),
 ('Points (Set up in Spatial Configuration)', 5),
 ('Wealth', 5),
 ('Transferring an Object', 5),
 ('Moving Object', 4),
 ('Adversary', 4),
 ('Object(s)', 3),
 ('Building', 3),
 ('Addiction', 3),
 ('Preventing movement/motion', 3),
 ('Control', 3),
 ('Path(s)/Pathway', 3),
 ('Upward Movement', 3),
 ('Motion', 2),
 ('War', 2),
 ('Harm', 2),
 ('Making', 2),
 ('Religion', 2),
 ('Forward Movement', 2),
 ('Possession(s)', 2),
 ('Electrification/Electricity', 2),
 ('Down', 2),
 ('Following', 2),
 ('Landscape', 2),
 ('Location', 2),
 ('Body', 2),
 ('Direction', 2),
 ('Constructed Object', 2),
 ('Commodities, Goods, Value', 1),
 ('Visibility', 1),
 ('Substance', 1),
 ('Game', 1),
 ('Cause, Causation', 1),
 ('Disposables/Garbage', 1),
 ('Growth', 1),
 ('Sound', 1),
 ('Goal', 1),
 ('Barrier/Obstacle', 1),
 ('Making Visible', 1),
 ('Constraint', 1),
 ('Cover', 1),
 ('Machine/Mechanism', 1),
 ('Body Of Water', 1),
 ('Downward Moveme

### Keywords

In [20]:
def keywords(labels_df):
    '''
    takes a dataframe of labels as input and returns the frequency distribution of the keywords in the labelled text
    '''
    # extracting and combining all the highlighted metaphor texts
    text = ''
    for row in labels_df['labels']:
        for label in row:
            text = text + ' ' + label['value']['text']
    
    # create a list of the words 
    nltk_tokenizer = nltk.tokenize.RegexpTokenizer('\w+')
    tokens = nltk_tokenizer.tokenize(text)
    
    # create a list where all the words are in lowercase
    words = []
    for word in tokens:
        words.append(word.lower())
    
    # create a new list without stop words
    sw = stopwords.words('english')
    new_words = []
    for word in words:
        if word not in sw:
            new_words.append(word)
    
    # create and return the frequency distribution        
    freqdist = nltk.FreqDist(new_words)
    return freqdist

In [21]:
# creating dataframes for each topic
aboriginal_df = met_df[met_df.filename.str.contains('aboriginal')]
china_df = met_df[met_df.filename.str.contains('china')]
property_df = met_df[met_df.filename.str.contains('property')]
refugees_df = met_df[met_df.filename.str.contains('refugees')]
uber_df = met_df[met_df.filename.str.contains('uber')]
watch_df = met_df[met_df.filename.str.contains('watch')]

In [22]:
# keywords over the entire corpus
keywords(met_df)

FreqDist({'china': 18, 'tax': 18, 'time': 13, 'one': 13, 'canada': 13, 'point': 11, 'go': 11, 'back': 10, 'like': 10, 'us': 10, ...})

In [23]:
keywords(aboriginal_df)

FreqDist({'start': 7, 'abuse': 4, 'address': 4, 'raised': 4, 'mainstream': 4, 'problem': 3, 'alcohol': 3, 'time': 3, 'long': 3, 'past': 3, ...})

In [24]:
keywords(china_df)

FreqDist({'china': 17, 'us': 7, 'war': 5, 'military': 5, 'chinese': 4, 'un': 4, 'america': 4, 'international': 4, 'ruling': 4, 'like': 3, ...})

In [25]:
keywords(property_df)

FreqDist({'tax': 18, 'head': 10, 'race': 6, 'canada': 5, 'money': 5, 'professor': 4, 'back': 4, 'racism': 4, 'applies': 4, 'white': 4, ...})

In [26]:
keywords(refugees_df)

FreqDist({'lack': 6, 'take': 6, 'refugees': 5, 'europe': 5, 'canada': 5, 'low': 4, 'shortage': 3, 'syrian': 3, 'mass': 3, 'accepted': 3, ...})

In [27]:
keywords(uber_df)

FreqDist({'uber': 6, 'money': 5, 'drivers': 4, 'taxi': 4, 'follow': 4, 'commercial': 4, 'lost': 4, 'driver': 3, 'glaring': 3, 'one': 3, ...})

In [28]:
keywords(watch_df)

FreqDist({'tech': 8, 'point': 5, 'go': 5, 'line': 4, 'age': 4, 'way': 4, 'world': 4, 'one': 4, 'time': 3, 'around': 3, ...})