<a href="https://colab.research.google.com/github/spatank/CIS-530/blob/master/Homework%205/homework_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [39]:
pip install pymagnitude



In [0]:
from pymagnitude import *
from itertools import combinations
from prettytable import PrettyTable
from sklearn.cluster import KMeans
import random
from sklearn.cluster import KMeans

In [41]:
from google.colab import drive
drive.mount('/content/drive')

# to view contents, run following line
# !ls drive/My\ Drive/CIS-530/Homework\ 5/Data

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
def load_input_file(file_path):
    """
    Loads the input file to two dictionaries
    :param file_path: path to an input file
    :return: 2 dictionaries:
    1. Dictionary, where key is a target word and value is a list of paraphrases
    2. Dictionary, where key is a target word and value is a number of clusters
    """
    word_to_paraphrases_dict = {}
    word_to_k_dict = {}

    with open(file_path, 'r') as fin:
        for line in fin:
            target_word, k, paraphrases = line.split(' :: ')
            word_to_k_dict[target_word] = int(k)
            word_to_paraphrases_dict[target_word] = paraphrases.split()

    return word_to_paraphrases_dict, word_to_k_dict


def load_output_file(file_path):
    """
    :param file_path: path to an output file
    :return: A dictionary, where key is a target word and value is a list of list of paraphrases
    """
    clusterings = {}

    with open(file_path, 'r') as fin:
        for line in fin:
            target_word, _, paraphrases_in_cluster = line.strip().split(' :: ')
            paraphrases_list = paraphrases_in_cluster.strip().split()
            if target_word not in clusterings:
                clusterings[target_word] = []
            clusterings[target_word].append(paraphrases_list)

    return clusterings


def write_to_output_file(file_path, clusterings):
    """
    Writes the result of clusterings into an output file
    :param file_path: path to an output file
    :param clusterings:  A dictionary, where key is a target word and value is a list of list of paraphrases
    :return: N/A
    """
    with open(file_path, 'w') as fout:
        for target_word, clustering in clusterings.items():
            for i, cluster in enumerate(clustering):
                fout.write(f'{target_word} :: {i + 1} :: {" ".join(cluster)}\n')
        fout.close()


def get_paired_f_score(gold_clustering, predicted_clustering):
    """
    :param gold_clustering: gold list of list of paraphrases
    :param predicted_clustering: predicted list of list of paraphrases
    :return: Paired F-Score
    """
    gold_pairs = set()
    for gold_cluster in gold_clustering:
        for pair in combinations(gold_cluster, 2):
            gold_pairs.add(tuple(sorted(pair)))

    predicted_pairs = set()
    for predicted_cluster in predicted_clustering:
        for pair in combinations(predicted_cluster, 2):
            predicted_pairs.add(tuple(sorted(pair)))

    overlapping_pairs = gold_pairs & predicted_pairs

    precision = 1. if len(predicted_pairs) == 0 else float(len(overlapping_pairs)) / len(predicted_pairs)
    recall = 1. if len(gold_pairs) == 0 else float(len(overlapping_pairs)) / len(gold_pairs)
    paired_f_score = 0. if precision + recall == 0 else 2 * precision * recall / (precision + recall)

    return paired_f_score


def evaluate_clusterings(gold_clusterings, predicted_clusterings):
    """
    Displays evaluation scores between gold and predicted clusterings
    :param gold_clusterings: dictionary where key is a target word and value is a list of list of paraphrases
    :param predicted_clusterings: dictionary where key is a target word and value is a list of list of paraphrases
    :return: N/A
    """
    target_words = set(gold_clusterings.keys()) & set(predicted_clusterings.keys())

    if len(target_words) == 0:
        print('No overlapping target words in ground-truth and predicted files')
        return None

    paired_f_scores = np.zeros((len(target_words)))
    ks = np.zeros((len(target_words)))

    table = PrettyTable(['Target', 'k', 'Paired F-Score'])
    for i, target_word in enumerate(target_words):
        paired_f_score = get_paired_f_score(gold_clusterings[target_word], predicted_clusterings[target_word])
        k = len(gold_clusterings[target_word])
        paired_f_scores[i] = paired_f_score
        ks[i] = k
        table.add_row([target_word, k, f'{paired_f_score:0.4f}'])

    average_f_score = np.average(paired_f_scores, weights=ks)
    print(table)
    print(f'=> Average Paired F-Score:  {average_f_score:.4f}')


In [0]:
random.seed(123)

# TASK 3.1
def cluster_random(word_to_paraphrases_dict, word_to_k_dict):
    """
    Clusters paraphrases randomly
    :param word_to_paraphrases_dict: dictionary, where key is a target word and value is a list of paraphrases
    :param word_to_k_dict: dictionary, where key is a target word and value is a number of clusters
    :return: dictionary, where key is a target word and value is a list of list of paraphrases,
    where each list corresponds to a cluster
    """
    clusterings = {}

    for target_word in word_to_paraphrases_dict.keys():
        paraphrase_list = word_to_paraphrases_dict[target_word]
        clusters = []
        k = word_to_k_dict[target_word] # number of clusters for target word
        chosen_paraphrases = set() # keep track of any paraphrases that may not be randomly chosen
        for cluster in range(k): 
          # each word must have a cluster, each cluster must have a word
          cluster_list = random.choices(paraphrase_list, k = int(np.round(len(paraphrase_list)/k)))
          chosen_paraphrases.update(cluster_list)
          clusters.append(cluster_list)
        for paraphrase in paraphrase_list:
          if paraphrase not in chosen_paraphrases:
            # choose a random cluster list and append unassigned word to it
            random.choice(clusters).append(paraphrase) 
        clusterings[target_word] = clusters

    return clusterings

In [44]:
input_filepath = 'drive/My Drive/CIS-530/Homework 5/Data/data/dev_input.txt'
output_filepath = 'drive/My Drive/CIS-530/Homework 5/Data/data/dev_output.txt'
word_to_paraphrases_dict, word_to_k_dict = load_input_file(input_filepath)
gold_clusterings = load_output_file(output_filepath)
predicted_clusterings = cluster_random(word_to_paraphrases_dict, word_to_k_dict)
evaluate_clusterings(gold_clusterings, predicted_clusterings)

+----------------+----+----------------+
|     Target     | k  | Paired F-Score |
+----------------+----+----------------+
|     wash.v     | 13 |     0.1697     |
|    watch.v     | 5  |     0.2393     |
|    expect.v    | 6  |     0.2927     |
|    paper.n     | 7  |     0.3158     |
|     miss.v     | 8  |     0.2316     |
|     eat.v      | 6  |     0.2922     |
|  atmosphere.n  | 6  |     0.2500     |
|     note.v     | 3  |     0.5957     |
|     use.v      | 6  |     0.3747     |
|   judgment.n   | 7  |     0.1887     |
|   express.v    | 7  |     0.2300     |
|   operate.v    | 7  |     0.2544     |
|    begin.v     | 8  |     0.2132     |
|   produce.v    | 7  |     0.2305     |
|    smell.v     | 4  |     0.5169     |
|     mean.v     | 6  |     0.2431     |
|   interest.n   | 5  |     0.2340     |
|    party.n     | 5  |     0.2421     |
|   suspend.v    | 6  |     0.2034     |
|    source.n    | 9  |     0.1728     |
|  difference.n  | 5  |     0.3724     |
|     bank.n    

In [0]:
# word_to_paraphrases_dict, word_to_k_dict = load_input_file('drive/My Drive/CIS-530/Homework 5/Data/data/test_input.txt')
# predicted_clusterings = cluster_random(word_to_paraphrases_dict, word_to_k_dict)
# write_to_output_file('drive/My Drive/CIS-530/Homework 5/test_output_random.txt', predicted_clusterings)

In [0]:
def create_PPMI_matrix(term_context_matrix):
  '''Given a term context matrix, output a PPMI matrix.
  See section 15.1 in the textbook.

  Hint: Use numpy matrix and vector operations to speed up implementation.
  
  Input:
    term_context_matrix: A nxn numpy array, where n is
        the numer of tokens in the vocab.
  
  Returns: A nxn numpy matrix, where A_ij is equal to the
     point-wise mutual information between the ith word
     and the jth word in the term_context_matrix.
  '''       
  target_counts = np.sum(term_context_matrix, axis = 1)
  context_counts = np.sum(term_context_matrix, axis = 0)
  total = np.sum(term_context_matrix) # matrix sum
  PPMI_matrix = np.divide(np.multiply(term_context_matrix,total),np.multiply(target_counts,context_counts.T))
  PPMI_matrix = np.maximum(0, np.log2(PPMI_matrix+1e-6))
  
  return PPMI_matrix

In [0]:
# TASK 3.2
def cluster_with_sparse_representation(word_to_paraphrases_dict, word_to_k_dict):
    """
    Clusters paraphrases using sparse vector representation
    :param word_to_paraphrases_dict: dictionary, where key is a target word and value is a list of paraphrases
    :param word_to_k_dict: dictionary, where key is a target word and value is a number of clusters
    :return: dictionary, where key is a target word and value is a list of list of paraphrases,
    where each list corresponds to a cluster
    """
    # Note: any vector representation should be in the same directory as this file
    vectors_root_path = 'drive/My Drive/CIS-530/Homework 5/Data/vectors/'
    vectors_path = 'coocvec-500mostfreq-window-3.filter.magnitude'
    vectors = Magnitude(vectors_root_path + vectors_path)
    clusterings = {}

    for target_word in word_to_paraphrases_dict.keys():
        paraphrase_list = word_to_paraphrases_dict[target_word]
        clusters = []
        k = word_to_k_dict[target_word]
        chosen_paraphrases = set() # keep track of any paraphrases that may not be chosen
        X = np.zeros((len(paraphrase_list), 500))
        # build data matrix
        for idx, paraphrase in enumerate(paraphrase_list):
          X[idx,:] = vectors.query(paraphrase)
        # X = create_PPMI_matrix(X)
        kmeans = KMeans(n_clusters = k).fit(X)
        for cluster in range(k):
          cluster_list = [paraphrase_list[idx] 
                          for idx, label in enumerate(kmeans.labels_) 
                          if label == cluster]
          if len(cluster_list) == 0:
            print('Empty cluster\n')
          chosen_paraphrases.update(cluster_list)
          clusters.append(cluster_list)
        for paraphrase in paraphrase_list:
          if paraphrase not in chosen_paraphrases:
            # choose a random cluster list and append unassigned word to it
            random.choice(clusters).append(paraphrase) 
        clusterings[target_word] = clusters

    return clusterings

In [81]:
input_filepath = 'drive/My Drive/CIS-530/Homework 5/Data/data/dev_input.txt'
output_filepath = 'drive/My Drive/CIS-530/Homework 5/Data/data/dev_output.txt'
word_to_paraphrases_dict, word_to_k_dict = load_input_file(input_filepath)
gold_clusterings = load_output_file(output_filepath)
predicted_clusterings = cluster_with_sparse_representation(word_to_paraphrases_dict, word_to_k_dict)
evaluate_clusterings(gold_clusterings, predicted_clusterings)

+----------------+----+----------------+
|     Target     | k  | Paired F-Score |
+----------------+----+----------------+
|     wash.v     | 13 |     0.1685     |
|    watch.v     | 5  |     0.3796     |
|    expect.v    | 6  |     0.2767     |
|    paper.n     | 7  |     0.3085     |
|     miss.v     | 8  |     0.2157     |
|     eat.v      | 6  |     0.2543     |
|  atmosphere.n  | 6  |     0.3184     |
|     note.v     | 3  |     0.4878     |
|     use.v      | 6  |     0.4096     |
|   judgment.n   | 7  |     0.2212     |
|   express.v    | 7  |     0.2329     |
|   operate.v    | 7  |     0.2144     |
|    begin.v     | 8  |     0.3377     |
|   produce.v    | 7  |     0.3159     |
|    smell.v     | 4  |     0.2500     |
|     mean.v     | 6  |     0.3391     |
|   interest.n   | 5  |     0.2586     |
|    party.n     | 5  |     0.2664     |
|   suspend.v    | 6  |     0.2090     |
|    source.n    | 9  |     0.2341     |
|  difference.n  | 5  |     0.3354     |
|     bank.n    

In [0]:
word_to_paraphrases_dict, word_to_k_dict = load_input_file('drive/My Drive/CIS-530/Homework 5/Data/data/test_input.txt')
predicted_clusterings = cluster_with_sparse_representation(word_to_paraphrases_dict, word_to_k_dict)
write_to_output_file('drive/My Drive/CIS-530/Homework 5/test_output_sparse.txt', predicted_clusterings)

In [0]:
# TASK 3.3
def cluster_with_dense_representation(word_to_paraphrases_dict, word_to_k_dict):
    """
    Clusters paraphrases using dense vector representation
    :param word_to_paraphrases_dict: dictionary, where key is a target word and value is a list of paraphrases
    :param word_to_k_dict: dictionary, where key is a target word and value is a number of clusters
    :return: dictionary, where key is a target word and value is a list of list of paraphrases,
    where each list corresponds to a cluster
    """
    # Note: any vector representation should be in the same directory as this file
    vectors = Magnitude("GoogleNews-vectors-negative300.filter.magnitude")
    clusterings = {}

    for target_word in word_to_paraphrases_dict.keys():
        paraphrase_list = word_to_paraphrases_dict[target_word]
        k = word_to_k_dict[target_word]
        # TODO: Implement
        clusterings[target_word] = None

    return clusterings

In [0]:
# TASK 3.4
def cluster_with_no_k(word_to_paraphrases_dict):
    """
    Clusters paraphrases using any vector representation
    :param word_to_paraphrases_dict: dictionary, where key is a target word and value is a list of paraphrases
    :return: dictionary, where key is a target word and value is a list of list of paraphrases,
    where each list corresponds to a cluster
    """
    # Note: any vector representation should be in the same directory as this file
    vectors = Magnitude("GoogleNews-vectors-negative300.filter.magnitude")
    clusterings = {}

    for target_word in word_to_paraphrases_dict.keys():
        paraphrase_list = word_to_paraphrases_dict[target_word]
        # TODO: Implement
        clusterings[target_word] = None

    return clusterings