In [64]:
import tensorflow as tf

ModuleNotFoundError: No module named 'tensorflow'

In [63]:
from Utils.methods import change_the_sentences

from keras import models, layers
from sklearn import metrics
import numpy as np

class Keras_Model:

    def __init__(self, params):
        '''
        Initializing Keras Model for Sentiment Analysis Task

        Parameters
        __________
        params: dict | Information regarding number of words, max sent.length, embedding dimension
        '''
        self.model = models.Sequential()
        self.model.add(layers.Embedding(params["NB_WORDS"], params['EMB_DIM'], input_length=params['MAX_LEN']))
        self.model.add(layers.Flatten())
        self.model.add(layers.Dense(1, activation='sigmoid'))

ModuleNotFoundError: No module named 'keras'

In [53]:
from Evaluation.weat_analysis import *
from Utils.methods import *
from Utils.sets import *

from sklearn.metrics.pairwise import cosine_similarity as cs
from sklearn.decomposition import PCA
from scipy.linalg import null_space as ns
from nltk.corpus import stopwords
from copy import deepcopy as cp
import pickle
import time

class Word_Embedding:

    def __init__(self, filename, dimension=300):
        """
        Word Embedding initialization

        Parameters
        ----------
        filename: String | directory of the .txt Embedding file
        dimension: int | Embedding dimensionality (has to correspond with .txt file)
        """
        self.words = []
        self.vectors = []
        self.word_idx = {}
        self.dimension = dimension
        assert (filename is not None)

        with open(filename+".vocab", "r") as f:
            self.words = [line.strip() for line in f]
        vectors = np.load(filename+".npy")
        self.vectors = (vectors.T/np.linalg.norm(vectors,axis=1)).T
        word_counter = np.shape(self.vectors)[0]
        self.word_ids = {w:i for i,w in enumerate(self.words)}
        
        print(f'Regular embedding successfully read. Shape: {np.shape(self.vectors)}')

    def get_value(self, word):
        '''
        Get vector representation for particular word

        Parameters
        ----------
        word: String | String representation of the word

        Returns
        -------
        vectors: list | Vector representation of the word
        '''
        try:
            return self.vectors[self.get_index_out_of_word(word)]
        except:
            print(f'Word {word} not in the Embedding.')

    def get_word_out_of_index(self, idx):
        '''
        Take word with the corresponding index from dictionary

        Parameters
        ----------
        idx: int | Index of a word

        Returns
        -------
        word: String | String representation of the word
        '''
        return self.vectors[idx]

    def get_index_out_of_word(self, word):
        '''
        Take word with the corresponding index from dictionary

        Parameters
        ----------
        word: String | String representation of the word

        Returns
        -------
        index: int | Index for the given word
        '''
        return self.word_idx[word]

    def normalize_vectors(self):
        '''
        Vector normalization
        '''
        self.vectors /= np.linalg.norm(self.vectors, axis=1)[:, np.newaxis]

    def get_top_k_neighbors(self, word, k):
        '''
        Find top k neighbors for a given word

        Parameters
        ----------
        word: String | Word for which we ought to find closest k words
        k: int | Number of closest neighbors we ought to output

        Returns
        -------
        top_k_words : list | Top k closest words in space
        '''
        v_s = self.vectors / np.linalg.norm(self.vectors, axis=1)[:, np.newaxis]
        query_vector = v_s[self.get_index_out_of_word(word)]
        dots = np.dot(v_s[:, np.newaxis], query_vector)
        result = {self.words[i]: round(float(dot),3) for i, dot in enumerate(dots)}
        return sorted(result.items(), key=lambda x: x[1], reverse=True)[1:(k + 1)]

    def get_center_vector(self, def_sets):
        '''
        Paper reference: https://www.aclweb.org/anthology/N19-1062/
        Code source: https://github.com/TManzini/DebiasMulticlassWordEmbedding
        '''
        means = {}
        for k, v in def_sets.items():
            wSet = []
            for w in v:
                try:
                    wSet.append(self.get_value(w))
                except KeyError as e:
                    pass
            set_vectors = np.array(wSet)
            means[k] = np.mean(set_vectors, axis=0)
            
        # calculate vectors to perform PCA
        matrix = []
        for k, v in def_sets.items():
            wSet = []
            for w in v:
                try:
                    wSet.append(self.get_value(w))
                except KeyError as e:
                    pass
            set_vectors = np.array(wSet)
            diffs = set_vectors - means[k]
            matrix.append(diffs)

        matrix = np.concatenate(matrix)

        pca = PCA(n_components=1)
        pca.fit(matrix)
        return pca.components_[0]

    def hard_weat(self, bias_levels, bias_combinations, subspace_words, sets, neighbors_threshold=1):
        '''
        HardWEAT Debiasing

        Parameters
        ----------
        bias_levels: dict | Bias levels for each class respectively
        bias_combinations: dict | Classes and respective subclasses to be included in debiasing, by default in following form : {"gender" : ["male_terms", "female_terms"],   "race": ["black_names", "white_names"], "religion" : ["islam_words", "atheism_words", "christianity_words"]}
        subspace_words: set | Words within the default combination dictionary
        sets: dict | Existing attribute and target set of words
        neighbors_threshold: float | Cosine similarity float threshold for equidistancing phase
        '''
        def_vectors, subcategories_vectors, r_cat= {}, {}, 0.0000000000000000001
        temp_sets = cp(sets)
        def_sets = get_hardweat_sets()

        def_vectors = {bias_category: self.get_center_vector(def_sets[bias_category]) for bias_category in bias_combinations}
        centroid = generate_centroid(scale_bias(bias_levels), def_vectors)
        neutral_words = list(set(self.words) -subspace_words)

        start = time.time()
        print(f'Start of neutralization, there is total of {len(neutral_words)} neutral words out of total {len(self.words)} words.')
        neutral_indices = [self.word_idx[word] for word in neutral_words]
        self.vectors[neutral_indices] = neutralize_vectors(self.vectors[neutral_indices,:], centroid)
        self.normalize_vectors()
        end = time.time();
        print(f'Neutralization done in {round(end-start, 3)}s, starting with neighbor thresholding and equidistancing...')

        start = time.time()
        for key_category in def_vectors:

            subcat_keys_within_this_cat = [x for t in list(bias_combinations[key_category].keys()) for x in t]
            vectors_for_equidistancing = {key: np.zeros(self.dimension) for key in subcat_keys_within_this_cat}
            center_vector = neutralize_vectors(def_vectors[key_category], centroid)
            equidistant_def_subcat_vectors_dict = make_vectors_equidistant(center_vector, vectors_for_equidistancing, r_cat)

            for i, key_subcategory in enumerate(subcat_keys_within_this_cat):
                values_okay, values_not_ok_idx, values_not_ok_idx_max = False, 0, 500
                while not values_okay:

                    r_subcat = random.randint(1, 2**16-1)
                    new_vectors = make_vectors_equidistant(equidistant_def_subcat_vectors_dict[key_subcategory], {word: self.get_value(word) for word in temp_sets[key_subcategory]}, r_subcat)

                    found_artifact = False
                    matrix_of_similarities = cs(np.float16(list(new_vectors.values())), np.float16(self.vectors))

                    for ravel_idx, cs_value in enumerate(np.ravel(matrix_of_similarities)):
                        if (cs_value>neighbors_threshold):
                            found_artifact = True
                            values_not_ok_idx+=1
                            if(values_not_ok_idx % 100==0):
                                print(f'{values_not_ok_idx} unsuccessful equdistancing iterations for {key_subcategory}')
                            break

                    if (found_artifact == True and values_not_ok_idx<values_not_ok_idx_max):
                        values_okay = False
                    else:
                        for key in new_vectors:
                            self.vectors[self.get_index_out_of_word(key)] = new_vectors[key]
                        values_okay = True
                        if (values_not_ok_idx>=values_not_ok_idx_max):
                            print(f'Could not perform equidistancing below the requested threshold for {key_subcategory}')

            print(f'Finished with all {key_category} subcategories')

        self.normalize_vectors()
        self.vectors = np.array(self.vectors)
        end = time.time();
        print(f'Equidistancing done in {round(end-start, 3)}s.')

    def soft_weat(self, sets, target_at_dict, bias_combinations, l=1, nullspace_iterations = -1, neighb_count=20):
        '''
        SoftWEAT Debiasing

        Parameters
        ----------
        sets: Existing attribute and target set of words
        target_at_dict: dict | Keys being target/subclass sets and values corresponding attribute sets for debiasing
        bias_combinations: dict | Classes and respective subclasses to be included in debiasing, by default in following form : {"gender" : ["male_terms", "female_terms"],   "race": ["black_names", "white_names"], "religion" : ["islam_words", "atheism_words", "christianity_words"]}
        l: float | Trade-off parameter (1 - Highest level of removal, 0 - lowest)
        nullspace_iterations: int | Number of nullspaces to be included in iterative bias minimization. If -1, all are taken into consideration
        neighb_count: int | Number of neighbors that initial target/subclass lists are expanded on
        '''

        nullspace_dict, neighbors, subclasses, duplicates = {}, {}, [], {} 
        target_words_complete = list(dict.fromkeys([word for class_name in target_at_dict for word in sets[class_name]]))
        substopwords = set(stopwords.words('english')) - set(target_words_complete)
        attribute_sets_complete = set([word for subclass_name in target_at_dict for a_set in target_at_dict[subclass_name]for word in sets[a_set]])

        cs_matrix = cs([self.get_value(word) for word in target_words_complete], self.vectors)
        cs_idx = {word:idx for idx, word in enumerate(target_words_complete)}
        dictionary_categories = {"gender" : ["male_terms", "female_terms"],   "race": ["black_names", "white_names"], "religion" : ["islam_words", "atheism_words", "christianity_words"]}

        #Generating neighbors
        for class_name in dictionary_categories.keys():
            for subclass_name in dictionary_categories[class_name]:

                if subclass_name not in target_at_dict: continue
                
                subclasses.append(subclass_name)
                neighbors[subclass_name] = set(sets[subclass_name])
                for word in sets[subclass_name]:
                    new_words = set([self.words[neighbor_idx] for neighbor_idx in (cs_matrix[cs_idx[word],:].argsort()[-neighb_count:]) if cs([self.vectors[neighbor_idx]], [self.get_value(word)]) > 0.6])
                    new_words -= set([word for class_name, subclasses in dictionary_categories.items() for subcl in subclasses for word in sets[subcl] if subcl!=subclass_name])
                    neighbors[subclass_name] = neighbors[subclass_name].union(new_words)

                    #Identifying duplicates
                    for word in new_words:
                        for subclass in subclasses:
                            if word in neighbors[subclass] and subclass!=subclass_name:
                                if word not in duplicates:
                                    duplicates[word] = set()
                                duplicates[word].add(subclass)
                                duplicates[word].add(subclass_name)
        
        #Removing duplicates
        for word_dup in duplicates:
            word_dup_subclasses = list(duplicates[word_dup])
            cs_subclasses = [cs([np.sum([self.get_value(w) for w in sets[subclass_name]], axis = 0)], [self.get_value(word_dup)]) for subclass_name in word_dup_subclasses]
            for sc in [x for i,x in enumerate(word_dup_subclasses) if i!=np.argmax(cs_subclasses)]:
                neighbors[sc].remove(word_dup)

        for class_name in dictionary_categories.keys():

            #Iterating through target set that might contain bias towards some attribute sets of words 
            for subclass_name in dictionary_categories[class_name]:

                if subclass_name not in target_at_dict.keys(): 
                    continue 

                attribute_set_names = list(target_at_dict[subclass_name]); attribute_set_names.sort()
                all_words_for_subclass = list( neighbors[subclass_name] - attribute_sets_complete - substopwords)
                mean_value = np.mean([self.get_value(word) for word in all_words_for_subclass], axis=0)
                vectors_for_nullspacing = np.array([self.get_wordset_mean(sets[a_set]) for a_set in attribute_set_names])
                null_space = ns(vectors_for_nullspacing)

                bias_levels_per_nullspace = []
                #print(f'Words for subclass {subclass_name} len: {(all_words_for_subclass)}')

                no_of_iterations = np.size(null_space, 1) if nullspace_iterations==-1 else nullspace_iterations
                for k in range(0, no_of_iterations):

                    e = cp(self)
                    nullspace_dict[subclass_name] = null_space[:,k] 
                    T = nullspace_dict[subclass_name] -  mean_value
                    T = make_translation_matrix(T, l)
                    vectors_for_translation = np.vstack([np.transpose([e.get_value(word) for word in all_words_for_subclass]), np.ones((1, len(all_words_for_subclass)))])
                    transformed_points = np.matmul(T, vectors_for_translation)
                    for i, word in enumerate(all_words_for_subclass):  
                        e.vectors[e.get_index_out_of_word(word)] = transformed_points[0:-1,i]
                    _, bias_levels_d, _, _, _ = weat_analysis(e, bias_combinations, sets, steps=1000)
                    bias_levels_per_nullspace.append(bias_levels_d[class_name])
                    del e

                min_nullspace_key = np.argmin(bias_levels_per_nullspace)
                final_t_vector = null_space[:,min_nullspace_key] - mean_value
                T_final = make_translation_matrix(final_t_vector, l)
                vectors_for_translation = np.vstack([np.transpose([self.get_value(word) for word in all_words_for_subclass]), np.ones((1, len(all_words_for_subclass)))])
                transformed_points = np.matmul(T_final, vectors_for_translation)

                for i, word in enumerate(all_words_for_subclass):  
                    self.vectors[self.get_index_out_of_word(word)] = transformed_points[0:-1,i]
                
                print(f'Subclass {subclass_name} finished.')
                
        self.normalize_vectors()

    def get_wordset_mean(self, set_of_words):
        """
        For Embedding and given set of words, find average vector

        Parameters
        ----------
        embedding: Embedding | Word Embedding instance
        set_of_words: list | words which will be averaged based on their vector position

        Returns
        -------
        set_mean: ndarray | Average value of all given word representations
        """
        matrix = [self.get_value(word) for word in set_of_words]
        return np.mean(matrix, axis=0)

    def reduce_dim_version_of_embeddings(self, dimension=3):
        '''
        Reduce dimensionality of embeddings via PCA

        Parameters
        ----------
        dimension: dimensionality to which embeddings will be reduced
        '''
        self.vectors = PCA(n_components=dimension).fit_transform(self.vectors)

    def save_embedding(self, filename, pkl_format=False):
        '''
        Save Embedding in pkl/txt format

        Parameters
        ----------
        filename: String | directory, filename to which output will be generated
        pkl_format: bool | Determining whether embeddings will be saves in .txt or .pkl format
        '''
        if(pkl_format==True):
            print(f'entered pkl method for file: {filename}')
            output = open(f'{filename}.pkl', 'wb')
            pickle.dump(self, output)
            output.close()
        else:
            print(f'entered txt method for file: {filename}')
            with open(filename, 'w') as file:
                for i in range(0, len(self.vectors)):
                    vector_string = f'{self.words[i]} ' + ' '.join([str(np.float16(x)) for x in self.vectors[i]])
                    if (i != len(self.vectors)-1):
                        file.write(f'{vector_string}\n')
                    else:
                        file.write(f'{vector_string}')
                print('Done.')
            file.close()

In [1]:
def get_sent_analysis_sets():

    """
    Generate structures for Sentiment Analysis task

    Returns
    -------
    dictionary_categories: dict | Bias class as keys and respective subclasses as values
    parameters_dict: dict | Input model information
    target_sets_dict: dict | Dictionary containing existing words for each Word Embedding model within a subclass pair
    """

    dictionary_categories = {"gender": ["male_terms", "female_terms"], "race": ["black_names", "white_names"],
                             "religion": ["islam_words", "atheism_words", "christianity_words"]}

    parameters_dict = \
        {
            "NB_WORDS": 124252,
            "MAX_LEN": 50,
            "NO_PRETR_DIM": 100,
            "EMB_DIM": 300
        }

    target_sets_dict = {
        'black_white':
            {
                'word2vec':
                    {
                        'first_set': ['theo', 'jerome', 'leroy', 'lamar', 'lionel', 'malik', 'ebony', 'jasmine', 'tia',
                                      'hakim', 'kareem', 'jamal', 'kenya'],
                        'second_set': ['adam', 'chip', 'harry', 'josh', 'roger', 'alan', 'frank', 'ian', 'justin',
                                       'ryan', 'andrew', 'fred', 'jack']
                    },
                'fasttext':
                    {
                        'first_set': ['alonzo', 'jamel', 'theo', 'alphonse', 'jerome', 'leroy', 'torrance', 'darnell',
                                      'lamar', 'lionel', 'tyree', 'lamont', 'malik', 'terrence', 'tyrone', 'marcellus',
                                      'ebony', 'jasmine', 'tanisha', 'tia', 'latoya', 'yolanda', 'malika', 'yvette',
                                      'hakim', 'jermaine', 'kareem', 'jamal', 'aisha', 'keisha', 'kenya'],
                        'second_set': ['adam', 'chip', 'harry', 'josh', 'roger', 'alan', 'frank', 'ian', 'justin',
                                       'ryan', 'andrew', 'fred', 'jack', 'matthew', 'stephen', 'brad', 'jed', 'todd',
                                       'brandon', 'hank', 'jonathan', 'peter', 'wilbur', 'amanda', 'courtney',
                                       'heather', 'melanie', 'sara', 'amber', 'crystal', 'katie', 'meredith', 'shannon',
                                       'betsy']
                    },
                'glove':
                    {
                        'first_set': ['alonzo', 'jamel', 'theo', 'alphonse', 'jerome', 'leroy', 'torrance', 'darnell',
                                      'lamar', 'lionel', 'tyree', 'lamont', 'malik', 'terrence', 'tyrone', 'marcellus',
                                      'ebony', 'jasmine', 'tanisha', 'tia', 'latoya', 'yolanda', 'malika', 'yvette',
                                      'hakim', 'jermaine', 'kareem', 'jamal', 'aisha', 'keisha', 'kenya'],
                        'second_set': ['adam', 'chip', 'harry', 'josh', 'roger', 'alan', 'frank', 'ian', 'justin',
                                       'ryan', 'andrew', 'fred', 'jack', 'matthew', 'stephen', 'brad', 'jed', 'todd',
                                       'brandon', 'hank', 'jonathan', 'peter', 'wilbur', 'amanda', 'courtney',
                                       'heather', 'melanie', 'sara', 'amber', 'crystal', 'katie', 'meredith', 'shannon',
                                       'betsy', 'kristin', 'nancy', 'stephanie', 'ellen', 'lauren', 'peggy', 'colleen']
                    }
            },
        'male_female':
            {
                'glove':
                    {'first_set': ['male', 'man', 'boy', 'brother', 'him', 'son', 'father', 'uncle', 'grandfather'],
                     'second_set': ['female', 'woman', 'girl', 'sister', 'her', 'daughter', 'mother', 'aunt',
                                    'grandmother']},
                'word2vec':
                    {'first_set': ['male', 'man', 'boy', 'brother', 'him', 'son', 'father', 'uncle', 'grandfather'],
                     'second_set': ['female', 'woman', 'girl', 'sister', 'her', 'daughter', 'mother', 'aunt',
                                    'grandmother']},
                'fasttext':
                    {'first_set': ['male', 'man', 'boy', 'brother', 'him', 'son', 'father', 'uncle', 'grandfather'],
                     'second_set': ['female', 'woman', 'girl', 'sister', 'her', 'daughter', 'mother', 'aunt',
                                    'grandmother']}
            },
        'islam_christianity':
            {
                "fasttext":
                    {"first_set": ["allah", "ramadan", "emir", "salaam", "koran", "imam", "sultan", "prophet", "veil",
                                   "ayatollah", "mosque"],
                     "second_set": ["baptism", "messiah", "catholicism", "resurrection", "christianity", "salvation",
                                    "protestant", "gospel", "trinity", "jesus", "christ", "christian", "cross",
                                    "catholic"]
                     },
                "glove":
                    {"first_set": ["allah", "ramadan", "emir", "salaam", "koran", "imam", "sultan", "prophet", "veil",
                                   "ayatollah", "mosque", "islam"],
                     "second_set": ["baptism", "messiah", "catholicism", "resurrection", "christianity", "salvation",
                                    "protestant", "gospel", "trinity", "jesus", "christ", "christian", "cross",
                                    "catholic", "church"]
                     },
                "word2vec": {
                    "first_set": ["allah", "ramadan", "emir", "salaam", "koran", "imam", "sultan", "prophet", "veil",
                                  "ayatollah"],
                    "second_set": ["baptism", "messiah", "catholicism", "resurrection", "christianity", "salvation",
                                   "protestant", "gospel", "trinity", "jesus", "christ", "christian", "cross"]}
            }
    }

    return dictionary_categories, parameters_dict, target_sets_dict

In [33]:
def filter_emb(target_sets, word_index):
    """
    Filtering opposite target sets to be equal size and using only ones in embedding

    Parameters
    ----------
    target_sets: dict | target_sets_dict from sets.get_sent_analysis_sets() method
    word_index: dict | Word as a key and index as value

    Returns
    -------
    filtered_word_index: dict | Two opposing sets of words
    """
    lengths = [0, 0]
    for i, key_idx in enumerate(list(target_sets.keys())):
        target_sets[key_idx] = [word for word in target_sets[key_idx] if word in word_index]
        lengths[i] = len(target_sets[key_idx])

    return {key_idx: word_list[0:min(lengths)] for key_idx, word_list in target_sets.items()}

In [38]:
def null_intersection(x_train, y_train, target_words, word_index):
    """
    Eliminating training set that contains target words

    Parameters
    ----------
    x_train: ndarray | Training data
    y_train: ndarray | Label data
    target_words: list | List of target words to exclude from learning procedure
    word_index: dict | Word to index mapping

    Returns
    -------
    x_train_new: ndarray | Training data
    y_train_new: ndarray | Label data
    """
    x_train_new, y_train_new = [], []

    for i, x_t in enumerate(x_train):
        intersection = len(set(x_t).intersection([word_index[word] for word in target_words]))
        if intersection == 0:
            x_train_new.append(x_t)
            y_train_new.append(y_train[i])

    return x_train_new, y_train_new

In [59]:
def compare_embeddings(datasets, embeddings, params, number_of_models = 6):

    '''
    Comparing different levels of debiasing based on Sentiment Analysis task

    Parameters
    ----------
    datasets: dict | From utils.get_dataset_and_dicts() method
    embeddings: dict | Keys as embedding types (original, hardweat, softweat) and Embedding instances as values
    params: dict | From sets.get_sent_analysis_sets() method
    number_of_models: int | Number of different models from which polarity score will be generated
    '''

    keys = list(datasets['targets_sets'].keys())
    emb_mod = {'No sentence modified': (datasets['x_test_padded'], datasets['y_test']),
             'First set modification': change_the_sentences(datasets, keys[0]),
             'Second set modification': change_the_sentences(datasets, keys[1])}
    results = {'original': [], 'hardweat': [], 'softweat': []}
    x_train_fit, y_train_fit = datasets['x_train_padded'], datasets['y_train']

    for emb_type, e in embeddings.items():

        emb_matrix = np.zeros((params['NB_WORDS'], params['EMB_DIM']))
        set_words = set(e.words)
        for word, i in datasets['word_2_index'].items():
            if word not in set_words or i >= params['NB_WORDS']: continue
            else: emb_matrix[i] = e.get_value(word);

        for j in range(0, 6):

            keras_model = Keras_Model(params)
            keras_model.model.layers[0].set_weights([emb_matrix])
            keras_model.model.layers[0].trainable = keras_model.model.layers[1].trainable = False
            keras_model.model.compile(loss = 'binary_crossentropy', optimizer='adadelta',metrics = ['accuracy']) 
            keras_model.model.fit(x_train_fit, y_train_fit, epochs=8, verbose=0)
                
            polarity_scores = {'No sentence modified':[], 'First set modification':[], 'Second set modification':[]}
            cnf_scores = {'No sentence modified':[], 'First set modification':[], 'Second set modification':[]}

            for i, (key, (x_t, y_t)) in enumerate(emb_mod.items()):

                prediction_results = keras_model.model.predict(x_t) 
                polarity_scores[key] = np.array(prediction_results)[:,0]
                prediction_results = [1 if output_instance>0.5 else 0 for output_instance in prediction_results]
                correct_results = y_t
                cnf_scores[key].append(metrics.confusion_matrix(correct_results, prediction_results))

            final_polarity = [polarity_scores['First set modification'][i]-polarity_scores['Second set modification'][i] for i in range(0, len(polarity_scores['Second set modification']))]
            results[emb_type].append((final_polarity, cnf_scores))
            print(f'F1 score for {emb_type} embedding, {j+1}.model: {round(metrics.f1_score(correct_results, prediction_results),2)}')

        print('________________________________________________________________')

    return results

In [2]:
dictionary_categories, parameters_dict, target_sets_dict = get_sent_analysis_sets()

In [3]:
target_pairs = ['islam_christianity', 'male_female', 'black_white']
target_pair = target_pairs[0]

In [9]:
emb_name='word2vec'

In [10]:
target_sets_dict[target_pair][emb_name]

{'first_set': ['allah',
  'ramadan',
  'emir',
  'salaam',
  'koran',
  'imam',
  'sultan',
  'prophet',
  'veil',
  'ayatollah'],
 'second_set': ['baptism',
  'messiah',
  'catholicism',
  'resurrection',
  'christianity',
  'salvation',
  'protestant',
  'gospel',
  'trinity',
  'jesus',
  'christ',
  'christian',
  'cross']}

In [11]:
parameters_dict

{'NB_WORDS': 124252, 'MAX_LEN': 50, 'NO_PRETR_DIM': 100, 'EMB_DIM': 300}

In [None]:
datasets_and_dicts = get_dataset_and_dicts( target_sets_dict[target_pair][emb_name], parameters_dict)

In [35]:
targets_sets=target_sets_dict[target_pair][emb_name]
params=parameters_dict

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [22]:
from keras_preprocessing.text import Tokenizer

In [28]:
from sklearn.preprocessing import LabelEncoder

In [42]:
from keras_preprocessing.sequence import pad_sequences

In [23]:
df = pd.read_csv('data/IMDB_Dataset.csv')
df = df.reindex(np.random.permutation(df.index))
X_train, X_test, y_train, y_test = train_test_split(df.review, df.sentiment, shuffle=False)

In [24]:
tk = Tokenizer(num_words=params["NB_WORDS"])

In [25]:
tk.fit_on_texts(df.review)

In [26]:
x_train = tk.texts_to_sequences(X_train)
x_test = tk.texts_to_sequences(X_test)

In [29]:
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [36]:
# Filtering input to satisfy learning constraints
targets_sets = filter_emb(targets_sets, tk.word_index)

In [39]:
x_train, y_train = null_intersection(x_train, y_train,
                                         set([word for set_name in targets_sets for word in targets_sets[set_name]]),
                                         tk.word_index)

In [40]:
x_train_copy, x_test_copy, y_train_copy, y_test_copy = [], [], [], []

In [41]:
for i in range(0, len(x_train)):
    if len(x_train[i]) < params['MAX_LEN']:
        x_train_copy.append(x_train[i])
        y_train_copy.append(y_train[i])

for i in range(0, len(x_test)):
    if len(x_test[i]) < params['MAX_LEN']:
        x_test_copy.append(x_test[i])
        y_test_copy.append(y_test[i])

In [43]:
x_train_padded = pad_sequences(x_train_copy, maxlen=params['MAX_LEN'], padding='post')
x_test_padded = pad_sequences(x_test_copy, maxlen=params['MAX_LEN'], padding='post')

In [45]:
print(f'Training shape: {x_train_padded.shape}\nTest shape: {x_test_padded.shape}')

Training shape: (827, 50)
Test shape: (283, 50)


In [48]:
datasets = {'x_train': np.array(x_train_copy), 'x_test': np.array(x_test_copy),
            'x_train_padded': np.array(x_train_padded), 'x_test_padded': np.array(x_test_padded),
            'y_train': np.array(y_train_copy), 'y_test': np.array(y_test_copy),
            'word_2_index': tk.word_index, 'index_2_word': tk.index_word, 'targets_sets': targets_sets}

In [54]:
embedding = Word_Embedding('glove-twitter-25d-10000f')

Regular embedding successfully read. Shape: (10000, 25)


In [57]:
embedding2=Word_Embedding('glove-twitter-25d-10000f')

Regular embedding successfully read. Shape: (10000, 25)


In [58]:
embeddings_dict = {'original': embedding, 'hardweat': embedding2, 'softweat': embedding2}

In [61]:
datasets_and_dicts=datasets

In [62]:
results = compare_embeddings(datasets_and_dicts, embeddings_dict, parameters_dict)

Shape of modified test input: (100, 50)
Shape of modified test input: (100, 50)
Word the not in the Embedding.
Word and not in the Embedding.
Word of not in the Embedding.
Word to not in the Embedding.
Word is not in the Embedding.
Word br not in the Embedding.
Word in not in the Embedding.
Word it not in the Embedding.
Word this not in the Embedding.
Word that not in the Embedding.
Word was not in the Embedding.
Word as not in the Embedding.
Word for not in the Embedding.
Word with not in the Embedding.
Word movie not in the Embedding.
Word but not in the Embedding.
Word film not in the Embedding.
Word on not in the Embedding.
Word not not in the Embedding.
Word you not in the Embedding.
Word are not in the Embedding.
Word his not in the Embedding.
Word have not in the Embedding.
Word be not in the Embedding.
Word one not in the Embedding.
Word he not in the Embedding.
Word all not in the Embedding.
Word at not in the Embedding.
Word by not in the Embedding.
Word an not in the Embeddi

Word naturally not in the Embedding.
Word saving not in the Embedding.
Word faith not in the Embedding.
Word bright not in the Embedding.
Word national not in the Embedding.
Word bob not in the Embedding.
Word aware not in the Embedding.
Word kick not in the Embedding.
Word broken not in the Embedding.
Word loss not in the Embedding.
Word mixed not in the Embedding.
Word bigger not in the Embedding.
Word dealing not in the Embedding.
Word morning not in the Embedding.
Word fail not in the Embedding.
Word cuts not in the Embedding.
Word spanish not in the Embedding.
Word prove not in the Embedding.
Word tape not in the Embedding.
Word drunk not in the Embedding.
Word forces not in the Embedding.
Word robin not in the Embedding.
Word eat not in the Embedding.
Word gotten not in the Embedding.
Word sean not in the Embedding.
Word media not in the Embedding.
Word suit not in the Embedding.
Word finest not in the Embedding.
Word advice not in the Embedding.
Word gary not in the Embedding.
W

Word weather not in the Embedding.
Word refuse not in the Embedding.
Word limit not in the Embedding.
Word gloria not in the Embedding.
Word rises not in the Embedding.
Word lauren not in the Embedding.
Word attend not in the Embedding.
Word paradise not in the Embedding.
Word brady not in the Embedding.
Word nails not in the Embedding.
Word poker not in the Embedding.
Word option not in the Embedding.
Word btw not in the Embedding.
Word freaks not in the Embedding.
Word ugh not in the Embedding.
Word deleted not in the Embedding.
Word basketball not in the Embedding.
Word steady not in the Embedding.
Word mann not in the Embedding.
Word bunny not in the Embedding.
Word supply not in the Embedding.
Word records not in the Embedding.
Word dates not in the Embedding.
Word labor not in the Embedding.
Word iv not in the Embedding.
Word frozen not in the Embedding.
Word confident not in the Embedding.
Word saint not in the Embedding.
Word selection not in the Embedding.
Word championship no

Word vid not in the Embedding.
Word starbucks not in the Embedding.
Word buenos not in the Embedding.
Word oct not in the Embedding.
Word vive not in the Embedding.
Word und not in the Embedding.
Word mums not in the Embedding.
Word nin not in the Embedding.
Word cyrus not in the Embedding.
Word nuit not in the Embedding.
Word zac not in the Embedding.
Word obama not in the Embedding.
Word gf not in the Embedding.
Word gm not in the Embedding.
Word abre not in the Embedding.
Word spurs not in the Embedding.
Word thanx not in the Embedding.
Word foods not in the Embedding.
Word nb not in the Embedding.
Word cho not in the Embedding.
Word dubai not in the Embedding.
Word hep not in the Embedding.
Word devo not in the Embedding.
Word dat not in the Embedding.
Word aye not in the Embedding.
Word bong not in the Embedding.
Word dec not in the Embedding.
Word meow not in the Embedding.
Word wah not in the Embedding.
Word mora not in the Embedding.
Word iam not in the Embedding.
Word ist not 

NameError: name 'Keras_Model' is not defined