In [4]:
# File will include the working.
# Made by Sarvesh Bhatnagar
# dontpatronizeme
from dont_patronize_me import DontPatronizeMe

# Feature
import feature.basicFeatures as bf
import feature.makeWordVector as mwv

# Preprocessing
import preprocessing.basicPreProcessing as bp

# Model
import models.deepModel as dm

# Misc for model training.
from tensorflow import keras
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
import contractions


# Scores
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score


def ready_data(X, y):
    X = np.array(X)
    X = X.reshape(-1, 1)
    x_rus = X
    y_rus = y
    x_rus = [item[0] for item in x_rus]
    x_rus = np.array(x_rus).astype(np.float32)
    return x_rus, y_rus


def contract_words(text):
    """
    Removes Contractations from text. i.e. are'nt -> are not
    """
    return contractions.fix(text)


def preprocess_text(text):
    """
    Should return a list of words
    """
    text = str(text)
    text = contract_words(text)
    text = text.lower()
    text = text.replace('"', "").replace(
        ",", "").replace("'", "")
    return text.split()



In [5]:

# Deep Learning Pipeline.
if __name__ == '__main__':
    # Load the data.
    dpm = DontPatronizeMe('dataset', 'dontpatronizeme_pcl.tsv')
    dpm.load_task1()
    data = dpm.train_task1_df
    process = bp.BasicPreProcessing()
    data['text_split'] = data['text'].apply(preprocess_text)

    # Train WordVectors. Only run once.
    # mwv.Word2VecModelTrainer(
    #     sentences=data['text_split'], path="dataword.wordvectors").train()

    # Load the trained word vectors.
    wv = mwv.Word2VecModelTrainer().load_trained("word2vec.wordvectors")

    # Make Embedding Columns for each text split.
    basic_features = bf.BasicFeatures()
    data['embeddings'] = data['text_split'].apply(
        basic_features.add_vectors, wv=wv)

    # NOTE NEW FEATURE
    data["text_feature"] = data['text_split'].apply(
        basic_features.get_text_feature)
    
    
    data["embeddings_feature"] = data['text_feature'].apply(
        basic_features.add_vectors_multiple, wv=wv)

    data["text_feature_v2"] = data['text_split'].apply(basic_features.get_text_feature, n=[1,5])

    data["embeddings_feature_v2"] = data['text_feature_v2'].apply(basic_features.add_vectors_multiple, wv=wv)

    data["text_feature_v3"] = data['text_split'].apply(basic_features.get_text_feature, n=[3,7])

    data["embeddings_feature_v3"] = data['text_feature_v3'].apply(basic_features.add_vectors_multiple, wv=wv)


    


In [6]:
(data["embeddings"][0].shape)

(100,)

In [7]:
def get_word_size_embeddings(text_list, size):
    """
    Returns word size embeddings
    """
    embeddings = np.zeros((size,))
    ind = len(text_list) if len(text_list) < size else size 
    for i in range(0, ind):
        embeddings[i] = len(text_list[i]) + 30
    return embeddings

In [8]:
data["word_size_embeddings"] = data['text_split'].apply(get_word_size_embeddings, size=100)

In [9]:
import nltk
from collections import Counter
def get_tags(text):
    tags_p = nltk.pos_tag(text)
    return [i[1] for i in tags_p]

def get_most_common_tags(text):
    tags_p = nltk.pos_tag(text)
    tags_p = [i[1] for i in tags_p]
    tags_p = list(Counter(tags_p).most_common(3))
    tags_p = [i[0] for i in tags_p]
    return tags_p

def get_most_common_words(text):
    stopwords = {"i", "the", "and", "or", "a", "an", "is", "are", "was", "were", "be", "been", "am", "me", "my"}
    text = [i for i in text if i not in stopwords]
    words = list(Counter(text).most_common(3))
    words = [i[0] for i in words]
    return words
data["tags_p"] = data["text_split"].apply(get_tags)

In [10]:
data["most_common_words"] = data["text_split"].apply(get_most_common_words)

In [11]:
data["most_common_tags"] = data["text_split"].apply(get_most_common_tags)

In [44]:
import feature.makeWordVector as mwv
wvec = mwv.Word2VecModelTrainer(sentences=data["tags_p"], path="pos_tags.wordvectors")
wvec.train(size=10)


(<gensim.models.word2vec.Word2Vec at 0x7fb1927ebb90>,
 <gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7fb1927ebb10>)

In [45]:
wvp = wvec.load_trained("pos_tags.wordvectors")

In [46]:
data["pos_embeddings"] = data["tags_p"].apply(basic_features.add_vectors, wv=wvp)

In [47]:
data["most_common_tags_embeddings"] = data["most_common_tags"].apply(basic_features.add_vectors, wv=wvp)

In [48]:
data["most_common_words_embeddings"] = data["most_common_words"].apply(basic_features.add_vectors, wv=wv)

In [49]:
wv.similar_by_vector(data["most_common_words_embeddings"][2])

[('house', 0.8690413236618042),
 ('station', 0.8608390688896179),
 ('spree', 0.8473489284515381),
 ('park', 0.838900089263916),
 ('phone', 0.8366880416870117),
 ('opened', 0.8346849083900452),
 ('lady', 0.8317989706993103),
 ('concert', 0.8310965299606323),
 ('frescura', 0.8289353847503662),
 ('white', 0.8263425230979919)]

In [50]:
def combine(x,y):
    z = []
    for i in range(len(x)):
        z.append(np.concatenate((x[i],y[i])))
    return z

In [75]:
ll = combine(data["embeddings_feature"],data["pos_embeddings"])
# ll = combine(ll,data["most_common_tags_embeddings"])
# ll = combine(ll,data["most_common_words_embeddings"])
# ll = combine(ll,data["embeddings_feature_v2"])
ll = combine(ll,data["word_size_embeddings"])
# ll = combine(ll,data["embeddings_feature_v3"])
# ll = combine(ll,data["embeddings"])

In [76]:
data["combined"] = ll

In [77]:
data["word_size_embeddings"][0].shape

(100,)

In [78]:
data["combined"][0].shape

(210,)

In [79]:
rus = RandomUnderSampler(random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    data['combined'], data['label'], stratify=data['label'], test_size=0.2, random_state=1)


In [80]:
data["combined"][3].shape

(210,)

In [81]:
nn_model = dm.NNModels(input_shape=data["combined"][0].shape,)
# TODO data["combined"][0].shape


rus = RandomOverSampler(random_state=42,sampling_strategy=1)
X_train = np.array(X_train)
X_train = X_train.reshape(-1, 1)
x_rus, y_rus = rus.fit_resample(X_train, y_train)
x_rus = [item[0] for item in x_rus]
x_rus = np.array(x_rus).astype(np.float32)

In [82]:
model = nn_model.create_baseline()

In [83]:
# model.compile(
#     optimizer=keras.optimizers.RMSprop(),  # Optimizer
#     # Loss function to minimize
#     loss=keras.losses.SparseCategoricalCrossentropy(),
#     # List of metrics to monitor
#     metrics=[keras.metrics.SparseCategoricalAccuracy()],
# )

model.compile(
    optimizer=keras.optimizers.RMSprop(),  # Optimizer
    # Loss function to minimize
    loss=keras.losses.SparseCategoricalCrossentropy(),
    # List of metrics to monitor
    metrics=[keras.metrics.SparseCategoricalAccuracy()],
)

In [84]:
X_test_n, y_test_n = ready_data(X_test, y_test)

In [85]:
history = model.fit(x_rus, y_rus, batch_size=64, epochs=250, validation_data=(X_test_n, y_test_n))

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78

In [86]:
# Prepare testing data.
X_test_n, y_test_n = ready_data(X_test, y_test)

predictions = model.predict(X_test_n)
predictions = [item.argmax() for item in predictions]
y_test_n = list(y_test_n)
print("Accuracy", accuracy_score(y_test_n, predictions))
print("Precision", precision_score(y_test_n, predictions, average=None))
print("Recall", recall_score(
    y_test_n, predictions, labels=[0, 1], average=None))

Accuracy 0.8414517669531996
Precision [0.93634841 0.28052805]
Recall [0.88496042 0.42713568]


In [None]:
# 250 epochs
# ll = combine(data["embeddings_feature"],data["pos_embeddings"])
# ll = combine(ll,data["word_size_embeddings"])
# pos vec size = 10
# Accuracy 0.8481375358166189
# Precision [0.93443526 0.28673835]
# Recall [0.89498681 0.40201005]

In [None]:
# 250 epochs
# ll = combine(data["embeddings_feature"],data["pos_embeddings"])
# ll = combine(ll,data["most_common_words_embeddings"])
# ll = combine(ll,data["word_size_embeddings"])
# pos vec size = 10
# Accuracy 0.8481375358166189
# Precision [0.93443526 0.28673835]
# Recall [0.89498681 0.40201005]

In [None]:

# 500 epochs
# ll = combine(data["embeddings_feature"],data["pos_embeddings"])
# ll = combine(ll,data["most_common_tags_embeddings"])
# ll = combine(ll,data["most_common_words_embeddings"])
# ll = combine(ll,data["embeddings_feature_v2"])
# Accuracy 0.8046800382043935
# Precision [0.94813028 0.25917431]
# Recall [0.82955145 0.5678392 ]

In [None]:
# get_text_feature : n=[3,7]
# ll = combine(data["embeddings_feature"],data["pos_embeddings"])
# ll = combine(ll,data["most_common_tags_embeddings"])
# ll = combine(ll,data["most_common_words_embeddings"])
# epochs = 250
# tag size = 50
# tags == words == 3
# Accuracy 0.8357211079274116
# Precision [0.9408755  0.28358209]
# Recall [0.87335092 0.47738693]

In [None]:
# embeddings feature gives high recall for both
# low precision for NPCL.
# ll = combine(data["embeddings_feature"],data["pos_embeddings"])
# ll = combine(ll,data["most_common_tags_embeddings"])
# ll = combine(ll,data["most_common_words_embeddings"])
# tags == words == 3 most common.
# tagsize = 100
# Accuracy 0.720152817574021
# Precision [0.96254417 0.21502209]
# Recall [0.71873351 0.73366834]

In [37]:
if __name__=="__main__":
    rus = RandomUnderSampler(random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(
        data['combined'], data['label'], stratify=data['label'], test_size=0.2, random_state=1)

    # X_train_t, X_test_t, y_train_t, y_test_t = train_test_split(
    #     data['embeddings'], data['label'], stratify=data['label'], test_size=0.2, random_state=1)
    # Initializing NNModel/Deep Learning Model.
    # by default ip 100,0 and op 2 i.e. 2 classes classification.
    nn_model = dm.NNModels()

    rus = RandomOverSampler(random_state=42)
    X_train = np.array(X_train)
    X_train = X_train.reshape(-1, 1)
    x_rus, y_rus = rus.fit_resample(X_train, y_train)
    x_rus = [item[0] for item in x_rus]
    x_rus = np.array(x_rus).astype(np.float32)

    model = nn_model.dl_0()
    model.compile(
        optimizer=keras.optimizers.RMSprop(),  # Optimizer
        # Loss function to minimize
        loss=keras.losses.SparseCategoricalCrossentropy(),
        # List of metrics to monitor
        metrics=[keras.metrics.SparseCategoricalAccuracy()],
    )
    # model = nn_model.dl_0_compile(model)

    # Train the model.
    print("Training the model...")
    history = model.fit(x_rus, y_rus, batch_size=64, epochs=150)

    # Prepare testing data.
    X_test, y_test = ready_data(X_test, y_test)

    predictions = model.predict(X_test)
    predictions = [item.argmax() for item in predictions]
    y_test = list(y_test)
    print("Accuracy", accuracy_score(y_test, predictions))
    print("Precision", precision_score(y_test, predictions, average=None))
    print("Recall", recall_score(
        y_test, predictions, labels=[0, 1], average=None))

2022-01-07 19:50:32.524806: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-01-07 19:50:32.619439: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fd9ac0747d0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2022-01-07 19:50:32.619460: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version


Training the model...
Epoch 1/150


ValueError: in user code:

    /Users/sarvesh/.local/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:806 train_function  *
        return step_function(self, iterator)
    /Users/sarvesh/.local/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:796 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /Users/sarvesh/.local/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:1211 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /Users/sarvesh/.local/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:2585 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /Users/sarvesh/.local/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:2945 _call_for_each_replica
        return fn(*args, **kwargs)
    /Users/sarvesh/.local/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:789 run_step  **
        outputs = model.train_step(data)
    /Users/sarvesh/.local/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:747 train_step
        y_pred = self(x, training=True)
    /Users/sarvesh/.local/lib/python3.7/site-packages/tensorflow/python/keras/engine/base_layer.py:985 __call__
        outputs = call_fn(inputs, *args, **kwargs)
    /Users/sarvesh/.local/lib/python3.7/site-packages/tensorflow/python/keras/engine/functional.py:386 call
        inputs, training=training, mask=mask)
    /Users/sarvesh/.local/lib/python3.7/site-packages/tensorflow/python/keras/engine/functional.py:508 _run_internal_graph
        outputs = node.layer(*args, **kwargs)
    /Users/sarvesh/.local/lib/python3.7/site-packages/tensorflow/python/keras/engine/base_layer.py:976 __call__
        self.name)
    /Users/sarvesh/.local/lib/python3.7/site-packages/tensorflow/python/keras/engine/input_spec.py:216 assert_input_compatibility
        ' but received input with shape ' + str(shape))

    ValueError: Input 0 of layer dense_1 is incompatible with the layer: expected axis -1 of input shape to have value 100 but received input with shape [None, 200]
