In [1]:
# File will include the working.
# Made by Sarvesh Bhatnagar
# dontpatronizeme
from dont_patronize_me import DontPatronizeMe

# Feature
import feature.basicFeatures as bf
import feature.makeWordVector as mwv

# Preprocessing
import preprocessing.basicPreProcessing as bp

# Model
import models.deepModel as dm

# Misc for model training.
from tensorflow import keras
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
import contractions


# Scores
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score


def ready_data(X, y):
    X = np.array(X)
    X = X.reshape(-1, 1)
    x_rus = X
    y_rus = y
    x_rus = [item[0] for item in x_rus]
    x_rus = np.array(x_rus).astype(np.float32)
    return x_rus, y_rus


def contract_words(text):
    """
    Removes Contractations from text. i.e. are'nt -> are not
    """
    return contractions.fix(text)


def preprocess_text(text):
    """
    Should return a list of words
    """
    text = str(text)
    text = contract_words(text)
    text = text.lower()
    text = text.replace('"', "").replace(
        ",", "").replace("'", "")
    return text.split()



In [28]:

# Deep Learning Pipeline.
if __name__ == '__main__':
    # Load the data.
    dpm = DontPatronizeMe('dataset', 'dontpatronizeme_pcl.tsv')
    dpm.load_task1()
    data = dpm.train_task1_df
    process = bp.BasicPreProcessing()
    data['text_split'] = data['text'].apply(preprocess_text)

    # Train WordVectors. Only run once.
    # mwv.Word2VecModelTrainer(
    #     sentences=data['text_split'], path="dataword.wordvectors").train()

    # Load the trained word vectors.
    wv = mwv.Word2VecModelTrainer().load_trained("word2vec.wordvectors")

    # Make Embedding Columns for each text split.
    basic_features = bf.BasicFeatures()
    data['embeddings'] = data['text_split'].apply(
        basic_features.add_vectors, wv=wv)

    # NOTE NEW FEATURE
    data["text_feature"] = data['text_split'].apply(
        basic_features.get_text_feature)
    
    
    data["embeddings_feature"] = data['text_feature'].apply(
        basic_features.add_vectors_multiple, wv=wv)

    data["text_feature_v2"] = data['text_split'].apply(basic_features.get_text_feature, n=[1,5])

    data["embeddings_feature_v2"] = data['text_feature_v2'].apply(basic_features.add_vectors_multiple, wv=wv)

    data["text_feature_v3"] = data['text_split'].apply(basic_features.get_text_feature, n=[6,8])

    data["embeddings_feature_v3"] = data['text_feature_v3'].apply(basic_features.add_vectors_multiple, wv=wv)


    


In [29]:
(data["embeddings"][0].shape)

(100,)

In [30]:
import nltk
from collections import Counter
def get_tags(text):
    tags_p = nltk.pos_tag(text)
    return [i[1] for i in tags_p]

def get_most_common_tags(text):
    tags_p = nltk.pos_tag(text)
    tags_p = [i[1] for i in tags_p]
    tags_p = list(Counter(tags_p).most_common(3))
    tags_p = [i[0] for i in tags_p]
    return tags_p

def get_most_common_words(text):
    stopwords = {"i", "the", "and", "or", "a", "an", "is", "are", "was", "were", "be", "been", "am", "me", "my"}
    text = [i for i in text if i not in stopwords]
    words = list(Counter(text).most_common(3))
    words = [i[0] for i in words]
    return words
data["tags_p"] = data["text_split"].apply(get_tags)

In [31]:
data["most_common_words"] = data["text_split"].apply(get_most_common_words)

In [32]:
data["most_common_tags"] = data["text_split"].apply(get_most_common_tags)

In [33]:
import feature.makeWordVector as mwv
wvec = mwv.Word2VecModelTrainer(sentences=data["tags_p"], path="pos_tags.wordvectors")
wvec.train(size=50)


(<gensim.models.word2vec.Word2Vec at 0x7f7af42a6790>,
 <gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7f7af42a6850>)

In [34]:
wvp = wvec.load_trained("pos_tags.wordvectors")

In [35]:
data["pos_embeddings"] = data["tags_p"].apply(basic_features.add_vectors, wv=wvp)

In [36]:
data["most_common_tags_embeddings"] = data["most_common_tags"].apply(basic_features.add_vectors, wv=wvp)

In [37]:
data["most_common_words_embeddings"] = data["most_common_words"].apply(basic_features.add_vectors, wv=wv)

In [38]:
wv.similar_by_vector(data["most_common_words_embeddings"][2])

[('house', 0.8690413236618042),
 ('station', 0.8608390688896179),
 ('spree', 0.8473489284515381),
 ('park', 0.838900089263916),
 ('phone', 0.8366880416870117),
 ('opened', 0.8346849083900452),
 ('lady', 0.8317989706993103),
 ('concert', 0.8310965299606323),
 ('frescura', 0.8289353847503662),
 ('white', 0.8263425230979919)]

In [39]:
def combine(x,y):
    z = []
    for i in range(len(x)):
        z.append(np.concatenate((x[i],y[i])))
    return z

In [40]:
ll = combine(data["embeddings_feature"],data["pos_embeddings"])
ll = combine(ll,data["most_common_tags_embeddings"])
ll = combine(ll,data["most_common_words_embeddings"])
ll = combine(ll,data["embeddings_feature_v2"])
ll = combine(ll,data["embeddings_feature_v3"])
# ll = combine(ll,data["embeddings"])

In [41]:
data["combined"] = ll

In [42]:
data["embeddings"][0].shape

(100,)

In [43]:
data["combined"][0].shape

(400,)

In [44]:
rus = RandomUnderSampler(random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    data['combined'], data['label'], stratify=data['label'], test_size=0.2, random_state=1)


In [45]:
data["combined"][0].shape

(400,)

In [46]:
nn_model = dm.NNModels(input_shape=data["combined"][0].shape,)
# TODO data["combined"][0].shape


rus = RandomOverSampler(random_state=42,sampling_strategy=1)
X_train = np.array(X_train)
X_train = X_train.reshape(-1, 1)
x_rus, y_rus = rus.fit_resample(X_train, y_train)
x_rus = [item[0] for item in x_rus]
x_rus = np.array(x_rus).astype(np.float32)

In [75]:
model = nn_model.create_baseline()

In [76]:
model.compile(
    optimizer=keras.optimizers.RMSprop(),  # Optimizer
    # Loss function to minimize
    loss=keras.losses.SparseCategoricalCrossentropy(),
    # List of metrics to monitor
    metrics=[keras.metrics.SparseCategoricalAccuracy()],
)

In [77]:
X_test_n, y_test_n = ready_data(X_test, y_test)

In [78]:
history = model.fit(x_rus, y_rus, batch_size=64, epochs=500, validation_data=(X_test_n, y_test_n))

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

In [79]:
# Prepare testing data.
X_test_n, y_test_n = ready_data(X_test, y_test)

predictions = model.predict(X_test_n)
predictions = [item.argmax() for item in predictions]
y_test_n = list(y_test_n)
print("Accuracy", accuracy_score(y_test_n, predictions))
print("Precision", precision_score(y_test_n, predictions, average=None))
print("Recall", recall_score(
    y_test_n, predictions, labels=[0, 1], average=None))

Accuracy 0.7903533906399236
Precision [0.9516129  0.25103734]
Recall [0.80949868 0.6080402 ]


In [None]:

# 500 epochs
# 
# Accuracy 0.7903533906399236
# Precision [0.9516129  0.25103734]
# Recall [0.80949868 0.6080402 ]

In [None]:
# get_text_feature : n=[3,7]
# ll = combine(data["embeddings_feature"],data["pos_embeddings"])
# ll = combine(ll,data["most_common_tags_embeddings"])
# ll = combine(ll,data["most_common_words_embeddings"])
# epochs = 250
# tag size = 50
# tags == words == 3
# Accuracy 0.8357211079274116
# Precision [0.9408755  0.28358209]
# Recall [0.87335092 0.47738693]

In [None]:
# embeddings feature gives high recall for both
# low precision for NPCL.
# ll = combine(data["embeddings_feature"],data["pos_embeddings"])
# ll = combine(ll,data["most_common_tags_embeddings"])
# ll = combine(ll,data["most_common_words_embeddings"])
# tags == words == 3 most common.
# tagsize = 100
# Accuracy 0.720152817574021
# Precision [0.96254417 0.21502209]
# Recall [0.71873351 0.73366834]

In [37]:
if __name__=="__main__":
    rus = RandomUnderSampler(random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(
        data['combined'], data['label'], stratify=data['label'], test_size=0.2, random_state=1)

    # X_train_t, X_test_t, y_train_t, y_test_t = train_test_split(
    #     data['embeddings'], data['label'], stratify=data['label'], test_size=0.2, random_state=1)
    # Initializing NNModel/Deep Learning Model.
    # by default ip 100,0 and op 2 i.e. 2 classes classification.
    nn_model = dm.NNModels()

    rus = RandomOverSampler(random_state=42)
    X_train = np.array(X_train)
    X_train = X_train.reshape(-1, 1)
    x_rus, y_rus = rus.fit_resample(X_train, y_train)
    x_rus = [item[0] for item in x_rus]
    x_rus = np.array(x_rus).astype(np.float32)

    model = nn_model.dl_0()
    model.compile(
        optimizer=keras.optimizers.RMSprop(),  # Optimizer
        # Loss function to minimize
        loss=keras.losses.SparseCategoricalCrossentropy(),
        # List of metrics to monitor
        metrics=[keras.metrics.SparseCategoricalAccuracy()],
    )
    # model = nn_model.dl_0_compile(model)

    # Train the model.
    print("Training the model...")
    history = model.fit(x_rus, y_rus, batch_size=64, epochs=150)

    # Prepare testing data.
    X_test, y_test = ready_data(X_test, y_test)

    predictions = model.predict(X_test)
    predictions = [item.argmax() for item in predictions]
    y_test = list(y_test)
    print("Accuracy", accuracy_score(y_test, predictions))
    print("Precision", precision_score(y_test, predictions, average=None))
    print("Recall", recall_score(
        y_test, predictions, labels=[0, 1], average=None))

2022-01-07 19:50:32.524806: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-01-07 19:50:32.619439: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fd9ac0747d0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2022-01-07 19:50:32.619460: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version


Training the model...
Epoch 1/150


ValueError: in user code:

    /Users/sarvesh/.local/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:806 train_function  *
        return step_function(self, iterator)
    /Users/sarvesh/.local/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:796 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /Users/sarvesh/.local/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:1211 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /Users/sarvesh/.local/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:2585 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /Users/sarvesh/.local/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:2945 _call_for_each_replica
        return fn(*args, **kwargs)
    /Users/sarvesh/.local/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:789 run_step  **
        outputs = model.train_step(data)
    /Users/sarvesh/.local/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:747 train_step
        y_pred = self(x, training=True)
    /Users/sarvesh/.local/lib/python3.7/site-packages/tensorflow/python/keras/engine/base_layer.py:985 __call__
        outputs = call_fn(inputs, *args, **kwargs)
    /Users/sarvesh/.local/lib/python3.7/site-packages/tensorflow/python/keras/engine/functional.py:386 call
        inputs, training=training, mask=mask)
    /Users/sarvesh/.local/lib/python3.7/site-packages/tensorflow/python/keras/engine/functional.py:508 _run_internal_graph
        outputs = node.layer(*args, **kwargs)
    /Users/sarvesh/.local/lib/python3.7/site-packages/tensorflow/python/keras/engine/base_layer.py:976 __call__
        self.name)
    /Users/sarvesh/.local/lib/python3.7/site-packages/tensorflow/python/keras/engine/input_spec.py:216 assert_input_compatibility
        ' but received input with shape ' + str(shape))

    ValueError: Input 0 of layer dense_1 is incompatible with the layer: expected axis -1 of input shape to have value 100 but received input with shape [None, 200]
