In [None]:
#@title Mount drive files
# Colab helper used to attach Google Drive to the runtime's filesystem.
from google.colab import drive

# Mount Drive under /content/drive/; the dataset and model paths used in the
# following cells all live below this mount point.
drive.mount('/content/drive/')

In [None]:
#@title Install necessary libraries

# Parquet reader used to load the train/test splits.
!pip install fastparquet
# Text clean-up helper (`cleantext.clean`) used in the "Cleaning descriptions" cell.
!pip install -q clean-text[gpl]
# Persian NLP toolkit (Normalizer, Lemmatizer, word_tokenize).
!pip install hazm
# !pip install transformers
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fa.300.bin.gz
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fa.300.vec.gz
# Copy the fastText Persian vectors from Drive and unpack them to
# /content/cc.fa.300.vec, which the "Get Embeddings by fastText" cell reads.
!cp "/content/drive/MyDrive/Projects/datacamp/Text/cc.fa.300.vec.gz" -r "/content"
!gunzip /content/cc.fa.300.vec.gz
# !rm "/content/cc.fa.300.vec.gz"
# NOTE(review): `import fasttext` is commented out in the imports cell, so this
# install looks unused — confirm before removing.
!pip install fasttext

In [None]:
#@title Load libraries

import os
import pandas as pd
import numpy as np
import re
import tensorflow as tf
import tensorflow_datasets as tfds
# import fasttext
import keras.backend as K

from fastparquet import ParquetFile
from json import loads
from cleantext import clean
from hazm import Normalizer, Lemmatizer, word_tokenize
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# from transformers import BertTokenizer, BasicTokenizer, TFBertModel
# from transformers import TFBertPreTrainedModel, TFBertForSequenceClassification
# from transformers import glue_convert_examples_to_features, InputExample

from keras.models import Model, Sequential
from keras.models import load_model
from keras.layers import Dense, Embedding, Dropout, BatchNormalization, Flatten
from keras.layers import GlobalMaxPool1D, MaxPooling1D, GlobalMaxPooling1D
from keras.layers import Conv1D, LSTM, GRU, Bidirectional, SimpleRNN
from keras.layers import multiply, Input, Concatenate
from tensorflow.keras.optimizers import Adam, schedules
from keras.regularizers import l2
from keras import losses, metrics

In [None]:
#@title Load Dataset

# Locations of the raw inputs on the mounted Drive.
test_dir = '/content/drive/MyDrive/Projects/datacamp/Text/datasets/future_test.parquet'
reject_dir = '/content/drive/MyDrive/Projects/datacamp/Text/datasets/reject_reasons_info.csv'
train_dir = '/content/drive/MyDrive/Projects/datacamp/Text/datasets/train.parquet'

# Reject-reason lookup table (CSV) and the two parquet splits.
df_rejects = pd.read_csv(reject_dir)
pf_train = ParquetFile(train_dir)
pf_test = ParquetFile(test_dir)

# Materialise the parquet files as pandas DataFrames.
df_train = pf_train.to_pandas()
df_test = pf_test.to_pandas()


In [None]:
#@title Preview Data

# Column names, dtypes and non-null counts for both splits.
print('Train Data ============================================================')
df_train.info()
print()
print('Test Data =============================================================')
df_test.info()

# First rows of the training split and of the reject-reason table.
display(df_train.head(10))
print()
display(df_rejects.head(10))

In [None]:
#@title Modify dataset

def get_features(row, feature):
    """Return ``row[feature]``, or ``np.nan`` when the lookup is not possible.

    Parameters
    ----------
    row : object
        Usually a dict decoded from JSON, but may be anything (``None``,
        a float NaN, a list, ...) when the source record is missing.
    feature : hashable
        Key (or index) to read from ``row``.

    Returns
    -------
    The stored value, or ``np.nan`` if ``row`` is not subscriptable or does
    not contain ``feature``.
    """
    try:
        return row[feature]
    # Only lookup failures mean "feature missing". A bare ``except`` would
    # also swallow KeyboardInterrupt / SystemExit and hide real errors.
    except (KeyError, IndexError, TypeError):
        return np.nan

# Quick look at the raw review labels present in each split.
print(df_train.review_label.unique())
print(df_test.review_label.unique())

# df_train['label'] = df_train['reject_reason_id'].apply(
                            # lambda rec: 0 if rec == 0 else 1)

# Binary target: 1 when the post was rejected, 0 otherwise.
df_train['label'] = (df_train['review_label'] == 'reject').astype(int)

# Fix: the test labels must come from the *test* split. The original read
# df_train['review_label'] here, which assigned training labels to test rows
# by index alignment.
df_test['label'] = (df_test['review_label'] == 'reject').astype(int)

# 'interview_post' holds a JSON document per row; decode it to a dict.
df_train['interview_post'] = df_train['interview_post'].apply(loads)
df_test['interview_post'] = df_test['interview_post'].apply(loads)

# ['category', 'description', 'elevator', 'floor', 'location',
#  'other_options_and_attributes', 'parking', 'price', 'rooms',
#  'size', 'title', 'user_type', 'warehouse', 'year']

# The keys of the first training record define the columns to extract.
# Frozen into a list so later changes to the frame cannot affect iteration.
total_features = list(df_train.interview_post.iloc[0].keys())
price_features = ['mode', 'value']

for frame in (df_train, df_test):
    # One column per top-level JSON key (NaN where a record lacks the key).
    for feature in total_features:
        frame[feature] = frame['interview_post'].apply(
                                lambda rec, key=feature: get_features(rec, key))

    # 'price' is itself a nested record; flatten it to price_mode / price_value.
    for item in price_features:
        frame['price_' + item] = frame['price'].apply(
                                lambda rec, key=item: get_features(rec, key))

    # The raw containers are no longer needed once they are flattened.
    frame.drop(columns=['interview_post', 'price'], inplace=True)

display(df_train.head())

In [None]:
#@title Plot distribution of rejects

# Distinct reject-reason ids; their count is reused as the histogram bin count.
bins = (df_train.reject_reason_id.unique())
print(f"Reject reasons are {bins}")
print(f"Bins value is {len(bins)}!")
print()
# Histograms of the reject reason id and of the binary label
# (trailing ';' suppresses the Axes repr in the notebook output).
df_train[['reject_reason_id', 'label']].hist(bins=len(bins));

In [None]:
#@title Cleaning descriptions

def cleanhtml(raw_html):
    """Remove every ``<...>`` span from ``raw_html`` and return the result.

    The match is non-greedy and does not cross newlines, so each tag on a
    single line is removed individually.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', raw_html)


# Emoji, pictographs, dingbats and directional-isolate marks to strip.
# Compiled once here instead of on every call to cleaning().
# U+200C (zero-width non-joiner) is deliberately absent from the class; it was
# commented out in the original list (presumably because Persian spelling
# relies on it — confirm).
_WEIRD_PATTERN = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u'\U00010000-\U0010ffff'
    u"\u200d"
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\u3030"
    u"\ufe0f"
    u"\u2069"
    u"\u2066"
    # u"\u200c"
    u"\u2068"
    u"\u2067"
    "]+", flags=re.UNICODE)

# hazm Normalizer shared by all calls; created lazily on first use so that
# defining this cell does not itself require hazm to be initialised.
_CLEANING_NORMALIZER = None


def cleaning(text):
    """Clean one free-text field (description or title) and return it.

    Steps, in order:
      1. strip leading/trailing whitespace;
      2. ``cleantext.clean``: fix unicode, lowercase, drop line breaks, and
         replace URLs, e-mails, phone numbers and currency symbols with "";
      3. remove HTML tags via ``cleanhtml``;
      4. normalise with hazm's ``Normalizer``;
      5. remove emoji / pictographs / directional marks;
      6. remove ``#`` and collapse whitespace runs to a single space.

    Parameters
    ----------
    text : str
        Raw text. Must be a string (``.strip()`` is called on it).

    Returns
    -------
    str
        The cleaned text.
    """
    global _CLEANING_NORMALIZER

    text = text.strip()

    # regular cleaning
    text = clean(text,
        fix_unicode=True,
        to_ascii=False,
        lower=True,
        no_line_breaks=True,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=True,
        no_punct=False,
        replace_with_url="",
        replace_with_email="",
        replace_with_phone_number="",
        replace_with_number="",
        replace_with_digit="0",
        replace_with_currency_symbol="",
    )

    # cleaning htmls
    text = cleanhtml(text)

    # normalizing (the Normalizer is built once and reused across rows)
    if _CLEANING_NORMALIZER is None:
        _CLEANING_NORMALIZER = Normalizer()
    text = _CLEANING_NORMALIZER.normalize(text)

    # removing weird patterns
    text = _WEIRD_PATTERN.sub('', text)

    # removing hashtags, then collapsing whitespace runs. The raw string
    # avoids the invalid "\s" escape of the original non-raw literal.
    text = text.replace("#", "")
    text = re.sub(r"\s+", " ", text)

    return text

for frame in (df_train, df_test):
    # Clean the free-text columns. Every column is cast to str first (the
    # original did this only for the test descriptions) so a missing value
    # cannot crash cleaning(), which calls .strip() on its input.
    for column in ('description', 'title'):
        frame[column] = frame[column].astype(str).apply(cleaning)

    # 'location' is a nested record; pull out its 'city' entry (NaN if absent).
    frame['city'] = frame['location'].apply(
                                    lambda rec: get_features(rec, 'city'))

    # Single text field fed to the tokenizers. Fix: the parts are joined with
    # spaces. The original concatenated them directly, fusing the last word
    # of one field with the first word of the next
    # (e.g. "...<title><city><price>").
    frame['context'] = (frame['description'] + ' ' + frame['title'] + ' ' +
                        frame['city'].astype(str) + ' ' +
                        frame['price_value'].astype(str))

    # Missing prices become -1. Assigned back instead of using
    # `frame[col].fillna(..., inplace=True)`, which acts on a temporary under
    # pandas copy-on-write and is deprecated.
    # NOTE(review): this runs after 'context' is built, as in the original, so
    # missing prices appear there as the text 'nan' — confirm that is intended.
    frame['price_value'] = frame['price_value'].fillna(-1)

    # Labels as strings, matching the BERT label_list ['0', '1'] used below.
    frame['label'] = frame['label'].astype(str)

# Hold out 20% of the training rows for validation. The split is seeded so the
# notebook reproduces on re-run.
train_data, val_data = train_test_split(df_train, test_size=0.2,
                                        random_state=0)


In [None]:
#@title Bert preprocessing

def convert_data_into_input_example(data):
    """Build one ``InputExample`` per row of ``data``.

    Each example uses the row's 'description' as ``text_a`` and its 'label' as
    ``label``; ``guid`` and ``text_b`` are left as ``None``. Rows are visited
    positionally, in frame order.

    Returns a list of ``InputExample`` objects (empty for an empty frame).
    """
    rows = (data.iloc[position] for position in range(len(data)))
    return [
        InputExample(guid=None,
                     text_a=row['description'],
                     text_b=None,
                     label=row['label'])
        for row in rows
    ]


# NOTE(review): `InputExample`, `glue_convert_examples_to_features`,
# `tokenizer` and `model` depend on the `transformers` imports and the two
# `from_pretrained(...)` lines that are commented out in this notebook; they
# have to be restored before this cell can run on a fresh kernel.
train_examples = convert_data_into_input_example(train_data)
val_examples = convert_data_into_input_example(val_data)
test_examples = convert_data_into_input_example(df_test)

# tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased',
                                        # do_lower_case=False)

label_list = ['0', '1']

# Arguments shared by the three feature conversions. Keeping them in one place
# avoids per-call typos; the original test call passed the garbled keyword
# `tglue_convert_examples_to_featuresask='mrpc'` instead of `task='mrpc'`.
_glue_kwargs = dict(tokenizer=tokenizer,
                    max_length=128,
                    task='mrpc',
                    label_list=label_list)

bert_train_dataset = glue_convert_examples_to_features(examples=train_examples,
                                                       **_glue_kwargs)

bert_val_dataset = glue_convert_examples_to_features(examples=val_examples,
                                                     **_glue_kwargs)

bert_test_dataset = glue_convert_examples_to_features(examples=test_examples,
                                                      **_glue_kwargs)

# model = TFBertForSequenceClassification.from_pretrained(
                                                # 'bert-base-multilingual-cased')

optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5,
                                     epsilon=1e-08,
                                     clipnorm=1.0)
# from_logits=True assumes the model returns raw logits rather than
# probabilities — confirm against the model that is loaded above.
loss = losses.SparseCategoricalCrossentropy(from_logits=True)
metric = metrics.SparseCategoricalAccuracy('accuracy')
model.compile(loss=loss,
              optimizer=optimizer,
              metrics=[metric])

In [None]:
#@title Bert Training

EPOCHS = 4 #@param {type: 'number'}

def modify_ds(bdset):
    """Stack per-example BERT features into arrays.

    ``bdset`` is an iterable of objects exposing ``input_ids``,
    ``attention_mask``, ``token_type_ids`` and ``label`` attributes.

    Returns ``([input_ids, attention_mask, token_type_ids], label)`` where
    every element is the ``np.vstack`` of that attribute over all examples.
    """
    examples = list(bdset)

    def _stack(attribute):
        # One row per example, in iteration order.
        return np.vstack([getattr(example, attribute) for example in examples])

    features = [_stack('input_ids'),
                _stack('attention_mask'),
                _stack('token_type_ids')]
    return (features, _stack('label'))

def example_to_features(input_ids, attention_masks, token_type_ids, y):
    """Pack the three BERT inputs into a dict keyed by input name.

    Returns ``(features, y)`` where ``features`` has the keys 'input_ids',
    'attention_mask' and 'token_type_ids'; ``y`` is passed through unchanged.
    """
    features = dict(input_ids=input_ids,
                    attention_mask=attention_masks,
                    token_type_ids=token_type_ids)
    return features, y


# Stack the converted examples into arrays: x_* is
# [input_ids, attention_mask, token_type_ids], y_* the labels.
x_train, y_train = modify_ds(bert_train_dataset)
x_val, y_val = modify_ds(bert_val_dataset)
x_test, y_test = modify_ds(bert_test_dataset)

# tf.data pipelines. Only the training set is shuffled.
train_ds = tf.data.Dataset.from_tensor_slices((x_train[0], x_train[1],
                                               x_train[2], y_train)).map(example_to_features).shuffle(100).batch(32)
val_ds   = tf.data.Dataset.from_tensor_slices((x_val[0], x_val[1],
                                               x_val[2], y_val)).map(example_to_features).batch(64)
test_ds   = tf.data.Dataset.from_tensor_slices((x_test[0], x_test[1],
                                               x_test[2], y_test)).map(example_to_features).batch(64)


# Fix: the original format string had no '{}' placeholder, so the argument was
# silently dropped and only the bare prefix was printed.
print('format of model input examples: {}'.format(train_ds.take(1)))

history = model.fit(train_ds,
                    validation_data=val_ds,
                    epochs=EPOCHS)

model.save('/content/drive/MyDrive/Projects/datacamp/Text/bert_model.h5')

In [None]:
#@title Bert Test

# Reload the model saved by the "Bert Training" cell and score it on test_ds.
# NOTE(review): no custom_objects are passed to load_model — confirm the saved
# BERT model can be restored this way.
model = load_model('/content/drive/MyDrive/Projects/datacamp/Text/bert_model.h5')
test_result = model.evaluate(test_ds)
predictions = model.predict(test_ds)

In [None]:
#@title Tokenization and cleaning

# Punctuation marks removed from every token by clean_doc().
puncs = ['،', '.', ',', ':', ';', '"']
# hazm helpers, created once here and reused by clean_doc().
normalizer = Normalizer()
lemmatizer = Lemmatizer()

# turn a doc into clean tokens
def clean_doc(doc):
    """Turn a document into a space-joined string of cleaned, lemmatized tokens.

    The document is normalized and tokenized with hazm, every character listed
    in ``puncs`` is removed from each token, tokens of length <= 1 and
    all-digit tokens are dropped, and the survivors are lemmatized.
    """
    def _strip_puncs(token):
        # Remove each configured punctuation mark wherever it occurs.
        for mark in puncs:
            token = token.replace(mark, '')
        return token

    normalized = normalizer.normalize(doc)
    stripped = (_strip_puncs(raw) for raw in word_tokenize(normalized))
    # Stop-word removal was already disabled in the original:
    # tokens = [w for w in tokens if not w in stop_set]
    kept = [tok for tok in stripped if len(tok) > 1 and not tok.isdigit()]
    return ' '.join(lemmatizer.lemmatize(tok) for tok in kept)

In [None]:
#@title Prepare data

NUM_WORDS = 2000
NUM_CLASSES = 2
TEST_SIZE = 0.2

# Keras tokenizer limited to the NUM_WORDS most frequent words.
tokenizer = Tokenizer(num_words=NUM_WORDS)

x_train = np.array(df_train['context'])
y_train = np.array(df_train['label'])

x_test = np.array(df_test['context'])
y_test = np.array(df_test['label'])

# Apply preprocessing step to training data and test data
train_docs = np.array([clean_doc(document) for document in x_train],
                      dtype=object)
test_docs = np.array([clean_doc(document) for document in x_test],
                     dtype=object)

# The vocabulary is fitted on the training documents only.
tokenizer.fit_on_texts(train_docs)
# Pad every sequence to the longest training document (in tokens).
max_length = max(len(s.split()) for s in train_docs)

# Embed and Pad embeded training sequences
encoded_train = tokenizer.texts_to_sequences(train_docs)
encoded_test = tokenizer.texts_to_sequences(test_docs)

x_train_padded = pad_sequences(encoded_train, maxlen=max_length,
                               padding='post')
x_test_padded = pad_sequences(encoded_test, maxlen=max_length,
                              padding='post')

# Fix: word indices start at 1 (0 is the padding value), so an Embedding
# needs input_dim = largest index + 1. The original `len(word_index)` was one
# short whenever the vocabulary had fewer than NUM_WORDS entries.
vocab_size = len(tokenizer.word_index) + 1

# Labels were stored as the strings '0'/'1'; convert them to ints explicitly
# before one-hot encoding instead of relying on implicit coercion.
categorical_y_train = to_categorical(y_train.astype(int), NUM_CLASSES)
categorical_y_test = to_categorical(y_test.astype(int), NUM_CLASSES)

In [None]:
#@title Build CNN model 

# NOTE(review): this list is built but not passed to compile() below.
Metrics = [
    metrics.Precision(name="precision"),
    metrics.Accuracy(name='accuracy'),
    metrics.Recall(name="recall"),
    metrics.AUC(name='auc'),
    metrics.AUC(name='prc', curve='PR'),
]

# Three Conv1D stages with growing kernel sizes (4, 8, 16) on top of a
# trainable 400-dim embedding, followed by a dense head with softmax output.
model_cnn = Sequential([
    Embedding(vocab_size, 400, input_length=max_length),

    # Stage 1
    Conv1D(filters=64, kernel_size=4, activation='relu',
           padding='same', kernel_regularizer=l2(0.1)),
    BatchNormalization(),
    MaxPooling1D(pool_size=2),
    Dropout(0.3),

    # Stage 2
    Conv1D(filters=64, kernel_size=8, activation='relu',
           padding='same', kernel_regularizer=l2(0.1)),
    MaxPooling1D(pool_size=2),
    Dropout(0.3),

    # Stage 3, reduced to one vector per document
    Conv1D(filters=64, kernel_size=16, activation='relu',
           padding='same', kernel_regularizer=l2(0.1)),
    GlobalMaxPooling1D(),
    Dropout(0.5),

    # Classifier head
    Dense(300, activation="relu"),
    Dropout(0.5),

    # Dense(100, activation="relu"),
    # Dropout(0.5),

    # Dense(100, activation="relu"),
    # Dropout(0.5),

    Dense(NUM_CLASSES, activation='softmax'),
])

# lr_schedule = schedules.ExponentialDecay(
#                                         initial_learning_rate=4e-3,
#                                         decay_steps=2,
#                                         decay_rate=0.99999
#                                         )

optimizer = Adam(learning_rate=5e-4,
                 beta_1=0.9,
                 beta_2=0.999,
                 epsilon=1e-07,
                 amsgrad=False)

# One-hot targets, so categorical cross-entropy / categorical accuracy.
model_cnn.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=metrics.categorical_accuracy)

In [None]:
#@title Training CNN

# Colab form fields: mini-batch size and number of passes over the data.
BATCH_SIZE = 64 #@param {type:"number"}
EPOCHS = 10 #@param {type:"number"}

# Train model
# validation_split=0.2 holds out part of the padded training data for
# validation; shuffle=True reshuffles the training samples each epoch.
history_cnn = model_cnn.fit(x_train_padded,
                            categorical_y_train,
                            batch_size=BATCH_SIZE,
                            epochs=EPOCHS,
                            validation_split=0.2,
                            shuffle=True)

# Persist the trained model to Drive in HDF5 format.
model_cnn.save('/content/drive/MyDrive/Projects/datacamp/Text/cnn_model.h5')

In [None]:
#@title CNN Test

# Half-width of the "uncertain" band around 0.5; also read by the
# "Test FastText models" cell.
threshold = 0.1

# Fix: evaluate the model that was actually reloaded from disk. The original
# stored it in `model` and then evaluated the in-memory `model_cnn`, so the
# saved file was never tested (and `model` was clobbered as a side effect).
model_cnn = load_model('/content/drive/MyDrive/Projects/datacamp/Text/cnn_model.h5')
test_result = model_cnn.evaluate(x_test_padded, categorical_y_test)
predictions = model_cnn.predict(x_test_padded)

In [None]:
#@title Get dictionary

def get_dict(df):
    """Count token occurrences over ``df.context``.

    For every context string the last whitespace-separated token is appended
    once more to the end of the string (no separator), runs of ASCII letters
    are split off from the surrounding characters, and the resulting
    whitespace-separated tokens are counted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``context``.

    Returns
    -------
    dict
        Mapping ``token -> number of occurrences``. Empty or whitespace-only
        contexts contribute nothing (the original raised IndexError on them).
    """
    wordDict = {}
    for row in df.context:
        tokens = row.split()
        if not tokens:
            # No tokens at all: nothing to count and no "last token" to read.
            continue

        # NOTE(review): the last token (named `price` in the original) is glued
        # onto the end without a space, so the final token is counted in its
        # doubled form. Kept as-is; confirm this is intended.
        row = row + tokens[-1]

        # The capturing group keeps the Latin-letter runs as their own pieces.
        pieces = re.split(r'([a-zA-Z]+)', row)
        for wrd in " ".join(pieces).split():
            wordDict[wrd] = wordDict.get(wrd, 0) + 1
    return wordDict

# Token-frequency dictionaries per split; they are used below to select which
# fastText vectors get loaded.
train_wordDict = get_dict(df_train)
test_wordDict = get_dict(df_test)


In [None]:
#@title Tokenize data

# Vocabulary cap for the tokenizer and fixed length of every padded sequence.
MAX_NB_WORDS = 55000
MAX_SEQUENCE_LENGTH = 500

content_train = df_train['context']
content_test = df_test['context']

# Labels as stored in the frames (strings '0'/'1' after the cleaning cell).
y_train = np.array(df_train['label'])
y_test = np.array(df_test['label'])

# The vocabulary is fitted on the training text only.
# NOTE: this rebinds `tokenizer`, which an earlier cell also assigns.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(content_train)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(content_train)
test_sequences = tokenizer.texts_to_sequences(content_test)

# NOTE: `train_data` previously held the train split produced by
# train_test_split in the cleaning cell; from here on it is the padded
# integer matrix instead.
train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# x_train, x_val, y_train, y_val  = train_test_split(train_data, y_train,
                                                #    test_size=0.2, random_state=0)


In [None]:
#@title Get Embeddings by fastText

# Location of the unpacked fastText vectors (see the install cell, which
# gunzips cc.fa.300.vec.gz into /content).
fastTextDir = '/content/'
fastText_path = os.path.join(fastTextDir, 'cc.fa.300.vec')

def get_embedding(wordDict, path=None):
    """Load the fastText vectors of the words that occur in ``wordDict``.

    Parameters
    ----------
    wordDict : container of str
        Words of interest; only membership (``word in wordDict``) is used.
    path : str, optional
        Text-format vector file to read. Defaults to the notebook-level
        ``fastText_path``.

    Returns
    -------
    dict
        Mapping ``word -> np.ndarray`` (float32) for every requested word
        found in the file with a parseable vector.

    Notes
    -----
    * A first line consisting of exactly two integers is treated as the file
      header and skipped; the original would store it as a word if it was in
      ``wordDict``.
    * A line whose vector cannot be parsed is reported and skipped. The
      original carried on and stored the previous line's vector under the
      new word (or hit a NameError if it was the first line).
    * Vectors are parsed only for requested words, so the float conversion is
      not done for lines whose word is not needed.
    """
    if path is None:
        path = fastText_path

    embeddings_index = {}
    with open(path, encoding='utf8') as infile:
        for line_no, line in enumerate(infile):
            values = line.split()
            if not values:
                # Blank line: nothing to read.
                continue
            if (line_no == 0 and len(values) == 2
                    and all(v.isdigit() for v in values)):
                # "<number of words> <dimension>" header, not a word entry.
                continue

            word = values[0]
            if word not in wordDict:
                continue

            try:
                coefs = np.asarray(values[1:], dtype='float32')
            except ValueError:
                print("Warnning"+str(values)+" in" + str(line))
                continue
            embeddings_index[word] = coefs
    return embeddings_index

# Each call reads the whole vector file once, keeping only the words of the
# given split.
# NOTE(review): only `train_embeddings` is used by the embedding-matrix cell
# below; `test_embeddings` looks unused in the visible code — confirm.
train_embeddings = get_embedding(train_wordDict)
test_embeddings = get_embedding(test_wordDict)

In [None]:
#@title Get Embedding matrix

# Vector size; must match the dimensionality of the loaded fastText vectors.
EMBEDDING_DIM = 300
embeddings_index = train_embeddings
# One row per tokenizer index plus row 0 (word_index starts at 1). Rows start
# as uniform random values in [0, 1) and keep them for words that have no
# pretrained vector. NOTE: no random seed is set, so those rows differ
# between runs.
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Overwrite the random row with the pretrained fastText vector.
        embedding_matrix[i] = embedding_vector


In [None]:
#@title RNN model

nClasses = 2

# Architecture selector:
#   1 -> GRU -> Conv1D -> MaxPooling1D -> LSTM on the text input only
#   2 -> GRU text branch concatenated with a dense branch on the price input
# Renamed from `type`, which shadowed the Python builtin for the rest of the
# notebook.
RNN_VARIANT = 1

# Text input: all columns of the padded sequence except the last one (the
# training cell feeds `train_data[:, :-1]` here and `train_data[:, -1]` to
# the 'price' input).
input1 = Input((MAX_SEQUENCE_LENGTH - 1, ), name='context')
layerM1Embedding = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            # Fix: must equal the Input length above; the
                            # original declared MAX_SEQUENCE_LENGTH here.
                            input_length=MAX_SEQUENCE_LENGTH - 1,
                            trainable=True)(input1)

input2 = Input((1,), name='price')
if RNN_VARIANT == 1:
    # NOTE(review): input2 is not connected to the output in this variant.
    layer = GRU(100, dropout=0.2, recurrent_dropout=0.2)(layerM1Embedding)
    # Add a trailing channel axis so the GRU output can be fed to Conv1D.
    # Uses the notebook-level `tf` import instead of a mid-notebook import.
    layer = tf.expand_dims(layer, axis=-1)
    layer = Conv1D(filters=64, kernel_size=2, padding='same',
                   activation='relu')(layer)
    layer = MaxPooling1D(pool_size=2)(layer)
    layer = LSTM(200, dropout=0.2, recurrent_dropout=0.2)(layer)
elif RNN_VARIANT == 2:
    layerM1 = GRU(100, dropout=0.2, recurrent_dropout=0.2)(layerM1Embedding)
    layerM1 = Dense(nClasses, activation='softmax')(layerM1)
    layerM2 = Dense(nClasses, activation='softmax')(input2)
    layer = Concatenate()([layerM1, layerM2])
else:
    # The original fell through and failed later with a NameError on `layer`.
    raise ValueError(f"Unsupported RNN_VARIANT: {RNN_VARIANT!r} (expected 1 or 2)")

out = Dense(nClasses, activation='softmax')(layer)
# The original also created a throw-away `Sequential()` before this point; it
# was overwritten here and has been dropped.
model = Model(inputs=[input1, input2], outputs=out)

# NOTE(review): this Adam instance is not used — compile() below is given
# 'rmsprop'. Kept unchanged; decide which optimizer is intended.
optimizer = Adam(learning_rate=5e-4, beta_1=0.9, beta_2=0.999,
                epsilon=1e-07, amsgrad=False)

# Fix: the output layer already applies softmax, so the loss receives
# probabilities. The original passed from_logits=True, which applies softmax
# a second time inside the loss.
model.compile(loss=losses.SparseCategoricalCrossentropy(from_logits=False),
              optimizer='rmsprop',
              metrics=[metrics.SparseCategoricalAccuracy('accuracy')])


In [None]:
#@title CNN model

nClasses = 2

# Embedding initialised from the fastText-based matrix and fine-tuned during
# training (trainable=True).
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH, ))
embedded_sequences = embedding_layer(sequence_input)

# Three Conv1D(256, 5) + MaxPooling1D(5) stages, then a dense head.
x = BatchNormalization()(embedded_sequences)
x = Conv1D(256, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(256, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(256, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Flatten()(x)
x = Dense(256, activation='relu')(x)
preds = Dense(nClasses, activation='softmax')(x)
# NOTE(review): this rebinds the global `model`, which the RNN cell also
# assigns; the training/test cells use whichever was defined last.
model = Model(sequence_input, preds)

# NOTE(review): this Adam instance is not used — compile() below is given
# 'rmsprop'. Kept unchanged; decide which optimizer is intended.
optimizer = Adam(learning_rate=5e-4, beta_1=0.9, beta_2=0.999,
                epsilon=1e-07, amsgrad=False)

# Fix: `preds` is a softmax output (probabilities), so from_logits must be
# False. The original's from_logits=True applied softmax twice.
model.compile(loss=losses.SparseCategoricalCrossentropy(from_logits=False),
                optimizer='rmsprop',
                metrics=[metrics.SparseCategoricalAccuracy('accuracy')])



In [None]:
#@title Training Model

model_type = 'CNN' #@param {type: "string"}
BATCH_SIZE = 64 #@param {type:"number"}
EPOCHS =   4 #@param {type:"number"}

# NOTE(review): `model` is whichever of the "RNN model" / "CNN model" cells was
# executed last; make sure it matches `model_type` before running this cell.
if model_type == "RNN":
    # Two inputs: all but the last column as text, the last column as 'price'.
    train_inputs = [train_data[:, :-1], train_data[:, -1]]
    # Fix: saved under the name the "Test FastText models" cell loads; the
    # original wrote 'rnnft1_model.h5', which that cell never reads.
    save_path = '/content/drive/MyDrive/Projects/datacamp/Text/rnnft_model.h5'
elif model_type == "CNN":
    train_inputs = train_data
    save_path = '/content/drive/MyDrive/Projects/datacamp/Text/cnnft_model.h5'
else:
    # The original silently trained nothing for an unrecognised value.
    raise ValueError(f"Unknown model_type {model_type!r}; expected 'RNN' or 'CNN'")

# Labels are stored as strings; cast them to numbers for the sparse loss.
model.fit(train_inputs, y_train.astype(float),
          validation_split=0.2,
          epochs=EPOCHS,
          batch_size=BATCH_SIZE)
model.save(save_path)


In [None]:
#@title Test FastText models


model_type = 'CNN' #@param {type: "string"}
if model_type == 'RNN':
    model = load_model('/content/drive/MyDrive/Projects/datacamp/Text/rnnft_model.h5')
    # Same two-input layout that the RNN is trained with.
    test_inputs = [test_data[:, :-1], test_data[:, -1]]
elif model_type == 'CNN':
    model = load_model('/content/drive/MyDrive/Projects/datacamp/Text/cnnft_model.h5')
    test_inputs = test_data
else:
    raise ValueError(f"Unknown model_type {model_type!r}; expected 'RNN' or 'CNN'")

# Fix: evaluate() needs the targets to compute loss and accuracy; the original
# passed only the inputs.
test_result = model.evaluate(test_inputs, y_test.astype(float))
predictions = model.predict(test_inputs)

# Three-way decision on the predicted probability of class 1 ("reject"):
#   1 -> confidently reject   (p > 0.5 + threshold)
#   0 -> confidently accept   (p < 0.5 - threshold)
#   2 -> uncertain            (everything in between)
# `threshold` is defined in the "CNN Test" cell.
# Fix: the original referenced an undefined name `arr` and overwrote `label`
# three times, so it raised NameError and could never hold the combined result.
reject_prob = predictions[:, 1]
label = np.select(
    [reject_prob > (0.5 + threshold), reject_prob < (0.5 - threshold)],
    [1, 0],
    default=2)