# RAPIDS cuML TfidfVectorizer and KNN to find similar Text and Images
In this notebook we use RAPIDS cuML's TfidfVectorizer and cuML's KNN to find items with similar titles and items with similar images. First we use RAPIDS cuML TfidfVectorizer to extract text embeddings of each item's title and then compare the embeddings using RAPIDS cuML KNN. Next we extract image embeddings of each item with EffNetB0 and compare them using RAPIDS cuML KNN.

# Load Libraries

In [None]:
# Notebook setup: import the numeric / deep-learning stack and report which
# framework versions and device are in use.
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import cv2, matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0
# Print the TensorFlow version (this output is wiped by clear_output() below).
print('TF',tf.__version__)
import torch
from IPython.display import Image, clear_output  # to display images

# Clear everything the cell has printed so far so that only the summary remains.
clear_output()
# Summary line: torch version plus the name of CUDA device 0, or 'CPU' when
# torch reports that CUDA is not available.
print(f"Setup complete. Using torch {torch.__version__} ({torch.cuda.get_device_properties(0).name if torch.cuda.is_available() else 'CPU'})")

In [None]:
!pip3 install psycopg2-binary 
!pip3 install langdetect

In [None]:
import abc


class Embedder(abc.ABC):
    """Abstract interface for models that turn raw inputs into embeddings.

    A subclass can only be instantiated once it implements both
    :meth:`fit` and :meth:`encode`.
    """

    @abc.abstractmethod
    def fit(self, X):
        """Fit the embedder on the training inputs.

        :param X: training inputs
        :return: nothing in this base implementation; ``Basepipeline.fit``
            forwards the return value to the regressor, so concrete
            embedders are expected to return the embeddings of ``X``
        """

    @abc.abstractmethod
    def encode(self, X):
        """Embed new inputs with the already fitted model.

        :param X: inputs to embed
        :return: nothing in this base implementation; concrete embedders
            return the embeddings of ``X``
        """


In [None]:
import abc


class Regressor(abc.ABC):
    """Abstract interface for regression models used by the pipeline.

    A subclass can only be instantiated once it implements all four
    methods: :meth:`fit`, :meth:`predict`, :meth:`save` and :meth:`load`.
    """

    @abc.abstractmethod
    def fit(self, X, y):
        """Train the model.

        :param X: training features
        :param y: training targets
        """

    @abc.abstractmethod
    def predict(self, X):
        """Predict targets for the given features.

        :param X: features to predict for
        :return: nothing in this base implementation; concrete regressors
            return their predictions
        """

    @abc.abstractmethod
    def save(self, path):
        """Persist the model.

        :param path: destination to write the model to
        """

    @abc.abstractmethod
    def load(self, path):
        """Restore a previously saved model.

        :param path: location to read the model from
        """


In [None]:
import os
from typing import List
import numpy as np


class Basepipeline:
    """Two-stage pipeline: an embedder followed by a regressor.

    Both constructor arguments are classes (or any zero-argument callables),
    not instances: each one is called once, without arguments, to build the
    corresponding stage.
    """

    def __init__(self, Embedder, Regressor):
        """Instantiate both stages.

        :param Embedder: callable returning the embedding stage
        :param Regressor: callable returning the regression stage
        """
        self.embedder = Embedder()
        self.regressor = Regressor()

    def fit(self, X: List[str], y: List[float]):
        """Fit the embedder on ``X`` and train the regressor on its output.

        :param X: raw training inputs
        :param y: training targets
        """
        # The embedder's fit() result is used directly as the training
        # features, so it must return the embeddings of X.
        self.regressor.fit(self.embedder.fit(X), y)

    def predict(self, X: List[str]):
        """Embed ``X`` and return the regressor's predictions for it.

        :param X: raw inputs to predict for
        :return: whatever the regressor's ``predict`` returns
        """
        embeddings = self.embedder.encode(X)
        predictions = self.regressor.predict(embeddings)
        return predictions

    def save(self, odir):
        """Save both stages under ``odir``, creating the directory if needed.

        Each stage is asked to save itself to ``<odir>/<stage.name>``.

        :param odir: output directory
        """
        os.makedirs(odir, exist_ok=True)
        # Embedder first, then regressor.
        for stage in (self.embedder, self.regressor):
            stage.save(f'{odir}/{stage.name}')


In [None]:
from sklearn.metrics import mean_squared_error


def N_distance(y_true, y_pred, N):
    """
    Count the samples whose prediction errors are all within ``N``.

    :param y_true: actual targets, one entry per sample; an entry may be a
        scalar or a sequence/array of target values
    :param y_pred: predicted targets laid out like ``y_true``
    :param N: largest absolute error (inclusive) tolerated on every target
    :return: number of samples (a count, not a ratio) for which every
        absolute error is ``<= N``
    """
    import numpy as np  # local import keeps this function self-contained

    score = 0
    # zip() stops at the shorter input, so unmatched trailing samples are ignored.
    for y_i, yhat_i in zip(y_true, y_pred):
        # Converting to arrays makes plain lists and scalar (1-D) targets
        # work; they previously raised TypeError.
        errors = np.abs(np.asarray(y_i) - np.asarray(yhat_i))
        if np.all(errors <= N):
            score += 1
    return score


def rmse(y, yhat):
    """
    Root mean squared error, averaged uniformly over the output columns.

    The RMSE is computed per output column and the column values are then
    averaged, matching what ``mean_squared_error(..., squared=False)``
    returns in recent scikit-learn releases.

    :param y: actual M x N matrix (a length-M vector is treated as M x 1)
    :param yhat: predictions M x N matrix
    :return: rmse
    :raises ValueError: if the inputs are empty or their shapes do not match
    """
    # mean_squared_error(squared=False) is deprecated since scikit-learn 1.4
    # and removed in 1.6; root_mean_squared_error is its replacement.
    try:
        from sklearn.metrics import root_mean_squared_error
    except ImportError:
        root_mean_squared_error = None
    if root_mean_squared_error is not None:
        return root_mean_squared_error(y, yhat)

    # Fallback for environments without root_mean_squared_error.
    import numpy as np

    actual = np.asarray(y, dtype=float)
    predicted = np.asarray(yhat, dtype=float)
    # Treat vectors as single-column matrices so (M,) and (M, 1) agree.
    if actual.ndim == 1:
        actual = actual.reshape(-1, 1)
    if predicted.ndim == 1:
        predicted = predicted.reshape(-1, 1)
    if actual.shape != predicted.shape:
        raise ValueError(
            f'y and yhat have different shapes: {actual.shape} vs {predicted.shape}'
        )
    if actual.shape[0] == 0:
        raise ValueError('rmse requires at least one sample')

    per_output = np.sqrt(np.mean((actual - predicted) ** 2, axis=0))
    return float(np.mean(per_output))


In [None]:
from abc import ABC
from sklearn.feature_extraction.text import TfidfVectorizer
# from cuml.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
import numpy as np
import pickle
from nltk import download
download('punkt')


class TfidfModel(Embedder, ABC):
    """
    Tf-idf + PCA.

    Texts are lower-cased, tokenized and Porter-stemmed, vectorized with a
    ``TfidfVectorizer`` (at most 30000 features) and then reduced to 100
    principal components.
    """

    def __init__(self):
        # Empty name: Basepipeline.save() builds '<odir>/<name>', so the
        # prefix handed to save() ends with '/'.
        self.name = ''
        self.model = TfidfVectorizer(lowercase=True, max_features=30000)
        self.pca = PCA(n_components=100)

    def fit(self, X):
        """Fit the vectorizer and the PCA on ``X``.

        :param X: iterable of raw text strings
        :return: the PCA-reduced tf-idf matrix of ``X`` (one row per text)
        """
        print('Fitting the tfidf vectorizer...')
        tokenized_text = self.tokenize_text(X)
        # toarray() always yields a 2-D ndarray. The previous
        # todense() + np.squeeze() collapsed a single-sample or
        # single-feature matrix to 1-D, which PCA rejects.
        matrix = self.model.fit_transform(tokenized_text).toarray()
        print('Dimension of original tfidf matrix: ', matrix.shape)

        self.pca.fit(matrix)
        reduced_matrix = self.pca.transform(matrix)
        print('Dimension of reduced matrix: ', reduced_matrix.shape)
        print('Encoder fitting completed!')
        return reduced_matrix

    def encode(self, X):
        """Embed ``X`` with the already fitted vectorizer and PCA.

        :param X: iterable of raw text strings (a single-element iterable works)
        :return: the PCA-reduced tf-idf matrix of ``X`` (one row per text)
        """
        print('Encoding data...')
        tokenized_text = self.tokenize_text(X)
        # Keep the matrix 2-D even for one text (see fit()).
        matrix = self.model.transform(tokenized_text).toarray()
        return self.pca.transform(matrix)

    def load(self, path):
        """Placeholder: loading is not implemented and this call does nothing.

        :param path: ignored
        """
        # NOTE(review): save() pickles the vectorizer and the PCA, but nothing
        # reads them back yet.
        pass

    def save(self, odir):
        """Pickle the fitted vectorizer and PCA model.

        ``odir`` is used as a raw string prefix (no path separator is
        inserted): the files written are ``odir + 'tfidf_vectorizer'`` and
        ``odir + 'pca_model'``.

        :param odir: path prefix for the two output files
        """
        print('Saving model...')
        # Context managers make sure both files are flushed and closed.
        with open(odir + 'tfidf_vectorizer', 'wb') as vectorizer_file:
            pickle.dump(self.model, vectorizer_file)  # Save tfidf vectorizer
        with open(odir + 'pca_model', 'wb') as pca_file:
            pickle.dump(self.pca, pca_file)  # Save the PCA model
        print('Model saved!')

    def tokenize_item(self, item):
        """Tokenize one text and return the list of Porter stems.

        :param item: text to tokenize
        :return: list of stemmed tokens
        """
        # One stemmer per call instead of a new one for every token.
        stemmer = PorterStemmer()
        return [stemmer.stem(token) for token in word_tokenize(item)]

    def tokenize_text(self, text):
        """Lower-case, tokenize and stem every text.

        :param text: iterable of text strings
        :return: list of strings, each the space-joined stems of one text
        """
        return [' '.join(self.tokenize_item(txt.lower())) for txt in text]


In [None]:
from sklearn.datasets import make_regression
from sklearn.model_selection import RepeatedKFold
from keras.models import Sequential
from keras.layers import Dense
import keras
from keras import callbacks

import tensorflow as tf

print(tf.__version__)
print(tf.config.list_physical_devices())


class NeuralNetMulti(Regressor):
    """Fully connected Keras network for (multi-output) regression.

    Architecture built in :meth:`fit`: Dense 30 -> 20 -> 20 -> 10 (all ReLU)
    followed by a linear output layer with one unit per target.
    """

    def __init__(self):
        self.name = 'keras-sequential'
        self.model = Sequential()

    def fit(self, X, y):
        """Build the network for the shapes of ``X``/``y`` and train it.

        Every call trains from scratch on a freshly built model.

        :param X: feature matrix; ``X.shape[1]`` is the input size
        :param y: target array; 2-D with one column per output, or 1-D for a
            single output
        """
        print('Fitting into the neural net...')
        n_inputs = X.shape[1]
        # A 1-D target vector means a single output unit.
        n_outputs = y.shape[1] if y.ndim > 1 else 1
        # Start from an empty model so that calling fit() again does not
        # stack a second set of layers on top of the first.
        self.model = Sequential()
        self.model.add(Dense(30, input_dim=n_inputs, kernel_initializer='he_uniform', activation='relu'))
        self.model.add(Dense(20, activation='relu'))
        self.model.add(Dense(20, activation='relu'))
        self.model.add(Dense(10, activation='relu'))
        self.model.add(Dense(n_outputs))
        self.model.summary()
        self.model.compile(loss='mae', optimizer='adam', metrics=['mse', 'mae'])
        # NOTE: training always runs the full 1000 epochs; no early-stopping
        # callback is configured.
        self.model.fit(X, y, verbose=1, epochs=1000)
        print('Fitting completed!')

    def predict(self, X):
        """Return the model's predictions for ``X``.

        :param X: feature matrix
        :return: output of the Keras model's ``predict``
        """
        print('Predicting...')
        predictions = self.model.predict(X)
        print('Predicted!')
        return predictions

    def save(self, path):
        """Save the Keras model to ``path``.

        :param path: destination passed to the Keras model's ``save``
        """
        print('Saving model to ', path, '...')
        self.model.save(path)
        print('Model saved')

    def load(self, path):
        """Load a saved Keras model from ``path`` into this regressor.

        :param path: location passed to ``keras.models.load_model``
        """
        print('Loading model...')
        # Store the result on the instance; it used to be bound to a local
        # variable and was discarded immediately.
        self.model = keras.models.load_model(path)
        print('Model loaded!')

    # def get_dataset(self):
    #     X, y = make_regression(n_samples=1000, n_features=10, n_informative=5, n_targets=3, random_state=2)
    #     return X, y

    # evaluate a model using repeated k-fold cross-validation
    # def evaluate_model(self, X, y):
    #     results = list()
    #     n_inputs, n_outputs = X.shape[1], y.shape[1]
    #     # define evaluation procedure
    #     cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    #     # enumerate folds
    #     for train_ix, test_ix in cv.split(X):
    #         # prepare data
    #         X_train, X_test = X[train_ix], X[test_ix]
    #         y_train, y_test = y[train_ix], y[test_ix]
    #         # define model
    #         model = get_model(n_inputs, n_outputs)
    #         # fit model
    #         model.fit(X_train, y_train, verbose=0, epochs=100)
    #         # evaluate model on test set
    #         mae = model.evaluate(X_test, y_test, verbose=0)
    #         # store result
    #         print('>%.3f' % mae)
    #         results.append(mae)
    #     return results
# Recommended reading on how `units` works in a Keras Dense layer:
# https://stackoverflow.com/questions/56299770/units-in-dense-layer-in-keras/56302896


In [None]:
import pandas as pd
import psycopg2


# query = '''SELECT big5_openness, big5_conscientiousness, big5_extraversion, big5_agreeableness, big5_neuroticism, input_text  FROM data_personality_analiser_nlp where input_text IS NOT NULL and input_text <> '' '''

class Connector:
    """Thin wrapper around a psycopg2 connection to the NLP PostgreSQL database.

    Connection settings can be overridden with the environment variables
    ``NLP_DB_HOST``, ``NLP_DB_USER``, ``NLP_DB_PASSWORD``, ``NLP_DB_NAME`` and
    ``NLP_DB_PORT``; when a variable is unset the original built-in value is
    used. Instances can be used as context managers to close the connection.
    """

    def __init__(self):
        import os  # local import: this cell only imports pandas and psycopg2

        # NOTE(security): the fallback values below keep real credentials in
        # source control. Prefer supplying them through the environment and
        # remove (and rotate) the hard-coded defaults once callers are migrated.
        env = os.environ
        host = env.get('NLP_DB_HOST', '134.213.113.101')
        user = env.get('NLP_DB_USER', 'nlp')
        password = env.get('NLP_DB_PASSWORD', 'p2021nlp-psql')
        db = env.get('NLP_DB_NAME', 'nlp-data')
        port = env.get('NLP_DB_PORT', '5432')
        self.connection = psycopg2.connect(
            database=db,
            user=user,
            password=password,
            host=host,
            port=port
        )

    def query(self, db_query):
        """Run ``db_query`` and return the result as a DataFrame.

        :param db_query: SQL text passed to ``pandas.read_sql_query``
        :return: DataFrame with the query result
        """
        print('Quierying database...')
        df = pd.read_sql_query(db_query, self.connection)
        print('Database queried!')
        return df

    def close(self):
        """Close the underlying database connection."""
        self.connection.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Always release the connection; exceptions are not suppressed.
        self.close()
        return False


In [None]:
!mkdir data

In [None]:
import pandas as pd
import re
from langdetect import detect


class DataLoader:
    """Load the Big Five personality texts and scores from the database."""

    def __init__(self):
        self.connector = Connector()

    def parse_input(self):
        """Query the database and build the model inputs.

        Rows whose ``input_text`` is not detected as English are dropped, as
        are rows on which language detection fails. Links (``http...``) are
        stripped from the remaining texts. A ``data`` directory is created in
        the working directory if it does not exist.

        :return: tuple ``(X, y)`` where ``X`` is a list of cleaned text strings
            and ``y`` is a numpy array with the five ``big5_*`` columns, one
            row per text
        """
        import os  # local import: this cell does not import os itself
        from langdetect import LangDetectException

        def is_english(text):
            # detect() raises LangDetectException for texts without usable
            # features (e.g. only digits or punctuation); treat those as
            # non-English instead of aborting the whole load.
            try:
                return detect(text) == 'en'
            except LangDetectException:
                return False

        db_query = '''SELECT big5_openness, big5_conscientiousness, big5_extraversion, big5_agreeableness, big5_neuroticism, input_text  FROM data_personality_analiser_nlp where input_text IS NOT NULL and input_text <> '' '''
        df = self.connector.query(db_query)
        # astype(bool) keeps the mask boolean even when the frame is empty.
        english_mask = df.input_text.apply(is_english).astype(bool)
        df = df[english_mask]
        os.makedirs('data', exist_ok=True)
        y = df[['big5_openness', 'big5_conscientiousness', 'big5_extraversion', 'big5_agreeableness',
                'big5_neuroticism']].to_numpy()

        link_pattern = re.compile(r'http\S+')
        X = [link_pattern.sub('', text) for text in df['input_text'].astype(str)]  # remove links
        return X, y


In [None]:
# RESTRICT TENSORFLOW TO 12GB OF GPU RAM
# SO THAT WE HAVE GPU RAM FOR RAPIDS CUML KNN
# LIMIT is expressed in GB; it is multiplied by 1024 below before being passed
# as memory_limit (TensorFlow documents memory_limit in MB).
LIMIT = 12
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    # Only the first physical GPU is capped; it is exposed as a single
    # virtual device with the given memory limit.
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*LIMIT)])
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # NOTE(review): presumably raised when the GPU has already been
    # initialized before this cell runs; the error is only printed - confirm.
    print(e)
# These two messages are printed unconditionally, even when no GPU was found
# or the configuration call failed.
print('Restrict TensorFlow to max %iGB GPU RAM'%LIMIT)
# NOTE(review): 16-LIMIT presumably assumes a 16 GB GPU - confirm for the
# hardware in use.
print('so RAPIDS can use %iGB GPU RAM'%(16-LIMIT))

In [None]:
# from models.base_pipeline import Basepipeline
# from models.bert_model import BertModel
# from models.catboost_regr import CatboostRegr
# from models.neuralnet_multi import NeuralNetMulti
# from models.data_loader import DataLoader
# from models.tfidf_model import TfidfModel
# from models.scores import rmse
# from models.scores import N_distance


from sklearn.model_selection import train_test_split
import numpy as np


def main():
    """Train the tf-idf + PCA + neural-net pipeline and print its test scores.

    Loads the data, holds out 20% for testing (``random_state=1``), fits and
    saves the pipeline to ``tfidf_pca_nn_full/``, then prints the shape of the
    test targets, the 10-distance count and the RMSE on the held-out set.
    """
    texts, targets = DataLoader().parse_input()
    X_train, X_test, y_train, y_test = train_test_split(
        texts, targets, test_size=0.2, random_state=1
    )

    pipeline = Basepipeline(TfidfModel, NeuralNetMulti)
    pipeline.fit(X_train, y_train)
    # The model is persisted before it is evaluated.
    pipeline.save('tfidf_pca_nn_full/')
    predictions = pipeline.predict(X_test)

    # Compute every metric first, then print the report in one go.
    report = (
        ('Shape of prediction set size: ', y_test.shape),
        ('10_distance: ', N_distance(y_test, predictions, 10)),
        ('rmse: ', rmse(y_test, predictions)),
    )
    print('Results: ')
    for label, value in report:
        print(label, value)


if __name__ == '__main__':
    main()


In [None]:
# # RESTRICT TENSORFLOW TO 12GB OF GPU RAM
# # SO THAT WE HAVE GPU RAM FOR RAPIDS CUML KNN
# LIMIT = 12
# gpus = tf.config.experimental.list_physical_devices('GPU')
# if gpus:
#   try:
#     tf.config.experimental.set_virtual_device_configuration(
#         gpus[0],
#         [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*LIMIT)])
#     logical_gpus = tf.config.experimental.list_logical_devices('GPU')
#     print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
#   except RuntimeError as e:
#     print(e)
# print('Restrict TensorFlow to max %iGB GPU RAM'%LIMIT)
# print('so RAPIDS can use %iGB GPU RAM'%(16-LIMIT))

In [None]:
# import cudf, cuml, cupy
# from cuml.feature_extraction.text import TfidfVectorizer
# from cuml.neighbors import NearestNeighbors
# print('RAPIDS',cuml.__version__)

In [None]:
# model = TfidfVectorizer(stop_words='english', binary=True)
# text_embeddings = model.fit_transform(train_gf.title).toarray()
# print('text embeddings shape is',text_embeddings.shape)

In [None]:
# model = EfficientNetB0(weights='imagenet',include_top=False, pooling='avg', input_shape=None)
# train_gen = DataGenerator(train, batch_size=128)
# image_embeddings = model.predict(train_gen,verbose=1)
# print('image embeddings shape is',image_embeddings.shape)