In [1]:
import sys
import os
import numpy as np
import zipfile
from tqdm import tqdm
import scrapbook as sb
from tempfile import TemporaryDirectory
# import tensorflow as tf

from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources 
from recommenders.models.newsrec.newsrec_utils import prepare_hparams
from recommenders.models.newsrec.models.nrms import NRMS
from recommenders.models.newsrec.io.mind_iterator import MINDIterator
from recommenders.models.newsrec.io.mind_all_iterator import MINDAllIterator
from recommenders.models.newsrec.newsrec_utils import get_mind_data_set

import torch
import torch.nn as nn
import torch.nn.functional as F

# Print the Python interpreter version so the recorded output identifies the environment.
print("System version: {}".format(sys.version))
# The TensorFlow import above is commented out, so its version print stays disabled too.
# print("Tensorflow version: {}".format(tf.__version__))

System version: 3.8.16 (default, Mar  2 2023, 03:18:16) [MSC v.1916 64 bit (AMD64)]


In [2]:
# Scratch cell: prints a constant and assigns nothing, so no other cell depends on it.
print(1)

1


In [3]:
# Scratch: the second entry is itself a list, so the result is a nested list
# (one empty string followed by the list ['hi']).
news_title = [""] + [['hi']]
news_title

['', ['hi']]

In [4]:
import re
def word_tokenize(sent):
    """Tokenize a sentence into lower-cased words and punctuation marks.

    Args:
        sent (str): Sentence to tokenize.

    Return:
        list: Tokens in order of appearance; an empty list when ``sent``
        is not a string.
    """
    # Guard clause: anything that is not a str yields no tokens.
    if not isinstance(sent, str):
        return []
    # A token is either a run of word characters or one of . , ! ? ; |
    return re.findall(r"[\w]+|[.,!?;|]", sent.lower())


word_tokenize('HI MY NAME IS')

['hi', 'my', 'name', 'is']

In [5]:
import numpy as np
# Scratch: indexing with a list of row numbers returns those rows in the listed order.
a2d = np.arange(1, 7).reshape(2, 3)
a2d[[1, 0]]

array([[4, 5, 6],
       [1, 2, 3]])

In [6]:
import numpy as np

In [7]:
import random

# Run configuration shared by the cells below.
epochs = 5
seed = 42
batch_size = 4

# `seed` used to be defined but never applied. Seed every random number
# generator the notebook has in scope (Python's `random`, NumPy, PyTorch) so
# that anything drawn from them is repeatable after Restart & Run All.
# `np` and `torch` come from the notebook's import cell.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Options: demo, small, large
MIND_type = 'demo'

In [8]:
import os

# Data directory, relative to the working directory the notebook runs in.
# Built from components rather than the literal "recommenders\data": "\d" is an
# invalid string escape, and a hard-coded backslash is a path separator only on
# Windows.
directory = os.path.join("recommenders", "data")

# Specify the current path
current_path = os.getcwd()

# Full path
data_path = os.path.join(current_path, directory)

# Create the directory (and any missing parents). exist_ok=True replaces the
# separate existence check and keeps the cell safe to re-run.
os.makedirs(data_path, exist_ok=True)

print(f"Data path is set to: {data_path}")


Data path is set to: C:\Users\Rija Farooqui\Desktop\news_recommendation\recommenders\recommenders\data


In [9]:
# Train and validation splits each hold the same two files.
train_news_file, train_behaviors_file = (
    os.path.join(data_path, "train", fname) for fname in ("news.tsv", "behaviors.tsv")
)
valid_news_file, valid_behaviors_file = (
    os.path.join(data_path, "valid", fname) for fname in ("news.tsv", "behaviors.tsv")
)

# Shared resources all live under <data_path>/utils; the target names and the
# file names below are matched by position.
(
    wordEmb_file,
    userDict_file,
    wordDict_file,
    vertDict_file,
    subvertDict_file,
    yaml_file,
) = (
    os.path.join(data_path, "utils", fname)
    for fname in (
        "embedding.npy",
        "uid2index.pkl",
        "word_dict.pkl",
        "vert_dict.pkl",
        "subvert_dict.pkl",
        "naml.yaml",
    )
)

# Resolve the download URL and archive names for the selected MIND variant.
mind_url, mind_train_dataset, mind_dev_dataset, mind_utils = get_mind_data_set(MIND_type)

In [10]:
# import pickle

# # specify the path to your .pkl file
# file_path = wordEmb_file

# # open and read the pickle file
# with open(file_path, 'rb') as f:
#     data = pickle.load(f)

# # print or inspect the content of the pickle file
# print(data)


In [11]:
# Each resource is fetched only when its marker file is absent, so re-running
# the cell does not download again.
if not os.path.exists(train_news_file):
    download_deeprec_resources(
        mind_url,
        os.path.join(data_path, "train"),
        mind_train_dataset,
    )

if not os.path.exists(valid_news_file):
    download_deeprec_resources(
        mind_url,
        os.path.join(data_path, "valid"),
        mind_dev_dataset,
    )

# The utils archive is requested from a fixed URL rather than from mind_url.
if not os.path.exists(yaml_file):
    download_deeprec_resources(
        "https://recodatasets.z20.web.core.windows.net/newsrec/",
        os.path.join(data_path, "utils"),
        mind_utils,
    )

In [12]:
# Display the resolved path of the hyper-parameter YAML file.
yaml_file

'C:\\Users\\Rija Farooqui\\Desktop\\news_recommendation\\recommenders\\recommenders\\data\\utils\\naml.yaml'

In [13]:
# Build the hyper-parameter object from the YAML file. The keyword arguments
# pass in the resource paths plus the batch size and epoch count chosen in the
# config cell; show_step is fixed at 10 here.
# NOTE(review): presumably keyword values override same-named YAML entries --
# confirm in prepare_hparams.
# NOTE(review): the YAML loaded is naml.yaml (the printed hparams report
# model_type 'naml'), while a later cell instantiates NRMS -- confirm this
# pairing is intended.
hparams = prepare_hparams(yaml_file, 
                          wordEmb_file=wordEmb_file,
                          wordDict_file=wordDict_file, 
                          userDict_file=userDict_file,
                          vertDict_file=vertDict_file,
                          subvertDict_file=subvertDict_file,
                          batch_size=batch_size,
                          epochs=epochs,
                          show_step=10)
print(hparams)

HParams object with values {'support_quick_scoring': True, 'dropout': 0.2, 'attention_hidden_dim': 200, 'head_num': 4, 'head_dim': 100, 'filter_num': 400, 'window_size': 3, 'vert_emb_dim': 100, 'subvert_emb_dim': 100, 'gru_unit': 400, 'type': 'ini', 'user_emb_dim': 50, 'learning_rate': 0.0001, 'optimizer': 'adam', 'epochs': 5, 'batch_size': 4, 'show_step': 10, 'title_size': 30, 'body_size': 50, 'his_size': 50, 'vert_num': 17, 'subvert_num': 249, 'cold_his_size': 10, 'data_format': 'naml', 'npratio': 4, 'metrics': ['group_auc', 'mean_mrr', 'ndcg@5;10'], 'embedding_type': 'bert-base-uncased', 'word_emb_dim': 768, 'cnn_activation': 'relu', 'model_type': 'naml', 'dense_activation': 'relu', 'loss': 'cross_entropy_loss', 'wordEmb_file': 'C:\\Users\\Rija Farooqui\\Desktop\\news_recommendation\\recommenders\\recommenders\\data\\utils\\embedding.npy', 'wordDict_file': 'C:\\Users\\Rija Farooqui\\Desktop\\news_recommendation\\recommenders\\recommenders\\data\\utils\\word_dict.pkl', 'userDict_file

In [14]:
# Display the metric names; the evaluation section later passes this list to cal_metric.
hparams.metrics

['group_auc', 'mean_mrr', 'ndcg@5;10']

In [15]:
import pickle
def load_dict(file_path):
    """Deserialize and return the object stored in a pickle file.

    Args:
        file_path (str): Location of the pickle file.

    Returns:
        object: Whatever object the file contains.
    """
    # NOTE: unpickling can execute arbitrary code; only use on trusted files.
    with open(file_path, "rb") as handle:
        loaded = pickle.load(handle)
    return loaded
# Load the two pickled objects whose paths are stored on hparams.
# Going by the file names (uid2index.pkl, word_dict.pkl) these are presumably
# user-id and word lookup dictionaries -- verify against the files.
# NOTE(review): neither name is referenced by any other cell visible in this notebook.
uid2index = load_dict(hparams.userDict_file)
word_dict = load_dict(hparams.wordDict_file)

In [16]:
# Alias the iterator class so both iterators below are built from the same type.
iterator = MINDAllIterator
# The training iterator is given hparams.npratio (4 in the printed hparams);
# the test iterator is built without it.
train_iterator = iterator(hparams, hparams.npratio, col_spliter="\t")
test_iterator = iterator(hparams, col_spliter="\t")
# NOTE(review): train_batches / test_batches are not used by any later visible
# cell -- the fit and evaluate calls receive the iterators and file paths directly.
train_batches = train_iterator.load_data_from_file(train_news_file, train_behaviors_file)
test_batches = test_iterator.load_data_from_file(valid_news_file, valid_behaviors_file)

In [17]:
# Override head_dim before the model is built: the hparams printed earlier show head_dim=100.
# NOTE(review): this mutates hparams in place after it was displayed, so that
# printout is stale; consider setting the value where hparams is created.
hparams.head_dim  = 15

In [18]:
def _init_embedding(file_path):
    """Load pre-trained embeddings as a constant tensor.

    Args:
        file_path (str): the pre-trained glove embeddings file path.

    Returns:
        numpy.ndarray: A constant numpy array.
    """

    return np.load(file_path)


# Load the saved embedding array and copy it into a torch tensor in one step,
# so the name is bound once instead of being rebound from ndarray to tensor.
word2vec_embedding = torch.tensor(_init_embedding(hparams.wordEmb_file))
# Loss object; or any other suitable loss function.
criterion = nn.CrossEntropyLoss()
model = NRMS(hparams, word2vec_embedding)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
import torch.optim as optim
# Prefer the GPU when CUDA is usable; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# device = 'cpu'
model = model.to(device)
# Adam over all model parameters, with the learning rate taken from hparams.
optimizer = optim.Adam(model.parameters(), lr=hparams.learning_rate)
device

device(type='cuda')

In [20]:
# from transformers import BertModel, BertTokenizer

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained('bert-base-uncased')
# tokenizer(['hello my name is', 'qazi saad'])

In [21]:
# Scratch: join each entry into a single string, wrap the result in an array,
# then reorder it by indexing with a list of positions.
x = np.array([' '.join(n) for n in ['s', ['hi', 'my']]])
np.asarray(x[[1, 0]])

array(['hi my', 's'], dtype='<U5')

In [22]:
%%time
# Run training on the train news/behaviors files with the optimizer built above;
# the %%time magic reports how long the cell took.
# NOTE(review): `criterion` defined earlier is not passed here -- verify whether
# fit_model computes its own loss.
model.fit_model(train_iterator, train_news_file, train_behaviors_file, optimizer)

Epoch 1: 0it [00:00, ?it/s]

embeddings torch.Size([200, 30, 768]) <class 'torch.Tensor'>
embeddings torch.Size([20, 30, 768]) <class 'torch.Tensor'>


Epoch 1: 1it [00:07,  7.40s/it, Loss=1.61, Accuracy=0.25]

embeddings torch.Size([200, 30, 768]) <class 'torch.Tensor'>


Epoch 1: 1it [00:09,  9.35s/it, Loss=1.61, Accuracy=0.25]


In [23]:
 %%time
# Evaluate on the validation files. The first returned value is discarded;
# `labels` and `preds` are passed to cal_metric in the following cell.
# NOTE(review): the magic line above carries a leading space, and assigning to `_`
# replaces IPython's automatic last-output variable -- consider a named throwaway.
_, labels, preds = model.evaluate_model(test_iterator, valid_news_file, valid_behaviors_file)

Evaluate: 0it [00:00, ?it/s]

news_title [''
 'the brands queen elizabeth , prince charles , and prince philip swear by'
 'the cost of trump s aid freeze in the trenches of ukraine s war' ...
 'marvel s kevin feige breaks silence on scorsese attack it s unfortunate exclusive'
 'why kate meghan were on different balconies for remembrance sunday'
 'tennessee judge holds lawyer s baby as he swears her into the state bar in viral video']


Evaluate: 0it [00:01, ?it/s]


In [24]:
from recommenders.models.deeprec.deeprec_utils import cal_metric
# Score the evaluation predictions against the labels for every metric named in hparams.metrics.
res = cal_metric(labels, preds, hparams.metrics)

In [None]:
# Display the computed metrics.
res

In [None]:
import os
import openai

# Read the API key from the environment. A literal key in a notebook is exposed
# to everyone who can read the file or its history; the key that used to be
# embedded in this cell must be treated as compromised and revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")

# gpt-3.5-turbo-0613 is a chat model, so it is called through ChatCompletion
# with the prompt as a single user message instead of through the Completion
# endpoint, which does not serve chat models.
response = openai.ChatCompletion.create(
  model="gpt-3.5-turbo-0613",
  messages=[
        {"role": "user", "content": "Write a tagline for an ice cream shop."}
    ]
)
response

In [None]:
import openai

# Multi-turn chat request: a system message, one completed user/assistant
# exchange, then a follow-up user question. The call is the cell's last
# expression, so the response object is what the cell displays.
# This cell sets no credentials itself; they must be configured before it runs.
openai.ChatCompletion.create(
  model="gpt-3.5-turbo-0613",
  messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Who won the world series in 2020?"},
        {"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."},
        {"role": "user", "content": "Where was it played?"}
    ]
)

In [None]:
from elasticsearch import Elasticsearch

# Connect to the Elasticsearch node at localhost:9200.
es = Elasticsearch("http://localhost:9200")

# Request body for the analyze API: the tokenizer to apply and the text to run it on.
# NOTE(review): "tokenizer_name" looks like a template placeholder -- replace it
# with a real tokenizer name before running.
tokenizer_config = {
    "tokenizer": "tokenizer_name",
    "text": "Sample text to analyze"
}

# Use the analyze API to inspect the tokenizer
# NOTE(review): 'your_index_name' also looks like a placeholder -- confirm the target index.
analysis_result = es.indices.analyze(index='your_index_name', body=tokenizer_config)

# Print the text of each token in the response, one per line.
for token in analysis_result['tokens']:
    print(token['token'])


In [None]:
from elasticsearch import Elasticsearch

# Connect to the Elasticsearch node at localhost:9200.
es = Elasticsearch("http://localhost:9200")

# Define the index name
index_name = 'sudachi_sample'

# Index settings that register a Sudachi tokenizer and a custom analyzer built on it.
sudachi_config = {
    "settings": {
        "index": {
            "analysis": {
                "tokenizer": {
                    "sudachi_tokenizer": {
                        "type": "sudachi_tokenizer",
                        "split_mode": "C",
                        "discard_punctuation": True,
                        "resources_path": "sudachi"
                    }
                },
                "analyzer": {
                    "sudachi_analyzer": {
                        "filter": [],
                        "tokenizer": "sudachi_tokenizer",
                        "type": "custom"
                    }
                }
            }
        }
    }
}

# Create the index with the Sudachi configuration only when it does not exist
# yet. The creation call used to be commented out, which left sudachi_config
# unused and made the analyze request below depend on an index that a fresh run
# never creates. The existence guard keeps the cell safe to re-run.
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body=sudachi_config)

# Test the analyzer by analyzing a sample text
sample_text = "日本の選挙管理委員会はとても良い"
analysis_result = es.indices.analyze(index=index_name, analyzer="sudachi_analyzer", text=sample_text)

# Collect the text of each token produced by the analyzer and print them as one list.
tokens = [token["token"] for token in analysis_result["tokens"]]
print("Tokens:", tokens)