In [None]:
from google.colab import drive
drive.mount('/content/drive')

!pip install faker
!pip install sentence-transformers
!pip install faiss-gpu


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import random
from faker import Faker
from datetime import datetime, timedelta
import json

fake = Faker()

def generate_log_entry():
    ip_address = fake.ipv4()
    timestamp = datetime.now() - timedelta(days=random.randint(0, 30))
    url = fake.url()
    http_method = random.choice(['GET', 'POST'])
    user_agent = fake.user_agent()
    response_code = random.choice([200, 404, 500])

    log_entry = {
        "ip_address": ip_address,
        "timestamp": timestamp.strftime('%d/%b/%Y:%H:%M:%S'),
        "request": f"{http_method} {url} HTTP/1.1",
        "status": response_code,
        "user_agent": user_agent
    }
    return log_entry

def generate_log_file(filename, num_entries):
    log_data = [generate_log_entry() for _ in range(num_entries)]
    try:
        with open(filename, 'w') as file:
            json.dump(log_data, file, indent=4)
        print(f"Log verileri '{filename}' dosyasına başarıyla yazıldı.")
    except Exception as e:
        print(f"Bir hata oluştu: {e}")

generate_log_file('/content/drive/My Drive/qa_project/web_traffic_logs.json', 1000)


Log verileri '/content/drive/My Drive/qa_project/web_traffic_logs.json' dosyasına başarıyla yazıldı.


In [None]:
pip install datasets



In [None]:
import json
from sklearn.model_selection import train_test_split
from datasets import Dataset

log_file_path = '/content/drive/My Drive/qa_project/web_traffic_logs.json'
with open(log_file_path, 'r') as file:
    log_data = json.load(file)

def create_dataset_from_logs(log_data):
    inputs = []
    targets = []

    for entry in log_data:
        request = entry["request"]
        url = request.split(' ')[1]

        inputs.append(request)
        targets.append(url)

    return inputs, targets

inputs, targets = create_dataset_from_logs(log_data)

train_inputs, eval_inputs, train_targets, eval_targets = train_test_split(
    inputs, targets, test_size=0.2, random_state=42
)

train_data = {"input_text": train_inputs, "target_text": train_targets}
eval_data = {"input_text": eval_inputs, "target_text": eval_targets}

train_dataset = Dataset.from_dict(train_data)
eval_dataset = Dataset.from_dict(eval_data)


In [None]:
from transformers import T5Tokenizer

t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")

def preprocess_function(examples):
    inputs = examples['input_text']
    targets = examples['target_text']
    model_inputs = t5_tokenizer(inputs, max_length=512, truncation=True, padding='max_length')
    labels = t5_tokenizer(targets, max_length=128, truncation=True, padding='max_length')
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

processed_train_dataset = train_dataset.map(preprocess_function, batched=True)
processed_eval_dataset = eval_dataset.map(preprocess_function, batched=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
from transformers import T5Tokenizer

t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [41]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments, DataCollatorForSeq2Seq

t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")
t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
)

data_collator = DataCollatorForSeq2Seq(tokenizer=t5_tokenizer, model=t5_model)

trainer = Trainer(
    model=t5_model,
    args=training_args,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_eval_dataset,
    tokenizer=t5_tokenizer,
    data_collator=data_collator,
)

train_results = trainer.train()
eval_results = trainer.evaluate()

with open('/content/drive/My Drive/qa_project/evaluation_results.txt', 'w') as f:
    f.write(f"Değerlendirme Sonuçları: {eval_results}\n")

t5_model.save_pretrained('/content/drive/My Drive/qa_project/saved_model')
t5_tokenizer.save_pretrained('/content/drive/My Drive/qa_project/saved_model')

print("Model eğitimi tamamlandı ve sonuçlar kaydedildi.")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Epoch,Training Loss,Validation Loss
1,No log,0.125089


Epoch,Training Loss,Validation Loss
1,No log,0.125089
2,No log,0.008537
3,0.800300,0.001067


Model eğitimi tamamlandı ve sonuçlar kaydedildi.


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

t5_tokenizer = T5Tokenizer.from_pretrained('/content/drive/My Drive/qa_project/saved_model')
t5_model = T5ForConditionalGeneration.from_pretrained('/content/drive/My Drive/qa_project/saved_model')


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

index = faiss.read_index('/content/drive/My Drive/qa_project/log_index.faiss')

model = SentenceTransformer('all-MiniLM-L6-v2')

with open('/content/drive/My Drive/qa_project/web_traffic_logs.json', 'r') as file:
    log_data = json.load(file)

log_texts = [entry["request"] for entry in log_data]
log_vectors = model.encode(log_texts)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [42]:
import torch

def answer_question(question):
    query_vector = model.encode([question])

    D, I = index.search(query_vector, k=5)

    nearest_logs = [log_texts[i] for i in I[0]]

    input_text = f"Question: {question} Context: {' '.join(nearest_logs)}"

    inputs = t5_tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    t5_model.to(device)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    outputs = t5_model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=100,
        num_beams=5,
        early_stopping=True
    )

    answer = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

question = "ip_address ne http://smith.biz/ HTTP/1.1"
answer = answer_question(question)
print(f"Soru: {question}")
print(f"Yanıt: {answer}")


Soru: ip_address ne http://smith.biz/ HTTP/1.1
Yanıt: http://lewis-alvarez.com/


In [None]:
ls -l drive/MyDrive/qa_project/

total 2678
-rw------- 1 root root     171 Aug 20 18:55 evaluation_results.txt
-rw------- 1 root root  307245 Aug 19 19:04 faiss_index.index
-rw------- 1 root root  195970 Aug 19 20:07 ikinci_asama_rag.ipynb
-rw------- 1 root root 1536045 Aug 20 18:46 log_index.faiss
-rw------- 1 root root   37824 Aug 20 19:01 log_olusturma.ipynb
-rw------- 1 root root     434 Aug 20 18:45 model_results.txt
-rw------- 1 root root   47420 Aug 13 14:32 rag.ipynb
drwx------ 2 root root    4096 Aug 20 18:55 [0m[01;34msaved_model[0m/
-rw------- 1 root root  311552 Aug 16 16:10 structured_logs.json
-rw------- 1 root root  298565 Aug 20 18:51 web_traffic_logs.json
