Skip to content

Commit

Permalink
I love php forever
Browse files Browse the repository at this point in the history
  • Loading branch information
pyama86 committed Apr 11, 2023
0 parents commit 3b9807b
Show file tree
Hide file tree
Showing 7 changed files with 256 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
*csv
*json
8 changes: 8 additions & 0 deletions Gemfile
@@ -0,0 +1,8 @@
# frozen_string_literal: true

source 'https://rubygems.org'

gem 'faraday-http-cache'
gem 'octokit'
gem 'parallel'
gem 'thor'
33 changes: 33 additions & 0 deletions Gemfile.lock
@@ -0,0 +1,33 @@
GEM
remote: https://rubygems.org/
specs:
addressable (2.8.4)
public_suffix (>= 2.0.2, < 6.0)
faraday (2.7.4)
faraday-net_http (>= 2.0, < 3.1)
ruby2_keywords (>= 0.0.4)
faraday-http-cache (2.4.1)
faraday (>= 0.8)
faraday-net_http (3.0.2)
octokit (6.1.1)
faraday (>= 1, < 3)
sawyer (~> 0.9)
parallel (1.22.1)
public_suffix (5.0.1)
ruby2_keywords (0.0.5)
sawyer (0.9.2)
addressable (>= 2.3.5)
faraday (>= 0.17.3, < 3)
thor (1.2.1)

PLATFORMS
arm64-darwin-21

DEPENDENCIES
faraday-http-cache
octokit
parallel
thor

BUNDLED WITH
2.4.4
2 changes: 2 additions & 0 deletions requirements.txt
@@ -0,0 +1,2 @@
google-cloud-dlp>=3.12.1
llama-index
112 changes: 112 additions & 0 deletions run.py
@@ -0,0 +1,112 @@
import google.cloud.dlp
from google.cloud.dlp import CharsToIgnore
from llama_index import GPTSimpleVectorIndex, SimpleDirectoryReader
import os
import sys
import csv
import codecs
import tempfile
import argparse

def _str_to_bool(value):
    """Convert a command-line token into a real boolean.

    ``type=bool`` does not work with argparse: ``bool("False")`` is ``True``
    because every non-empty string is truthy, so the option could never be
    turned off by passing a value.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised
            true/false spelling.
    """
    normalized = value.strip().lower()
    if normalized in ("1", "true", "t", "yes", "y", "on"):
        return True
    if normalized in ("", "0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got {!r}".format(value)
    )


def get_args():
    """Parse the command-line options of the indexing script.

    Options:
        --project: optional string, ``None`` when omitted.
        --mask: boolean, ``False`` when omitted. Accepts an explicit value
            (``--mask True`` / ``--mask false``) or no value at all
            (``--mask``), which means ``True``.

    Returns:
        The ``argparse.Namespace`` with ``project`` and ``mask`` attributes.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("--project", type=str, required=False)
    parser.add_argument(
        "--mask",
        type=_str_to_bool,
        nargs="?",    # the value is optional ...
        const=True,   # ... and a bare ``--mask`` switches masking on
        default=False,
    )
    return parser.parse_args()

def mask_content(project, input_str, info_types, masking_character=None, number_to_mask=0, ignore_commpn=None):
    """De-identify ``input_str`` with a character mask via the DLP client.

    Args:
        project: Project id; the request parent is ``projects/<project>``.
        input_str: Text sent as the item value.
        info_types: Iterable of infoType names to inspect for.
        masking_character: Forwarded as ``masking_character`` of the
            character mask config.
        number_to_mask: Forwarded as ``number_to_mask`` of the character
            mask config.
        ignore_commpn: Forwarded as ``common_characters_to_ignore``.

    Returns:
        ``item.value`` of the ``deidentify_content`` response.
    """
    client = google.cloud.dlp_v2.DlpServiceClient()

    # One {"name": ...} entry per requested infoType.
    wanted_types = []
    for type_name in info_types:
        wanted_types.append({"name": type_name})

    char_mask = {
        "masking_character": masking_character,
        "number_to_mask": number_to_mask,
        "characters_to_ignore": [
            {"common_characters_to_ignore": ignore_commpn},
        ],
    }
    transformation = {
        "primitive_transformation": {"character_mask_config": char_mask},
    }

    request = {
        "parent": "projects/{}".format(project),
        "deidentify_config": {
            "info_type_transformations": {"transformations": [transformation]},
        },
        "inspect_config": {"info_types": wanted_types},
        "item": {"value": input_str},
    }

    result = client.deidentify_content(request=request)
    return result.item.value

# DLP infoTypes that are masked before a cell is written out for indexing.
_MASK_INFO_TYPES = [
    'PERSON_NAME',
    'EMAIL_ADDRESS',
    'PHONE_NUMBER',
    'CREDIT_CARD_NUMBER',
    'LOCATION',
    'MALE_NAME',
    'FEMALE_NAME',
    'AUTH_TOKEN',
    'AWS_CREDENTIALS',
    'BASIC_AUTH_HEADER',
    'GCP_API_KEY',
    'ENCRYPTION_KEY',
    'GCP_CREDENTIALS',
    'OAUTH_CLIENT_SECRET',
    'PASSWORD',
    'JAPAN_BANK_ACCOUNT',
]


def main():
    """Turn ``contents.csv`` into text files and build ``index.json`` from them.

    Steps:
        1. Exit with status 1 when ``OPENAI_API_KEY`` is unset, or when
           ``--mask`` is on but ``--project`` is missing.
        2. Write every non-empty CSV cell to ``<tmpdir>/<n>.txt``, passing it
           through ``mask_content`` first when ``--mask`` is on.
        3. Load the directory with ``SimpleDirectoryReader``, build a
           ``GPTSimpleVectorIndex`` and save it to ``index.json``.

    The temporary directory is printed and left on disk.
    """
    args = get_args()
    if os.environ.get('OPENAI_API_KEY') is None:
        print('Please set OPENAI_API_KEY')
        sys.exit(1)

    project = None
    if args.mask:
        if args.project is None:
            print('Please set project id')
            sys.exit(1)
        project = args.project
        print('GCP_PROJECT is {}'.format(project))

    filename = 'contents.csv'
    file_cnt = 0
    tmpdir = tempfile.mkdtemp()
    print("data directory: {}".format(tmpdir))

    # newline='' is what the csv module requires for file objects; without it
    # line endings embedded in quoted fields are rewritten while reading.
    # NOTE(review): UTF-8 is assumed because the file holds non-ASCII text
    # written by run.rb -- confirm if the CSV comes from elsewhere.
    with open(filename, 'r', newline='', encoding='utf-8') as src:
        try:
            for cells in csv.reader(src):
                for cell in cells:
                    if not cell:
                        # Nothing to mask or index in an empty cell.
                        continue
                    if args.mask:
                        data = mask_content(
                            project,
                            cell,
                            _MASK_INFO_TYPES,
                            masking_character='*',
                            ignore_commpn=CharsToIgnore.CommonCharsToIgnore.PUNCTUATION.value,
                        )
                    else:
                        data = cell
                    file_dst = os.path.join(tmpdir, '{}.txt'.format(file_cnt))
                    file_cnt += 1
                    # A distinct name (the input handle is ``src``) and a
                    # ``with`` block so the file is closed even if the
                    # write raises.
                    with open(file_dst, 'w') as dst:
                        dst.write(data)
        except Exception as e:
            # Best effort, as before: report the failure and index whatever
            # was written up to this point.
            print(e)

    documents = SimpleDirectoryReader(tmpdir).load_data()
    print('start indexing')
    index = GPTSimpleVectorIndex(documents)
    index.save_to_disk('index.json')
    print('finish indexing')


if __name__ == "__main__":
    main()
71 changes: 71 additions & 0 deletions run.rb
@@ -0,0 +1,71 @@
require 'parallel'
require 'thor'
require 'octokit'
require 'csv'
require 'faraday-http-cache'

STDOUT.sync = true

# Thor command-line tool that appends text taken from a GitHub repository
# (issue / pull request bodies and comments, or decoded file contents) to
# contents.csv, one value per row.
class AnyGithubLoader < Thor
  desc 'load', 'Load github repository e.g. ruby run.rb load -t issue,pr,content -e README.md rails/rails'
  option :types, aliases: '-t', type: :array, desc: 'List in load types(issue, pr, content)', default: %w[issue pr]
  option :file_pattern, aliases: '-e', type: :string, desc: 'File pattern to load', default: 'README.md'
  # Appends every requested kind of text from +repo_name+ to contents.csv.
  def load(repo_name)
    requested_types.each do |type|
      case type
      when 'issue'
        write_to_csv(client.issues(repo_name).map(&:body))
        write_to_csv(client.issues_comments(repo_name).map(&:body))
      when 'pr'
        write_to_csv(client.pull_requests(repo_name).map(&:body))
        write_to_csv(client.pull_requests_comments(repo_name).map(&:body))
      when 'content'
        write_matching_files(repo_name)
      else
        # Previously an unrecognised type was skipped without any feedback.
        warn "unknown type: #{type}"
      end
    end
  end

  no_commands do
    # Types to load. Thor array options are whitespace-separated, so the
    # documented form "-t issue,pr,content" arrives as one string; split on
    # commas so both "-t issue pr" and "-t issue,pr" work.
    def requested_types
      options[:types].flat_map { |type| type.split(',') }.map(&:strip).reject(&:empty?)
    end

    # Writes the decoded content of every blob on the default branch whose
    # path matches the --file_pattern regular expression.
    def write_matching_files(repo_name)
      # Built once instead of once per tree entry.
      pattern = Regexp.new(options[:file_pattern])
      default_branch = client.repo(repo_name).default_branch

      target_files = client.tree(
        repo_name,
        default_branch,
        recursive: true
      )[:tree].select { |obj| obj[:type] == 'blob' && pattern.match?(obj[:path]) }

      target_files.each_slice(15) do |files|
        write_to_csv(Parallel.map(files, in_threads: 2) do |f|
          puts "target file: #{f[:path]}"
          Base64.decode64(client.contents(repo_name, ref: default_branch, path: f[:path])[:content])
        end)
      end
    end

    # Memoized Octokit client (auto-paginating, 100 items per page).
    # GITHUB_API overrides the endpoint; GITHUB_TOKEN supplies the token.
    def client
      unless @_octokit
        @_octokit = Octokit::Client.new(
          api_endpoint: ENV['GITHUB_API'] || 'https://api.github.com',
          access_token: ENV['GITHUB_TOKEN'],
          auto_paginate: true,
          per_page: 100
        )
        stack = Faraday::RackBuilder.new do |builder|
          builder.use Faraday::HttpCache, serializer: Marshal, shared_cache: false
          builder.use Octokit::Response::RaiseError
          builder.adapter Faraday.default_adapter
        end
        # NOTE(review): this is assigned after the client above was built, so
        # the cache stack presumably does not apply to @_octokit -- confirm
        # against Octokit's configuration handling before relying on it.
        Octokit.middleware = stack
      end
      @_octokit
    end

    # Appends +contents+ to contents.csv, one value per row. Called once per
    # batch so the whole result set is never held in memory at once.
    def write_to_csv(contents)
      CSV.open('contents.csv', 'a') do |csv|
        contents.each do |content|
          csv << [content]
        end
      end
    end
  end
end

AnyGithubLoader.start(ARGV)
28 changes: 28 additions & 0 deletions try.py
@@ -0,0 +1,28 @@
from llama_index import GPTSimpleVectorIndex,LLMPredictor
from langchain import OpenAI, PromptTemplate

# Request sent to the index (Japanese): write an article about the PHP
# conference.
question = "PHPカンファレンスについての記事を執筆してください。"

# Prompt skeleton (Japanese). Role: technical-blog writer. Instructions:
# answer in Japanese, about 3000 characters, say so when unsure, use
# Markdown. ``{question}`` is filled in below.
template = """
## 役割
あなたは技術ブログのライターです。
## 指示
- 日本語で答えてください。
- 文字数は3000字程度で執筆してください
- わからないときはわからないと答えてください。
- マークダウン形式で執筆してください。
## 質問
{question}
"""
prompt = PromptTemplate(template=template, input_variables=['question'])
rendered_prompt = prompt.format(question=question)

# Load the index saved as ./index.json, answering through a gpt-4 predictor
# at temperature 0.
llm = OpenAI(temperature=0, model_name="gpt-4")
llm_predictor = LLMPredictor(llm=llm)
index = GPTSimpleVectorIndex.load_from_disk(
    save_path='./index.json',
    llm_predictor=llm_predictor,
)

response = index.query(rendered_prompt)
print(response)

0 comments on commit 3b9807b

Please sign in to comment.