
Commit

Merge commit 'origin/master'
Scott Chacon committed Sep 3, 2009
2 parents f6e097b + 5776d9b commit f5bfd0b
Showing 5 changed files with 174 additions and 47 deletions.
36 changes: 36 additions & 0 deletions TODO
@@ -0,0 +1,36 @@
TODO
===============

* Web UI
- check key/values in any of the keyspaces
- simple browsing UI
= object API

* Implementation Fixes
= fetches hangup gracefully
= partial packfile cache usage
- re-packcache removes old pack caches (at least the index entries)

* Multi-node Test Environment
- multi-node project loading/updating
- benchmark clones
- x.github.com
- quick node setup
- install java, python
- copy cassandra tarball, conf files, start running it (god?)
- list of small, public projects to seed

* Updating System
- seeding / updating population
- new github api to get all public repos, last update time / since update time
- get new/updated repos (from feed)
- clone/fetch on claiming node
- run update script
- re-packcache occasionally
- keep clone/fetch stats (total time, how many packed/loose objects)

* Large project support
- split large blobs in object space on insert (see the sketch after this list)
- if the client split up blobs, be able to reconstruct and pack them
- split long revtree entries
- reconstruct long revtree entries (possibly done)
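
The blob-splitting items above are still TODOs; as a rough illustration only, here is a minimal sketch of chunking a blob into column-sized pieces on insert and reassembling them on read. The chunk size, key scheme, and helper names (CHUNK, split_blob, join_blob) are hypothetical, not the project's actual format:

    CHUNK = 1024 * 1024  # hypothetical 1 MB per-column limit

    def split_blob(sha, data):
        # yield (column_name, chunk) pairs, e.g. ('<sha>/0', ...), ('<sha>/1', ...)
        for i in range(0, len(data), CHUNK):
            yield ('%s/%d' % (sha, i // CHUNK), data[i:i + CHUNK])

    def join_blob(columns):
        # columns: list of (column_name, chunk); order by the numeric suffix
        columns.sort(key=lambda c: int(c[0].split('/')[1]))
        return ''.join(chunk for name, chunk in columns)
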
128 changes: 118 additions & 10 deletions dulwich/agitmemnon.py
@@ -35,7 +35,8 @@ def __init__(self):
port = 9160
self.keyspace = 'Agitmemnon'
self.memcache = {}

self.revtree = {}

socket = TSocket.TSocket(host, port)
transport = TTransport.TBufferedTransport(socket)
protocol = TBinaryProtocol.TBinaryProtocol(transport)
@@ -78,8 +79,6 @@ def __contains__(self, sha):
return False

    def __getitem__(self, name):
-        if name in self.memcache:
-            return self.memcache[name]
        o = self.get_object(name)
        data = ''
        otype = ''
@@ -90,11 +89,35 @@ def __getitem__(self, name):
            if col.name == 'type':
                otype = col.value
        data = zlib.decompress(base64.b64decode(data))
-        shafile = ShaFile.from_raw_string(type_num_map[otype], data)
-        if otype != BLOB_ID: # caching commit/tree/tag objects since they are hit twice
-            self.memcache[name] = shafile
-        return shafile
+        return ShaFile.from_raw_string(type_num_map[otype], data)

def get_revtree_objects(self, sha):
# check for entry in revtree cache
# if it's not there, pull another chunk, check there, loop
# return all the objects included in that commit
if sha in self.revtree:
return self.revtree[sha]
else:
self.load_next_revtree_hunk()
if sha in self.revtree:
return self.revtree[sha]
else:
return False

def load_next_revtree_hunk(self):
if len(self.revtree) > 0: # hack
return False
o = self.get_super('RevTree', self.repo_name, 100000)
nilsha = '0000000000000000000000000000000000000000'
for col in o:
self.revtree[col.name] = []
for sup in col.columns:
objects = sup.value.split(":")
if nilsha in objects:
objects.remove(nilsha)
if '' in objects:
objects.remove('')
self.revtree[col.name].extend(objects)
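
RevTree values are colon-separated SHA lists per commit, and load_next_revtree_hunk strips the all-zero placeholder SHA and empty fragments while loading them. A standalone sketch of that decoding step (parse_revtree_value is an illustrative name, not part of the code above):

    NIL_SHA = '0' * 40

    def parse_revtree_value(value):
        # drop empty fragments and the all-zero placeholder sha,
        # the same filtering load_next_revtree_hunk performs
        return [s for s in value.split(':') if s and s != NIL_SHA]

    print parse_revtree_value('a' * 40 + ':' + NIL_SHA + ':')
    # -> ['aaaa...a']
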

def find_common_revisions(self, graphwalker):
"""Find which revisions this store has in common using graphwalker."""
@@ -108,7 +131,7 @@ def find_common_revisions(self, graphwalker):
return haves

    def find_missing_objects(self, haves, wants, progress=None):
-        return iter(MissingObjectFinder(self, haves, wants, progress).next, None)
+        return iter(AgitMissingObjectFinder(self, haves, wants, progress).next, None)

def iter_shas(self, shas):
"""Iterate over the objects for the specified shas."""
@@ -119,6 +142,37 @@ def fetch_objects(self, determine_wants, graph_walker, progress):
haves = self.find_common_revisions(graph_walker)
return self.iter_shas(self.find_missing_objects(haves, wants, progress))

    def partial_sender(self, objects, f, entries):
        # PackCacheIndex (projectname) [(cache_key) => (list of objects/offset/size), ...]

        sent = set()
        objs = set()
        for sha, path in objects.itershas():
            objs.add(sha)

        index = self.get('PackCacheIndex', self.repo_name)

        # parse the cache index and figure out which caches we can use:
        # a cache qualifies only if every object in it is one we need to send.
        # each entry is "sha:offset:size:base_sha\n"
        for cache in index:
            cacheobjs = set()
            cache_entries = cache.value.split("\n")
            if '' in cache_entries:
                cache_entries.remove('')
            for entry in cache_entries:
                (sha, offset, size, ref) = entry.split(":")
                cacheobjs.add(sha)
            if len(cacheobjs - objs) == 0:
                # pull this cached pack section whole and stream it to the client
                data = self.get_value('PackCache', cache.name, 'data')
                data = base64.b64decode(data)
                f.write(data)
                sent = sent | cacheobjs  # record what was sent so the caller can skip it

        return sent
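
A toy illustration (made-up SHAs and cache names, not real cache data) of the subset test above: a cached pack section is streamed only when it contains no objects the client did not ask for:

    wanted = set(['aaa', 'bbb', 'ccc', 'ddd'])
    caches = {'cache1': set(['aaa', 'bbb']),   # subset of wanted -> streamed
              'cache2': set(['ccc', 'eee'])}   # 'eee' is unwanted -> skipped

    sent = set()
    for name, cacheobjs in caches.items():
        if len(cacheobjs - wanted) == 0:  # the same test partial_sender uses
            sent = sent | cacheobjs
    print sent  # set(['aaa', 'bbb'])
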

def get_refs(self):
"""Get dictionary with all refs."""
print self.repo_name
@@ -128,8 +182,12 @@ def get_refs(self):
x = x.super_column
for col in x.columns:
                if len(col.value) == 40:
-                    ret['refs/' + x.name + '/' + col.name] = col.value
+                    if x.name != 'meta':
+                        ret['refs/' + x.name + '/' + col.name] = col.value
+                    if x.name == 'heads' and col.name == 'master':
+                        if 'HEAD' not in ret:
+                            ret['HEAD'] = col.value
+                    if x.name == 'meta' and col.name == 'HEAD':
+                        ret['HEAD'] = col.value
return ret

@@ -141,16 +199,66 @@ def set_args(self, args):
rname = rname.replace('.git','')
self.repo_name = rname

class AgitMissingObjectFinder(object):
    """Find the objects missing from another object store.

    :param object_store: Object store containing at least all objects to be
        sent
    :param haves: SHA1s of commits not to send (already present in target)
    :param wants: SHA1s of commits to send
    :param progress: Optional function to report progress to.
    """

def __init__(self, object_store, haves, wants, progress=None):
self.sha_done = set(haves)
self.objects_to_send = set([w for w in wants if w not in haves])
self.object_store = object_store
if progress is None:
self.progress = lambda x: None
else:
self.progress = progress

def add_todo(self, entries):
        self.objects_to_send.update([e for e in entries if e not in self.sha_done])

def next(self):
if not self.objects_to_send:
return None
sha = self.objects_to_send.pop()
obs = self.object_store.get_revtree_objects(sha)
if obs:
self.add_todo(obs)
self.sha_done.add(sha)
self.progress("counting objects: %d\r" % len(self.sha_done))
return (sha, sha) # sorry, hack
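
find_missing_objects wraps this next() method with iter(callable, sentinel), so iteration keeps calling next() until it returns None. A standalone toy of that pattern (todo and next_sha are illustrative names only):

    todo = set(['a1', 'b2', 'c3'])

    def next_sha():
        # pop one pending sha per call; returning None ends the iteration
        if todo:
            return todo.pop()
        return None

    for sha in iter(next_sha, None):
        print sha  # each pending sha is visited exactly once
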

class AgitmemnonBackend(Backend):

def __init__(self):
self.repo = Agitmemnon()
self.fetch_objects = self.repo.fetch_objects
self.get_refs = self.repo.get_refs
self.set_args = self.repo.set_args
self.partial_sender = self.repo.partial_sender


a = Agitmemnon()
#a.repo_name = 'fuzed2'
#a.load_next_revtree_hunk()
#print a.revtree

#index = a.get('PackCacheIndex', 'fuzed2')
#myset = set()
#for cache in index:
# print cache.name
# entries = cache.value.split("\n")
# if '' in entries:
# entries.remove('')
# for entry in entries:
# (sha, offset, size, ref) = entry.split(":")
# myset.add(sha)
# print myset

#a = Agitmemnon()
#print a.get_object('7486f4075d2b9307d02e3905c69e28e456a51a32')[0].value
#print a['7486f4075d2b9307d02e3905c69e28e456a51a32'].get_parents()
#print a.get_object('7486f4075d2b9307d02e3905c69e28e456a51a32')
3 changes: 3 additions & 0 deletions dulwich/object_store.py
@@ -421,6 +421,9 @@ def __init__(self, store, sha_iter):
self.sha_iter = sha_iter
self._shas = []

def remove_objects(self, objs):
print "FU"

def __iter__(self):
"""Yield tuple with next object and path."""
for sha, path in self.itershas():
52 changes: 16 additions & 36 deletions dulwich/pack.py
@@ -818,56 +818,36 @@ def write_pack(filename, objects, num_objects):
write_pack_index_v2(filename + ".idx", entries, data_sum)


-def write_pack_data(f, objects, num_objects, window=10):
+def write_pack_data(f, objects, num_objects, window=10, progress=None, backend=None):
    """Write a new pack file.

    :param filename: The filename of the new pack file.
    :param objects: List of objects to write (tuples with object and path)
    :return: List with (name, offset, crc32 checksum) entries, pack checksum
    """
-
-    # this gets a list of all the objects - actual backend walker calls here
-    recency = list(objects)
-
-    # FIXME: Somehow limit delta depth
-    # FIXME: Make thin-pack optional (it's not used when cloning a pack)
-
-    # Build a list of objects ordered by the magic Linus heuristic
-    # This helps us find good objects to diff against
-    magic = []
-    for obj, path in recency:
-        magic.append((obj.type, path, 1, -len(obj.as_raw_string()), obj))
-    magic.sort()
-
-    # Build a map of objects and their index in magic - so we can find preceding
-    # objects to diff against
-    offs = {}
-    for i in range(len(magic)):
-        offs[magic[i][4]] = i
+    if progress is None:
+        progress = lambda x: None

    # Write the pack
    entries = []
    f = SHA1Writer(f)
    f.write("PACK")                         # Pack header
    f.write(struct.pack(">L", 2))           # Pack version
    f.write(struct.pack(">L", num_objects)) # Number of objects in pack
-    for o, path in recency:
-        sha1 = o.sha().digest()
-        orig_t = o.type
-        raw = o.as_raw_string()
-        winner = raw
-        t = orig_t
-        #for i in range(offs[o]-window, window):
-        #    if i < 0 or i >= len(offs): continue
-        #    b = magic[i][4]
-        #    if b.type != orig_t: continue
-        #    base = b.as_raw_string()
-        #    delta = create_delta(base, raw)
-        #    if len(delta) < len(winner):
-        #        winner = delta
-        #        t = 6 if magic[i][2] == 1 else 7
-        offset, crc32 = write_pack_object(f, t, winner)
-        entries.append((sha1, offset, crc32))
+
+    sent = set()
+    if backend and (num_objects > 500):
+        sent = backend.partial_sender(objects, f, entries)
+
+    shas = set()
+    for sha, path in objects.itershas():
+        shas.add(sha)
+
+    for sha in (shas - sent):
+        o = backend.repo[sha]
+        offset, crc32 = write_pack_object(f, o.type, o.as_raw_string())
+        entries.append((o.sha().digest(), offset, crc32))
    return entries, f.write_sha()
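
The rewritten function still emits the standard 12-byte pack header before any objects. A quick standalone check of that format (pack_header is an illustrative helper, not part of dulwich):

    import struct

    def pack_header(num_objects):
        # 4-byte magic, 4-byte version (2), 4-byte big-endian object count
        return "PACK" + struct.pack(">L", 2) + struct.pack(">L", num_objects)

    assert len(pack_header(3)) == 12
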


2 changes: 1 addition & 1 deletion dulwich/server.py
@@ -172,7 +172,7 @@ def next(self):
progress("counting objects: %d, done.\n" % len(objects_iter))
progress("GitHub is collecting your data\n")
        write_pack_data(ProtocolFile(None, write), objects_iter,
-                        len(objects_iter))
+                        len(objects_iter), 10, progress, self.backend)
progress("Have a nice day!\n")
# we are done
self.proto.write("0000")
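
The bare 10 in the call above fills the window parameter positionally; an equivalent call with keyword arguments (a readability sketch, not what the commit ships) would be:

        write_pack_data(ProtocolFile(None, write), objects_iter,
                        len(objects_iter), window=10, progress=progress,
                        backend=self.backend)
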
