{"payload":{"feedbackUrl":"https://github.com/orgs/community/discussions/53140","repo":{"id":705996574,"defaultBranch":"main","name":"gpt-fast","ownerLogin":"pytorch-labs","currentUserCanPush":false,"isFork":false,"isEmpty":false,"createdAt":"2023-10-17T05:30:32.000Z","ownerAvatar":"https://avatars.githubusercontent.com/u/107212512?v=4","public":true,"private":false,"isOrgOwned":true},"refInfo":{"name":"","listCacheKey":"v0:1714944978.0","currentOid":""},"activityList":{"items":[{"before":"30d69b3245a29823e7c4c5ae6a1f48fa38267afd","after":"1095a5c465d5f6af734a8b86e3f7be49ecfc7668","ref":"refs/heads/main","pushedAt":"2024-05-07T21:49:28.000Z","pushType":"push","commitsCount":2,"pusher":{"login":"Chillee","name":"Horace He","path":"/Chillee","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/6355099?s=80&v=4"},"commit":{"message":"Revert quantization additions to something that works on CUDA still","shortMessageHtmlLink":"Revert quantization additions to something that works on CUDA still"}},{"before":"a2aa7d6d7b01ef55c024b5891197c80569f3be83","after":"de06b53a4f95c72cd3abd0a8e9fa2d6913676c1a","ref":"refs/heads/grok1","pushedAt":"2024-05-05T21:42:49.000Z","pushType":"force_push","commitsCount":0,"pusher":{"login":"Chillee","name":"Horace He","path":"/Chillee","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/6355099?s=80&v=4"},"commit":{"message":"Added grok-1 support","shortMessageHtmlLink":"Added grok-1 support"}},{"before":null,"after":"a2aa7d6d7b01ef55c024b5891197c80569f3be83","ref":"refs/heads/grok1","pushedAt":"2024-05-05T21:36:18.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"Chillee","name":"Horace He","path":"/Chillee","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/6355099?s=80&v=4"},"commit":{"message":"Added grok-1 support","shortMessageHtmlLink":"Added grok-1 support"}},{"before":null,"after":"ca0d85075b3cf92ead264da3826c8cc9f0207185","ref":"refs/heads/malfet/set-prec-to-float16","pushedAt":"2024-05-03T18:08:40.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"malfet","name":"Nikita Shulga","path":"/malfet","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2453524?s=80&v=4"},"commit":{"message":"[WIP] Set precision to float16","shortMessageHtmlLink":"[WIP] Set precision to float16"}},{"before":"c21a88962b02ee54b74999078e81be0fd24ac2af","after":"30d69b3245a29823e7c4c5ae6a1f48fa38267afd","ref":"refs/heads/main","pushedAt":"2024-04-29T21:02:52.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"Chillee","name":"Horace He","path":"/Chillee","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/6355099?s=80&v=4"},"commit":{"message":"llama3 8B support, tiktoken tokenizer (#158)\n\n* WIP: llama3 support, tiktoken tokenizer\r\n\r\n* Finalizing","shortMessageHtmlLink":"llama3 8B support, tiktoken tokenizer (<a class=\"issue-link js-issue-link\" data-error-text=\"Failed to load title\" data-id=\"2251849777\" data-permission-text=\"Title is private\" data-url=\"https://github.com/pytorch-labs/gpt-fast/issues/158\" data-hovercard-type=\"pull_request\" data-hovercard-url=\"/pytorch-labs/gpt-fast/pull/158/hovercard\" href=\"https://github.com/pytorch-labs/gpt-fast/pull/158\">#158</a>)"}},{"before":"2a9b8283f83ca416faacfa1cb637ea49543e6a99","after":"c21a88962b02ee54b74999078e81be0fd24ac2af","ref":"refs/heads/main","pushedAt":"2024-04-18T06:15:58.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"Chillee","name":"Horace He","path":"/Chillee","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/6355099?s=80&v=4"},"commit":{"message":"Update quantize.py","shortMessageHtmlLink":"Update quantize.py"}},{"before":"095b2229ee3a40e379c11f05b94bd6923db63b4b","after":"2a9b8283f83ca416faacfa1cb637ea49543e6a99","ref":"refs/heads/main","pushedAt":"2024-04-11T01:29:37.000Z","pushType":"pr_merge","commitsCount":2,"pusher":{"login":"HDCharles","name":null,"path":"/HDCharles","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/39544797?s=80&v=4"},"commit":{"message":"Merge pull request #156 from pytorch-labs/094_fix_shape_gptq\n\nshape fix for gptq","shortMessageHtmlLink":"Merge pull request <a class=\"issue-link js-issue-link\" data-error-text=\"Failed to load title\" data-id=\"2236255772\" data-permission-text=\"Title is private\" data-url=\"https://github.com/pytorch-labs/gpt-fast/issues/156\" data-hovercard-type=\"pull_request\" data-hovercard-url=\"/pytorch-labs/gpt-fast/pull/156/hovercard\" href=\"https://github.com/pytorch-labs/gpt-fast/pull/156\">#156</a> from pytorch-labs/094_fix_shape_gptq"}},{"before":null,"after":"f2c6534f083f1931e09c3e1a41eb5659acbd1caa","ref":"refs/heads/094_fix_shape_gptq","pushedAt":"2024-04-10T19:13:04.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"HDCharles","name":null,"path":"/HDCharles","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/39544797?s=80&v=4"},"commit":{"message":"shape fix for gptq\n\nSummary: aligns with previous shape fixes\n(https://github.com/pytorch-labs/gpt-fast/pull/152)\n\nTest Plan:\n\nexport MODEL_REPO=meta-llama/Llama-2-7b-chat-hf\npython quantize.py --checkpoint_path checkpoints/$MODEL_REPO/model.pth --mode int4-gptq --calibration_tasks wikitext --calibration_limit 10\npython eval.py --checkpoint_path checkpoints/$MODEL_REPO/model_int4-gptq.g32.cuda.pth --tasks wikitext\n\nwikitext: {'word_perplexity,none': 12.4647656874071, 'word_perplexity_stderr,none': 'N/A', 'byte_perplexity,none': 1.6028703940149458, 'byte_perplexity_stderr,none': 'N/A', 'bits_per_byte,none': 0.6806577757911142, 'bits_per_byte_stderr,none': 'N/A', 'alias': 'wikitext'}\n\npython quantize.py --checkpoint_path checkpoints/$MODEL_REPO/model.pth --mode int4\npython eval.py --checkpoint_path checkpoints/$MODEL_REPO/model_int4.g32.cuda.pth --tasks wikitext\n\nwikitext: {'word_perplexity,none': 12.639992147818221, 'word_perplexity_stderr,none': 'N/A', 'byte_perplexity,none': 1.6070602521912754, 'byte_perplexity_stderr,none': 'N/A', 'bits_per_byte,none': 0.6844240198082908, 'bits_per_byte_stderr,none': 'N/A', 'alias': 'wikitext'}\n\nReviewers:\n\nSubscribers:\n\nTasks:\n\nTags:","shortMessageHtmlLink":"shape fix for gptq"}},{"before":"410cc25bd2fae6f60ef145d6e172277fcaac5590","after":"55b9f6e0a947cd4ffc18567d8709b6b57bb99922","ref":"refs/heads/gh/HDCharles/9/orig","pushedAt":"2024-04-09T18:21:27.000Z","pushType":"force_push","commitsCount":0,"pusher":{"login":"HDCharles","name":null,"path":"/HDCharles","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/39544797?s=80&v=4"},"commit":{"message":"testing HQQ [not for land]\n\nSummary:\n\nfor eval=5\nwikitext: {'word_perplexity,none': 11.49343838017535, 'word_perplexity_stderr,none': 'N/A', 'byte_perplexity,none': 1.6110947678444059, 'byte_perplexity_stderr,none':\n\nfor eval all\n...\n\nTest Plan: sh run.sh\n\nReviewers:\n\nSubscribers:\n\nTasks:\n\nTags:\n\nghstack-source-id: e1564ea867790825ad8a00c8de8a672a349b8a48\nPull Request resolved: https://github.com/pytorch-labs/gpt-fast/pull/155","shortMessageHtmlLink":"testing HQQ [not for land]"}},{"before":"a72a2c516c6e96bd3367491ed19765ac7f2bb03a","after":"8efb00dcd31edb94f2d60f7dca92f5147b74eefe","ref":"refs/heads/gh/HDCharles/9/head","pushedAt":"2024-04-09T18:21:27.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"HDCharles","name":null,"path":"/HDCharles","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/39544797?s=80&v=4"},"commit":{"message":"Update on \"testing HQQ [not for land]\"\n\n\nSummary:\n\nfor eval=5\nwikitext: {'word_perplexity,none': 11.49343838017535, 'word_perplexity_stderr,none': 'N/A', 'byte_perplexity,none': 1.6110947678444059, 'byte_perplexity_stderr,none':\n\nfor eval all\n...\n\nTest Plan: sh run.sh\n\nReviewers:\n\nSubscribers:\n\nTasks:\n\nTags:\n\n[ghstack-poisoned]","shortMessageHtmlLink":"Update on \"testing HQQ [not for land]\""}},{"before":null,"after":"410cc25bd2fae6f60ef145d6e172277fcaac5590","ref":"refs/heads/gh/HDCharles/9/orig","pushedAt":"2024-04-09T18:13:04.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"HDCharles","name":null,"path":"/HDCharles","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/39544797?s=80&v=4"},"commit":{"message":"testing HQQ [not for land]\n\nSummary:\n\nfor eval=5\nwikitext: {'word_perplexity,none': 11.49343838017535, 'word_perplexity_stderr,none': 'N/A', 'byte_perplexity,none': 1.6110947678444059, 'byte_perplexity_stderr,none':\n\nfor eval all\n...\n\nTest Plan: sh run.sh\n\nReviewers:\n\nSubscribers:\n\nTasks:\n\nTags:\n\nghstack-source-id: c680a099241e78dbba06e1b1f342c6845084cddf\nPull Request resolved: https://github.com/pytorch-labs/gpt-fast/pull/155","shortMessageHtmlLink":"testing HQQ [not for land]"}},{"before":null,"after":"095b2229ee3a40e379c11f05b94bd6923db63b4b","ref":"refs/heads/gh/HDCharles/9/base","pushedAt":"2024-04-09T18:13:02.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"HDCharles","name":null,"path":"/HDCharles","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/39544797?s=80&v=4"},"commit":{"message":"Merge pull request #152 from pytorch-labs/malfet-patch-1\n\nDiscovered by @HDCharles\r\n\r\nTest plan:\r\n```\r\n% python3 quantize.py --checkpoint_path checkpoints/openlm-research/open_llama_7b/model.pth --mode int4 --device cuda\r\n% python3 generate.py --checkpoint_path checkpoints/openlm-research/open_llama_7b/model_int4.g32.cuda.pth --prompt \"Once upon a time\" --device cuda\r\n...\r\nUsing int4 weight-only quantization!\r\nTime to load model: 3.20 seconds\r\nOnce upon a time I was a kid. And that kid, as I understand, went through a phase as a teen where he binge watched a whole bunch of movies. I don’t remember the exact number, but it seems like at least 50 movies in succession. I read somewhere that people would record movies on VHS tapes and then binge watched them, so maybe that’s what this kid was doing. I also read somewhere that the person had never binge watched 50 movies in succession again.\r\nThat’s the truth and it’s a shame. That’s how you know the world is changing in a horrible way. The binge watcher, the VHS watcher, the guy who turns a whole bunch of movies into a marathon and then stops. The person who made that guy stop. That’s why I’m writing this: to prevent you from reading this, and I’m sorry. I’m sorry that you’ll never turn\r\nTime for inference 1: 8.27 sec total, 24.17 tokens/sec\r\nBandwidth achieved: 106.17 GB/s\r\n\r\n```\r\nand\r\n```\r\n% python3 quantize.py --checkpoint_path checkpoints/openlm-research/open_llama_7b/model.pth --mode int4 --device cpu\r\n% python3 generate.py --checkpoint_path checkpoints/openlm-research/open_llama_7b/model_int4.g32.cpu.pth --prompt \"Once upon a time\" --device cpu\r\n...\r\nUsing int4 weight-only quantization!\r\nTime to load model: 0.09 seconds\r\nOnce upon a time, I was ith the new movie.\r\nWelcome to the third installment of the Once Upon a Time! series.\r\nThis time around, I’ve decided to focus on a movie that has had its fair share of publicity and fame, but one that I was not familiar with before.\r\nThe movie in question is the 2004 remake of the classic fairy tale The Three Little Pigs, which was released the same year as Pirates of the Caribbean: The Curse of the Black Pearl and the 2007 adaptation of the classic novel The Lion King.\r\nIt was the first film in the Once Upon a Time! series that I had not seen, and as such, I was only familiar with the first half of the story.\r\nI was intrigued by the story, and I knew that I would be interested in seeing the movie when I was able.\r\nI had watched a bunch of trailers and clips to get an idea of what the movie was going\r\nTime for inference 2: 27.75 sec total, 7.21 tokens/sec\r\nBandwidth achieved: 31.65 GB/s\r\n```","shortMessageHtmlLink":"Merge pull request <a class=\"issue-link js-issue-link\" data-error-text=\"Failed to load title\" data-id=\"2228738976\" data-permission-text=\"Title is private\" data-url=\"https://github.com/pytorch-labs/gpt-fast/issues/152\" data-hovercard-type=\"pull_request\" data-hovercard-url=\"/pytorch-labs/gpt-fast/pull/152/hovercard\" href=\"https://github.com/pytorch-labs/gpt-fast/pull/152\">#152</a> from pytorch-labs/malfet-patch-1"}},{"before":null,"after":"a72a2c516c6e96bd3367491ed19765ac7f2bb03a","ref":"refs/heads/gh/HDCharles/9/head","pushedAt":"2024-04-09T18:13:02.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"HDCharles","name":null,"path":"/HDCharles","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/39544797?s=80&v=4"},"commit":{"message":"testing HQQ [not for land]\n\nSummary:\n\nfor eval=5\nwikitext: {'word_perplexity,none': 11.49343838017535, 'word_perplexity_stderr,none': 'N/A', 'byte_perplexity,none': 1.6110947678444059, 'byte_perplexity_stderr,none':\n\nfor eval all\n...\n\nTest Plan: sh run.sh\n\nReviewers:\n\nSubscribers:\n\nTasks:\n\nTags:\n\n[ghstack-poisoned]","shortMessageHtmlLink":"testing HQQ [not for land]"}},{"before":"7d4527002e706608d974e0e2c74ef01dff9c1f75","after":"095b2229ee3a40e379c11f05b94bd6923db63b4b","ref":"refs/heads/main","pushedAt":"2024-04-05T20:44:46.000Z","pushType":"pr_merge","commitsCount":3,"pusher":{"login":"malfet","name":"Nikita Shulga","path":"/malfet","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2453524?s=80&v=4"},"commit":{"message":"Merge pull request #152 from pytorch-labs/malfet-patch-1\n\nDiscovered by @HDCharles\r\n\r\nTest plan:\r\n```\r\n% python3 quantize.py --checkpoint_path checkpoints/openlm-research/open_llama_7b/model.pth --mode int4 --device cuda\r\n% python3 generate.py --checkpoint_path checkpoints/openlm-research/open_llama_7b/model_int4.g32.cuda.pth --prompt \"Once upon a time\" --device cuda\r\n...\r\nUsing int4 weight-only quantization!\r\nTime to load model: 3.20 seconds\r\nOnce upon a time I was a kid. And that kid, as I understand, went through a phase as a teen where he binge watched a whole bunch of movies. I don’t remember the exact number, but it seems like at least 50 movies in succession. I read somewhere that people would record movies on VHS tapes and then binge watched them, so maybe that’s what this kid was doing. I also read somewhere that the person had never binge watched 50 movies in succession again.\r\nThat’s the truth and it’s a shame. That’s how you know the world is changing in a horrible way. The binge watcher, the VHS watcher, the guy who turns a whole bunch of movies into a marathon and then stops. The person who made that guy stop. That’s why I’m writing this: to prevent you from reading this, and I’m sorry. I’m sorry that you’ll never turn\r\nTime for inference 1: 8.27 sec total, 24.17 tokens/sec\r\nBandwidth achieved: 106.17 GB/s\r\n\r\n```\r\nand\r\n```\r\n% python3 quantize.py --checkpoint_path checkpoints/openlm-research/open_llama_7b/model.pth --mode int4 --device cpu\r\n% python3 generate.py --checkpoint_path checkpoints/openlm-research/open_llama_7b/model_int4.g32.cpu.pth --prompt \"Once upon a time\" --device cpu\r\n...\r\nUsing int4 weight-only quantization!\r\nTime to load model: 0.09 seconds\r\nOnce upon a time, I was ith the new movie.\r\nWelcome to the third installment of the Once Upon a Time! series.\r\nThis time around, I’ve decided to focus on a movie that has had its fair share of publicity and fame, but one that I was not familiar with before.\r\nThe movie in question is the 2004 remake of the classic fairy tale The Three Little Pigs, which was released the same year as Pirates of the Caribbean: The Curse of the Black Pearl and the 2007 adaptation of the classic novel The Lion King.\r\nIt was the first film in the Once Upon a Time! series that I had not seen, and as such, I was only familiar with the first half of the story.\r\nI was intrigued by the story, and I knew that I would be interested in seeing the movie when I was able.\r\nI had watched a bunch of trailers and clips to get an idea of what the movie was going\r\nTime for inference 2: 27.75 sec total, 7.21 tokens/sec\r\nBandwidth achieved: 31.65 GB/s\r\n```","shortMessageHtmlLink":"Merge pull request <a class=\"issue-link js-issue-link\" data-error-text=\"Failed to load title\" data-id=\"2228738976\" data-permission-text=\"Title is private\" data-url=\"https://github.com/pytorch-labs/gpt-fast/issues/152\" data-hovercard-type=\"pull_request\" data-hovercard-url=\"/pytorch-labs/gpt-fast/pull/152/hovercard\" href=\"https://github.com/pytorch-labs/gpt-fast/pull/152\">#152</a> from pytorch-labs/malfet-patch-1"}},{"before":"d4746165956094fa5cf082dd96e3b763786521d2","after":"bc50dc050c2d627cd43155609eaf679e4aab5cd7","ref":"refs/heads/malfet-patch-1","pushedAt":"2024-04-05T20:36:51.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"malfet","name":"Nikita Shulga","path":"/malfet","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2453524?s=80&v=4"},"commit":{"message":"Int4Linear should have the same shape regardless of device\n\nThis make Int4 quantization work for both CPU and CUDA devices","shortMessageHtmlLink":"Int4Linear should have the same shape regardless of device"}},{"before":null,"after":"d4746165956094fa5cf082dd96e3b763786521d2","ref":"refs/heads/malfet-patch-1","pushedAt":"2024-04-05T20:08:44.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"malfet","name":"Nikita Shulga","path":"/malfet","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2453524?s=80&v=4"},"commit":{"message":"Fix int4 quantization\n\nDiscovered by @HDCharles","shortMessageHtmlLink":"Fix int4 quantization"}},{"before":"521f4d3111985cb526231b8a76ceff9934467c93","after":null,"ref":"refs/heads/malfet-patch-1","pushedAt":"2024-04-05T18:20:25.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"malfet","name":"Nikita Shulga","path":"/malfet","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2453524?s=80&v=4"}},{"before":"d8f0be6ef60497fc8ded41b569eaab63c9a6ef87","after":"7d4527002e706608d974e0e2c74ef01dff9c1f75","ref":"refs/heads/main","pushedAt":"2024-04-05T18:20:21.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"malfet","name":"Nikita Shulga","path":"/malfet","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2453524?s=80&v=4"},"commit":{"message":"Enable TinyLLAMAs quantization (#151)\n\nCopy-n-paste code from https://github.com/pytorch-labs/gpt-fast/commit/11ce176d48a60e0682c817114caab37070c6a7ba into `quantize.py`","shortMessageHtmlLink":"Enable TinyLLAMAs quantization (<a class=\"issue-link js-issue-link\" data-error-text=\"Failed to load title\" data-id=\"2226756529\" data-permission-text=\"Title is private\" data-url=\"https://github.com/pytorch-labs/gpt-fast/issues/151\" data-hovercard-type=\"pull_request\" data-hovercard-url=\"/pytorch-labs/gpt-fast/pull/151/hovercard\" href=\"https://github.com/pytorch-labs/gpt-fast/pull/151\">#151</a>)"}},{"before":null,"after":"e40eddd24376399caecd3ff94c736d2373b91292","ref":"refs/heads/batched_generation","pushedAt":"2024-04-05T04:34:17.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"Chillee","name":"Horace He","path":"/Chillee","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/6355099?s=80&v=4"},"commit":{"message":"Add batched inference","shortMessageHtmlLink":"Add batched inference"}},{"before":null,"after":"521f4d3111985cb526231b8a76ceff9934467c93","ref":"refs/heads/malfet-patch-1","pushedAt":"2024-04-05T00:45:41.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"malfet","name":"Nikita Shulga","path":"/malfet","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2453524?s=80&v=4"},"commit":{"message":"Enable TinyLLAMAs quantization\n\nCopy-n-paste code from https://github.com/pytorch-labs/gpt-fast/commit/11ce176d48a60e0682c817114caab37070c6a7ba into `quantize.py`","shortMessageHtmlLink":"Enable TinyLLAMAs quantization"}},{"before":"f6973170327003c6b1ce7edb5c015b4fa0097e6d","after":"d8f0be6ef60497fc8ded41b569eaab63c9a6ef87","ref":"refs/heads/main","pushedAt":"2024-04-05T00:21:13.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"malfet","name":"Nikita Shulga","path":"/malfet","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/2453524?s=80&v=4"},"commit":{"message":"Fix dtype inference for quantized models\n\n`self.output.weight` would be int8 if output is quantized linear layer\r\n\r\nIn that case, check for `scales` or `scales_and_zeros` (for int4) quantization","shortMessageHtmlLink":"Fix dtype inference for quantized models"}},{"before":"743261bd0194b9a6d8225a7d17ef4779e1858e2b","after":"dfaa329efa22854258422e3968f96c50c6384003","ref":"refs/heads/gh/HDCharles/8/orig","pushedAt":"2024-03-28T08:50:20.000Z","pushType":"force_push","commitsCount":0,"pusher":{"login":"HDCharles","name":null,"path":"/HDCharles","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/39544797?s=80&v=4"},"commit":{"message":"fixing GPTQ\n\nSummary:\n\ntrying to fix the issue with kv_cache update by changing tracing into a\ntensor subclass. However it seems we have less success than the fx\ntracer. The fx tracer breaks due\n\nk_out[:,:, input_pos] = k_val\n\ngetting traced as\n\nnew_var = torch.ops.aten.index_put_(k_out, [None, None,\ninput_pos], k_val)\n\nwith new var never being accessed afterward. new_var becomes hte correct\nmultiInput value, but then is lost.\n\nThe subclass ont he other hand, tries to use the func \"<slot wrapper '__setitem__' of 'torch._C.TensorBase' objects>\"\nwhich seems to not want to mutate k_out and so the attempt to make it a\nmultiTensor fails.\n\nTest Plan: sh run.sh\n\nReviewers:\n\nSubscribers:\n\nTasks:\n\nTags:\n\nghstack-source-id: 9ed1621201317e5f655132ba11538a67c8aa5a69\nPull Request resolved: https://github.com/pytorch-labs/gpt-fast/pull/148","shortMessageHtmlLink":"fixing GPTQ"}},{"before":"298c44337c2a1072c98ff982f13bb1b37011d79b","after":"7392a31d3ae0e7c727cd16814ae392bc49b958a3","ref":"refs/heads/gh/HDCharles/8/head","pushedAt":"2024-03-28T08:50:20.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"HDCharles","name":null,"path":"/HDCharles","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/39544797?s=80&v=4"},"commit":{"message":"Update on \"fixing GPTQ\"\n\n\nSummary:\n\ntrying to fix the issue with kv_cache update by changing tracing into a\ntensor subclass. However it seems we have less success than the fx\ntracer. The fx tracer breaks due\n\nk_out[:,:, input_pos] = k_val\n\ngetting traced as\n\nnew_var = torch.ops.aten.index_put_(k_out, [None, None,\ninput_pos], k_val)\n\nwith new var never being accessed afterward. new_var becomes hte correct\nmultiInput value, but then is lost.\n\nThe subclass ont he other hand, tries to use the func \"<slot wrapper '__setitem__' of 'torch._C.TensorBase' objects>\"\nwhich seems to not want to mutate k_out and so the attempt to make it a\nmultiTensor fails.\n\nTest Plan: sh run.sh\n\nReviewers:\n\nSubscribers:\n\nTasks:\n\nTags:\n\n[ghstack-poisoned]","shortMessageHtmlLink":"Update on \"fixing GPTQ\""}},{"before":null,"after":"743261bd0194b9a6d8225a7d17ef4779e1858e2b","ref":"refs/heads/gh/HDCharles/8/orig","pushedAt":"2024-03-28T08:42:43.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"HDCharles","name":null,"path":"/HDCharles","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/39544797?s=80&v=4"}},{"before":null,"after":"298c44337c2a1072c98ff982f13bb1b37011d79b","ref":"refs/heads/gh/HDCharles/8/head","pushedAt":"2024-03-28T08:42:40.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"HDCharles","name":null,"path":"/HDCharles","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/39544797?s=80&v=4"},"commit":{"message":"fixing GPTQ\n\nSummary:\n\nTest Plan:\n\nReviewers:\n\nSubscribers:\n\nTasks:\n\nTags:\n\n[ghstack-poisoned]","shortMessageHtmlLink":"fixing GPTQ"}},{"before":null,"after":"f6973170327003c6b1ce7edb5c015b4fa0097e6d","ref":"refs/heads/gh/HDCharles/8/base","pushedAt":"2024-03-28T08:42:40.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"HDCharles","name":null,"path":"/HDCharles","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/39544797?s=80&v=4"},"commit":{"message":"Update README.md","shortMessageHtmlLink":"Update README.md"}},{"before":"b47a234d631b81cdc43a7526bc73d51af010f5d5","after":"f6973170327003c6b1ce7edb5c015b4fa0097e6d","ref":"refs/heads/main","pushedAt":"2024-03-28T02:38:10.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"Chillee","name":"Horace He","path":"/Chillee","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/6355099?s=80&v=4"},"commit":{"message":"Update README.md","shortMessageHtmlLink":"Update README.md"}},{"before":"93dab0e9615dbcee40c040e8c8e8c46d014ab773","after":null,"ref":"refs/heads/gh/HDCharles/6/orig","pushedAt":"2024-03-27T18:49:39.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"HDCharles","name":null,"path":"/HDCharles","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/39544797?s=80&v=4"}},{"before":"2e7673714b8ffdb4062e6dd89764f8b7f16a936e","after":null,"ref":"refs/heads/gh/HDCharles/6/head","pushedAt":"2024-03-27T18:49:39.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"HDCharles","name":null,"path":"/HDCharles","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/39544797?s=80&v=4"}},{"before":"2e7673714b8ffdb4062e6dd89764f8b7f16a936e","after":null,"ref":"refs/heads/gh/HDCharles/6/base","pushedAt":"2024-03-27T18:49:39.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"HDCharles","name":null,"path":"/HDCharles","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/39544797?s=80&v=4"}}],"hasNextPage":true,"hasPreviousPage":false,"activityType":"all","actor":null,"timePeriod":"all","sort":"DESC","perPage":30,"cursor":"djE6ks8AAAAERAfpAAA","startCursor":null,"endCursor":null}},"title":"Activity · pytorch-labs/gpt-fast"}