Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,42 @@

This aims to be a simpler implementation of the [original repo](https://github.com/microsoft/Samba).

## Installation

> [!TIP]
> While the `pip install` command _should_ install all deps and the package, in practice some of the more CUDA-heavy deps are better installed separately from source. See section below for more details.

```bash
git clone https://github.com/pszemraj/samba-pytorch.git
cd samba-pytorch
pip install -e .
```

### Installing custom kernel packages first

After installing `torch`, `xformers`, and `flash-attn`, you may want to install `mamba-ssm`, `causal-conv1d`, and `fla` from source:

```bash
pip install --upgrade pip ninja
pip install git+https://github.com/state-spaces/mamba.git --no-build-isolation
pip install git+https://github.com/Dao-AILab/causal-conv1d.git --no-build-isolation
pip install git+https://github.com/sustcsonglin/flash-linear-attention@98c176e --no-build-isolation
```

Then, clone this repo and run commands as above.

## Usage

A basic example of creating a random model from a named config:

```python
from samba_pytorch import Config, GPT
cfg = Config.from_name('Samba_421M_1k_window')
print(cfg)
model = GPT(cfg)
model
```

## Repo structure

```text
Expand Down
83 changes: 43 additions & 40 deletions samba_pytorch/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,13 @@

# Copyright Lightning AI. Licensed under the Apache License 2.0,
# see LICENSE file at https://github.com/Lightning-AI/litgpt/blob/main/LICENSE

import warnings
from dataclasses import dataclass
from typing import Any, Literal, Optional, Type

import torch
from typing_extensions import Self

import samba_pytorch.samba
from samba_pytorch.utils import find_multiple


Expand Down Expand Up @@ -101,8 +100,9 @@ def from_name(cls, name: str, **kwargs: Any) -> Self:

@property
def mlp_class(self) -> Type:
from samba_pytorch import samba
# `self._mlp_class` cannot be the type to keep the config json serializable
return getattr(samba_pytorch.samba, self._mlp_class)
return getattr(samba, self._mlp_class)

@property
def norm_class(self) -> Type:
Expand All @@ -112,9 +112,12 @@ def norm_class(self) -> Type:

return RMSNorm
elif self._norm_class == "FusedRMSNorm":
from samba_pytorch.modules.rmsnorm import FusedRMSNorm
warnings.warn(
"FusedRMSNorm has been removed, using standard torch RMSNorm instead"
)
from samba_pytorch.modules.rmsnorm import RMSNorm

return FusedRMSNorm
return RMSNorm
return getattr(torch.nn, self._norm_class)


Expand All @@ -133,7 +136,7 @@ def norm_class(self) -> Type:
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=4096,
Expand All @@ -150,7 +153,7 @@ def norm_class(self) -> Type:
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=4096,
Expand All @@ -168,7 +171,7 @@ def norm_class(self) -> Type:
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
full_per_layer=2,
_mlp_class="LLaMAMLP",
Expand All @@ -187,7 +190,7 @@ def norm_class(self) -> Type:
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=4608,
Expand All @@ -206,7 +209,7 @@ def norm_class(self) -> Type:
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=4608,
Expand All @@ -225,7 +228,7 @@ def norm_class(self) -> Type:
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=4608,
Expand All @@ -244,7 +247,7 @@ def norm_class(self) -> Type:
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=4608,
Expand All @@ -263,7 +266,7 @@ def norm_class(self) -> Type:
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=4096,
Expand All @@ -280,7 +283,7 @@ def norm_class(self) -> Type:
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=4096,
Expand All @@ -298,7 +301,7 @@ def norm_class(self) -> Type:
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=4096,
Expand All @@ -316,7 +319,7 @@ def norm_class(self) -> Type:
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=4096,
Expand All @@ -335,7 +338,7 @@ def norm_class(self) -> Type:
parallel_residual=True,
shared_attention_norm=True,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=4096,
Expand All @@ -354,7 +357,7 @@ def norm_class(self) -> Type:
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=4096,
Expand All @@ -373,7 +376,7 @@ def norm_class(self) -> Type:
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=4096,
Expand All @@ -393,7 +396,7 @@ def norm_class(self) -> Type:
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=4096,
Expand All @@ -412,7 +415,7 @@ def norm_class(self) -> Type:
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=4096,
Expand All @@ -431,7 +434,7 @@ def norm_class(self) -> Type:
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=4096,
Expand All @@ -450,7 +453,7 @@ def norm_class(self) -> Type:
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=4096,
Expand All @@ -469,7 +472,7 @@ def norm_class(self) -> Type:
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=4096,
Expand All @@ -489,7 +492,7 @@ def norm_class(self) -> Type:
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=4608,
Expand All @@ -510,7 +513,7 @@ def norm_class(self) -> Type:
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=4608,
Expand All @@ -531,7 +534,7 @@ def norm_class(self) -> Type:
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=4608,
Expand All @@ -552,7 +555,7 @@ def norm_class(self) -> Type:
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=4608,
Expand All @@ -573,7 +576,7 @@ def norm_class(self) -> Type:
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=4096,
Expand All @@ -592,7 +595,7 @@ def norm_class(self) -> Type:
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=4096,
Expand All @@ -612,7 +615,7 @@ def norm_class(self) -> Type:
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=4096,
Expand All @@ -632,7 +635,7 @@ def norm_class(self) -> Type:
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=4096,
Expand All @@ -653,7 +656,7 @@ def norm_class(self) -> Type:
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=6144,
Expand All @@ -673,7 +676,7 @@ def norm_class(self) -> Type:
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=6144,
Expand All @@ -693,7 +696,7 @@ def norm_class(self) -> Type:
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=6144,
Expand All @@ -712,7 +715,7 @@ def norm_class(self) -> Type:
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=6144,
Expand All @@ -731,7 +734,7 @@ def norm_class(self) -> Type:
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=6144,
Expand All @@ -750,7 +753,7 @@ def norm_class(self) -> Type:
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=6144,
Expand All @@ -769,7 +772,7 @@ def norm_class(self) -> Type:
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=6144,
Expand All @@ -787,7 +790,7 @@ def norm_class(self) -> Type:
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="FusedRMSNorm",
_norm_class="RMSNorm",
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=8192,
Expand Down
Loading