Skip to content

Commit

Permalink
IPoIB: Connected mode experimental support
Browse files Browse the repository at this point in the history
The following patch adds experimental support for IPoIB connected
mode, as defined by the draft from the IETF ipoib working group.  The
idea is to increase performance by increasing the MTU from the maximum
of 2K (theoretically 4K) supported by IPoIB on top of UD.  With this
code, I'm able to get 800MByte/sec or more with netperf without
options on a Mellanox 4x back-to-back DDR system.

Some notes on code:
1. SRQ is used for scalability to large cluster sizes
2. Only RC connections are used (UC does not support SRQ now)
3. Retry count is set to 0 since spec draft warns against retries
4. Each connection is used for data transfers in only 1 direction, so
   each connection is either active(TX) or passive (RX).  2 sides that
   want to communicate create 2 connections.
5. Each active (TX) connection has a separate CQ for send completions -
   this keeps the code simple without CQ resize and other tricks
6. To detect stale passive side connections (where the remote side is
   down), we keep an LRU list of passive connections (updated once per
   second per connection) and destroy a connection after it has been
   unused for several seconds. The LRU rule makes it possible to avoid
   scanning connections that have recently been active.

Signed-off-by: Michael S. Tsirkin <mst@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
  • Loading branch information
Michael S. Tsirkin authored and Roland Dreier committed Feb 10, 2007
1 parent 9a6b090 commit 839fcab
Show file tree
Hide file tree
Showing 9 changed files with 1,575 additions and 32 deletions.
16 changes: 15 additions & 1 deletion drivers/infiniband/ulp/ipoib/Kconfig
@@ -1,13 +1,27 @@
config INFINIBAND_IPOIB
tristate "IP-over-InfiniBand"
depends on INFINIBAND && NETDEVICES && INET
depends on INFINIBAND && NETDEVICES && INET && (IPV6 || IPV6=n)
---help---
Support for the IP-over-InfiniBand protocol (IPoIB). This
transports IP packets over InfiniBand so you can use your IB
device as a fancy NIC.

See Documentation/infiniband/ipoib.txt for more information

config INFINIBAND_IPOIB_CM
bool "IP-over-InfiniBand Connected Mode support"
depends on INFINIBAND_IPOIB && EXPERIMENTAL
default n
---help---
This option enables experimental support for IPoIB connected mode.
After enabling this option, you need to switch to connected mode through
/sys/class/net/ibXXX/mode to actually create connections, and then increase
the interface MTU with e.g. ifconfig ib0 mtu 65520.

WARNING: Enabling connected mode will trigger some
packet drops for multicast and UD mode traffic from this interface,
unless you limit mtu for these destinations to 2044.

config INFINIBAND_IPOIB_DEBUG
bool "IP-over-InfiniBand debugging" if EMBEDDED
depends on INFINIBAND_IPOIB
Expand Down
1 change: 1 addition & 0 deletions drivers/infiniband/ulp/ipoib/Makefile
Expand Up @@ -5,5 +5,6 @@ ib_ipoib-y := ipoib_main.o \
ipoib_multicast.o \
ipoib_verbs.o \
ipoib_vlan.o
ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_CM) += ipoib_cm.o
ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_DEBUG) += ipoib_fs.o

215 changes: 215 additions & 0 deletions drivers/infiniband/ulp/ipoib/ipoib.h
Expand Up @@ -62,6 +62,10 @@ enum {

IPOIB_ENCAP_LEN = 4,

IPOIB_CM_MTU = 0x10000 - 0x10, /* padding to align header to 16 */
IPOIB_CM_BUF_SIZE = IPOIB_CM_MTU + IPOIB_ENCAP_LEN,
IPOIB_CM_HEAD_SIZE = IPOIB_CM_BUF_SIZE % PAGE_SIZE,
IPOIB_CM_RX_SG = ALIGN(IPOIB_CM_BUF_SIZE, PAGE_SIZE) / PAGE_SIZE,
IPOIB_RX_RING_SIZE = 128,
IPOIB_TX_RING_SIZE = 64,
IPOIB_MAX_QUEUE_SIZE = 8192,
Expand All @@ -81,6 +85,8 @@ enum {
IPOIB_MCAST_RUN = 6,
IPOIB_STOP_REAPER = 7,
IPOIB_MCAST_STARTED = 8,
IPOIB_FLAG_NETIF_STOPPED = 9,
IPOIB_FLAG_ADMIN_CM = 10,

IPOIB_MAX_BACKOFF_SECONDS = 16,

Expand All @@ -90,6 +96,13 @@ enum {
IPOIB_MCAST_FLAG_ATTACHED = 3,
};

#define IPOIB_OP_RECV (1ul << 31)
#ifdef CONFIG_INFINIBAND_IPOIB_CM
#define IPOIB_CM_OP_SRQ (1ul << 30)
#else
#define IPOIB_CM_OP_SRQ (0)
#endif

/* structs */

struct ipoib_header {
Expand All @@ -113,6 +126,59 @@ struct ipoib_tx_buf {
u64 mapping;
};

struct ib_cm_id;

struct ipoib_cm_data {
__be32 qpn; /* High byte MUST be ignored on receive */
__be32 mtu;
};

struct ipoib_cm_rx {
struct ib_cm_id *id;
struct ib_qp *qp;
struct list_head list;
struct net_device *dev;
unsigned long jiffies;
};

struct ipoib_cm_tx {
struct ib_cm_id *id;
struct ib_cq *cq;
struct ib_qp *qp;
struct list_head list;
struct net_device *dev;
struct ipoib_neigh *neigh;
struct ipoib_path *path;
struct ipoib_tx_buf *tx_ring;
unsigned tx_head;
unsigned tx_tail;
unsigned long flags;
u32 mtu;
struct ib_wc ibwc[IPOIB_NUM_WC];
};

struct ipoib_cm_rx_buf {
struct sk_buff *skb;
u64 mapping[IPOIB_CM_RX_SG];
};

struct ipoib_cm_dev_priv {
struct ib_srq *srq;
struct ipoib_cm_rx_buf *srq_ring;
struct ib_cm_id *id;
struct list_head passive_ids;
struct work_struct start_task;
struct work_struct reap_task;
struct work_struct skb_task;
struct delayed_work stale_task;
struct sk_buff_head skb_queue;
struct list_head start_list;
struct list_head reap_list;
struct ib_wc ibwc[IPOIB_NUM_WC];
struct ib_sge rx_sge[IPOIB_CM_RX_SG];
struct ib_recv_wr rx_wr;
};

/*
* Device private locking: tx_lock protects members used in TX fast
* path (and we use LLTX so upper layers don't do extra locking).
Expand Down Expand Up @@ -179,6 +245,10 @@ struct ipoib_dev_priv {
struct list_head child_intfs;
struct list_head list;

#ifdef CONFIG_INFINIBAND_IPOIB_CM
struct ipoib_cm_dev_priv cm;
#endif

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
struct list_head fs_list;
struct dentry *mcg_dentry;
Expand Down Expand Up @@ -212,6 +282,9 @@ struct ipoib_path {

struct ipoib_neigh {
struct ipoib_ah *ah;
#ifdef CONFIG_INFINIBAND_IPOIB_CM
struct ipoib_cm_tx *cm;
#endif
union ib_gid dgid;
struct sk_buff_head queue;

Expand Down Expand Up @@ -315,6 +388,146 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey);
void ipoib_pkey_poll(struct work_struct *work);
int ipoib_pkey_dev_delay_open(struct net_device *dev);

#ifdef CONFIG_INFINIBAND_IPOIB_CM

#define IPOIB_FLAGS_RC 0x80
#define IPOIB_FLAGS_UC 0x40

/* We don't support UC connections at the moment */
#define IPOIB_CM_SUPPORTED(ha) (ha[0] & (IPOIB_FLAGS_RC))

static inline int ipoib_cm_admin_enabled(struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
return IPOIB_CM_SUPPORTED(dev->dev_addr) &&
test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
}

static inline int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
return IPOIB_CM_SUPPORTED(n->ha) &&
test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
}

static inline int ipoib_cm_up(struct ipoib_neigh *neigh)

{
return test_bit(IPOIB_FLAG_OPER_UP, &neigh->cm->flags);
}

static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh)
{
return neigh->cm;
}

static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *tx)
{
neigh->cm = tx;
}

void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx);
int ipoib_cm_dev_open(struct net_device *dev);
void ipoib_cm_dev_stop(struct net_device *dev);
int ipoib_cm_dev_init(struct net_device *dev);
int ipoib_cm_add_mode_attr(struct net_device *dev);
void ipoib_cm_dev_cleanup(struct net_device *dev);
struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path,
struct ipoib_neigh *neigh);
void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx);
void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb,
unsigned int mtu);
void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc);
#else

struct ipoib_cm_tx;

static inline int ipoib_cm_admin_enabled(struct net_device *dev)
{
return 0;
}
static inline int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n)

{
return 0;
}

static inline int ipoib_cm_up(struct ipoib_neigh *neigh)

{
return 0;
}

static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh)
{
return NULL;
}

static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *tx)
{
}

static inline
void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
{
return;
}

static inline
int ipoib_cm_dev_open(struct net_device *dev)
{
return 0;
}

static inline
void ipoib_cm_dev_stop(struct net_device *dev)
{
return;
}

static inline
int ipoib_cm_dev_init(struct net_device *dev)
{
return -ENOSYS;
}

static inline
void ipoib_cm_dev_cleanup(struct net_device *dev)
{
return;
}

static inline
struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path,
struct ipoib_neigh *neigh)
{
return NULL;
}

static inline
void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
{
return;
}

static inline
int ipoib_cm_add_mode_attr(struct net_device *dev)
{
return 0;
}

static inline void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb,
unsigned int mtu)
{
dev_kfree_skb_any(skb);
}

static inline void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
{
}

#endif

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
void ipoib_create_debug_files(struct net_device *dev);
void ipoib_delete_debug_files(struct net_device *dev);
Expand Down Expand Up @@ -392,4 +605,6 @@ extern int ipoib_debug_level;

#define IPOIB_GID_ARG(gid) IPOIB_GID_RAW_ARG((gid).raw)

#define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff)

#endif /* _IPOIB_H */

0 comments on commit 839fcab

Please sign in to comment.