总述

代码主要来自 4.1.19 版本（个别片段可能取自其他版本，例如下文 struct nsproxy 中的 cgroup_ns 字段是较新的内核才引入的，阅读时请对照实际源码）。

先找到namespace在task_struct中的结构，

// include/linux/sched.h
/* Excerpt of the process descriptor; "......" marks fields elided by these notes. */
struct task_struct {
......
/* namespaces: pointer to the nsproxy that holds this task's per-namespace pointers */
struct nsproxy *nsproxy;
......
}

以及nsproxy的定义，

// include/linux/nsproxy.h
/*
 * A structure to contain pointers to all per-process
 * namespaces - fs (mount), uts, network, sysvipc, etc.
 *
 * The pid namespace is an exception -- it's accessed using
 * task_active_pid_ns.  The pid namespace here is the
 * namespace that children will use.
 *
 * 'count' is the number of tasks holding a reference.
 * The count for each namespace, then, will be the number
 * of nsproxies pointing to it, not the number of tasks.
 *
 * The nsproxy is shared by tasks which share all namespaces.
 * As soon as a single namespace is cloned or unshared, the
 * nsproxy is copied.
 */
struct nsproxy {
	atomic_t count;				/* tasks sharing this nsproxy (see above) */
	struct uts_namespace *uts_ns;		/* host/domain name namespace */
	struct ipc_namespace *ipc_ns;		/* SysV IPC / message-queue namespace */
	struct mnt_namespace *mnt_ns;		/* mount namespace */
	struct pid_namespace *pid_ns_for_children;	/* pid namespace that children will use (see above) */
	struct net 	     *net_ns;		/* network namespace */
	/*
	 * NOTE(review): cgroup_ns does not look like part of the 4.1.19
	 * layout these notes cite (cgroup namespaces arrived in a later
	 * kernel) -- confirm which version this excerpt was taken from.
	 */
	struct cgroup_namespace *cgroup_ns;
};
/* Statically defined nsproxy used by the initial task. */
extern struct nsproxy init_nsproxy;

其中 user namespace 是和其他 namespace 耦合在一起的，所以没出现在上述结构中。

task_struct、nsproxy 与几种 namespace 之间的关系如下图所示（注意：图中本应标注为 nsproxy 的地方被误写成了 task_struct）：

同时，nsproxy.h 中还定义了一些对 namespace 的操作，包括 copy_namespaces 等，

/*
 * Namespace-management entry points declared in nsproxy.h.
 * Only copy_namespaces() is expanded further down in these notes;
 * the others are listed for reference.
 */
int copy_namespaces(unsigned long flags, struct task_struct *tsk);
void exit_task_namespaces(struct task_struct *tsk);
void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
void free_nsproxy(struct nsproxy *ns);
int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **,
 struct fs_struct *);
int __init nsproxy_cache_init(void);

kernel/nsproxy.c 中定义了一个全局变量 init_nsproxy（注意它是一个 struct nsproxy 实例，而不是函数），初始 task 通过 INIT_TASK 宏把自己的 nsproxy 指针指向它，

// include/linux/init_task.h

/*
*  INIT_TASK is used to set up the first task table, touch at
* your own risk!. Base=0, limit=0x1fffff (=2MB)
*/
#define INIT_TASK(tsk)  \
{
......
 /* the first task points at the statically defined init_nsproxy */
 .nsproxy  = &init_nsproxy,
......
}

// kernel/nsproxy.c

/*
 * The initial nsproxy: one reference, pointing at the statically
 * defined initial namespaces. Members whose subsystem may be compiled
 * out are only initialised when the matching CONFIG_* option is set.
 */
struct nsproxy init_nsproxy = {
	.count			= ATOMIC_INIT(1),
	.uts_ns			= &init_uts_ns,
#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
	.ipc_ns			= &init_ipc_ns,
#endif
	/*
	 * No static mount namespace is assigned here.
	 * NOTE(review): presumably installed later during boot -- not
	 * shown in these notes, confirm in fs/namespace.c.
	 */
	.mnt_ns			= NULL,
	.pid_ns_for_children	= &init_pid_ns,
#ifdef CONFIG_NET
	.net_ns			= &init_net,
#endif
#ifdef CONFIG_CGROUPS
	.cgroup_ns		= &init_cgroup_ns,
#endif
};

init_nsproxy 中，对 uts, ipc, pid, net 都进行了初始化，但 mount 却没有。

看看 do_fork() 函数是如何处理一个新进程的 namespace 的：

/*
*  Ok, this is the main fork-routine.
*
* It copies the process, and if successful kick-starts
* it and waits for it to finish using the VM if required.
*
* Returns the new task's pid as seen by the caller (pid_vnr), or the
* negative error code carried by copy_process()'s ERR_PTR.
*/
long do_fork(unsigned long clone_flags,
       unsigned long stack_start,
       unsigned long stack_size,
       int __user *parent_tidptr,
       int __user *child_tidptr)
{
 // descriptor of the task being created
 struct task_struct *p;
 int trace = 0;
 long nr;

 /*
  * Determine whether and which event to report to ptracer.  When
  * called from kernel_thread or CLONE_UNTRACED is explicitly
  * requested, no event is reported; otherwise, report if the event
  * for the type of forking is enabled.
  */
 if (!(clone_flags & CLONE_UNTRACED)) {
   if (clone_flags & CLONE_VFORK)
     trace = PTRACE_EVENT_VFORK;
   else if ((clone_flags & CSIGNAL) != SIGCHLD)
     trace = PTRACE_EVENT_CLONE;
   else
     trace = PTRACE_EVENT_FORK;

   if (likely(!ptrace_event_enabled(current, trace)))
     trace = 0;
 }

 // duplicate the current task; the result is a task_struct pointer or an ERR_PTR
 // (namespace handling happens inside copy_process())
 p = copy_process(clone_flags, stack_start, stack_size,
      child_tidptr, NULL, trace);
 /*
  * Do this prior waking up the new thread - the thread pointer
  * might get invalid after that point, if the thread exits quickly.
  */
 if (!IS_ERR(p)) {
   struct completion vfork;
   struct pid *pid;

   trace_sched_process_fork(current, p);

   // take a reference on the child's struct pid and translate it to a number
   pid = get_task_pid(p, PIDTYPE_PID);
   nr = pid_vnr(pid);

   if (clone_flags & CLONE_PARENT_SETTID)
     put_user(nr, parent_tidptr);

   // vfork: set up the completion the parent waits on below and pin the child
   if (clone_flags & CLONE_VFORK) {
     p->vfork_done = &vfork;
     init_completion(&vfork);
     get_task_struct(p);
   }

   // hand the new task to the scheduler so it can start running
   wake_up_new_task(p);

   // report the fork event to the tracer if one was selected above
   /* forking complete and child started to run, tell ptracer */
   if (unlikely(trace))
     ptrace_event_pid(trace, pid);

   // vfork: block the parent until the child signals the completion
   if (clone_flags & CLONE_VFORK) {
     if (!wait_for_vfork_done(p, &vfork))
       ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
   }

   // drop the reference taken by get_task_pid()
   put_pid(pid);
 } else {
   nr = PTR_ERR(p);
 }
 return nr;
}

和 namespace 相关的内容在 copy_process() 中，这个函数将父进程信息复制给子进程，

/*
 * Heavily abridged excerpt of copy_process(): only the namespace step
 * is kept, "......" marks elided code (the lone closing brace below
 * belongs to an elided statement).
 */
static struct task_struct *copy_process(unsigned long clone_flags,
         unsigned long stack_start,
         unsigned long stack_size,
         int __user *child_tidptr,
         struct pid *pid,
         int trace)
{
 int retval;
 // descriptor of the task being created
 struct task_struct *p;
    
 ......
 // !!! set up the child's namespaces according to clone_flags
 retval = copy_namespaces(clone_flags, p);
 ......
 }

 // ......

 // return the new task p
 return p;
}

再看 copy_namespaces()，核心函数是 create_new_namespaces()，

/*
* called from clone.  This now handles copy for nsproxy and all
* namespaces therein.
*
* Returns 0 on success, -EPERM without CAP_SYS_ADMIN in the task's user
* namespace, -EINVAL for CLONE_NEWIPC combined with CLONE_SYSVSEM, or
* the error from create_new_namespaces().
*/
int copy_namespaces(unsigned long flags, struct task_struct *tsk)
{
 // tsk->nsproxy still holds the pointer inherited from the parent at this point
 struct nsproxy *old_ns = tsk->nsproxy;
 struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
 struct nsproxy *new_ns;

 // (per the original note: 5.17.11 additionally tests CLONE_NEWCGROUP and CLONE_NEWTIME here)
 // fast path: no new namespace requested, so share the parent's nsproxy and just take a reference
 if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
           CLONE_NEWPID | CLONE_NEWNET)))) {	
   get_nsproxy(old_ns);
   return 0;
 }

 // creating any new namespace requires CAP_SYS_ADMIN in the task's user namespace
 if (!ns_capable(user_ns, CAP_SYS_ADMIN))
   return -EPERM;

 /*
  * CLONE_NEWIPC must detach from the undolist: after switching
  * to a new ipc namespace, the semaphore arrays from the old
  * namespace are unreachable.  In clone parlance, CLONE_SYSVSEM
  * means share undolist with parent, so we must forbid using
  * it along with CLONE_NEWIPC.
  */
 if ((flags & (CLONE_NEWIPC | CLONE_SYSVSEM)) ==
   (CLONE_NEWIPC | CLONE_SYSVSEM)) 
   return -EINVAL;

 // build a fresh nsproxy; each member is either shared or newly created depending on flags
 new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
 if (IS_ERR(new_ns))
   return  PTR_ERR(new_ns);

 tsk->nsproxy = new_ns;
 return 0;
}

看 create_new_namespaces()，先给 nsproxy 结构申请了新内存，然后依次复制5个 namespace 的信息，

/*
* Create new nsproxy and all of its the associated namespaces.
* Return the newly created nsproxy.  Do not attach this to the task,
* leave it to the caller to do proper locking and attach it to task.
*
* Each copy_*() helper receives the clone flags and the task's current
* namespace of that type. On failure the namespaces obtained so far are
* released in reverse order and an ERR_PTR is returned.
*/
static struct nsproxy *create_new_namespaces(unsigned long flags,
 struct task_struct *tsk, struct user_namespace *user_ns,
 struct fs_struct *new_fs)
{
 struct nsproxy *new_nsp;
 int err;

// allocate the new nsproxy (count starts at 1)
 new_nsp = create_nsproxy();
 if (!new_nsp)
   return ERR_PTR(-ENOMEM);

// mount namespace
 new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
 if (IS_ERR(new_nsp->mnt_ns)) {
   err = PTR_ERR(new_nsp->mnt_ns);
   goto out_ns;
 }
// uts namespace
 new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns);
 if (IS_ERR(new_nsp->uts_ns)) {
   err = PTR_ERR(new_nsp->uts_ns);
   goto out_uts;
 }
// ipc namespace
 new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns);
 if (IS_ERR(new_nsp->ipc_ns)) {
   err = PTR_ERR(new_nsp->ipc_ns);
   goto out_ipc;
 }
// pid namespace (the one children of this task will live in)
 new_nsp->pid_ns_for_children =
   copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns_for_children);
 if (IS_ERR(new_nsp->pid_ns_for_children)) {
   err = PTR_ERR(new_nsp->pid_ns_for_children);
   goto out_pid;
 }
// network namespace
 new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
 if (IS_ERR(new_nsp->net_ns)) {
   err = PTR_ERR(new_nsp->net_ns);
   goto out_net;
 }

 return new_nsp;
// error unwinding: each label releases what was acquired before the failing step
out_net:
 if (new_nsp->pid_ns_for_children)
   put_pid_ns(new_nsp->pid_ns_for_children);
out_pid:
 if (new_nsp->ipc_ns)
   put_ipc_ns(new_nsp->ipc_ns);
out_ipc:
 if (new_nsp->uts_ns)
   put_uts_ns(new_nsp->uts_ns);
out_uts:
 if (new_nsp->mnt_ns)
   put_mnt_ns(new_nsp->mnt_ns);
out_ns:
 kmem_cache_free(nsproxy_cachep, new_nsp);
 return ERR_PTR(err);
}

看 create_nsproxy()，

/*
 * Allocate an nsproxy from nsproxy_cachep with its reference count set
 * to 1. Returns NULL when the allocation fails; the namespace pointers
 * are left for the caller to fill in.
 */
static inline struct nsproxy *create_nsproxy(void)
{
	struct nsproxy *nsproxy;

	nsproxy = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL);	// allocate from the nsproxy slab cache
	if (nsproxy)
		atomic_set(&nsproxy->count, 1);		// on success the count is set to 1 (not incremented)
	return nsproxy;							// new nsproxy, or NULL on allocation failure
}

在 create_nsproxy() 之后，就是依次复制各个 namespace 的信息了，这些代码放到下面各个 namespace 中分析。每个 copy_xxx 的函数都有两个实现，其中一个需要开启对应的编译选项，不开启的话只会默认复制父进程的信息，且使用对应的 flag 会报错。

UTS namespace

UTS命名空间主要用于隔离系统中的主机名和域名等系统标识信息。

copy_utsname() 函数

复制 uts_namespace 的函数 copy_utsname() 比较简单，

/*
 * Return the UTS namespace a new task should use.
 * Without CLONE_NEWUTS the old namespace is returned with an extra
 * reference; with it, clone_uts_ns() builds a new one and the temporary
 * reference on old_ns is dropped again.
 */
struct uts_namespace *copy_utsname(unsigned long flags,
	struct user_namespace *user_ns, struct uts_namespace *old_ns)
{
	struct uts_namespace *new_ns;

	BUG_ON(!old_ns);		// old_ns must not be NULL
	get_uts_ns(old_ns);		// take a reference: kept when sharing, dropped below when cloning

	if (!(flags & CLONE_NEWUTS))	// no new UTS namespace requested: share the old one
		return old_ns;

	// user_ns is handed to clone_uts_ns() together with the namespace to copy from.
	// NOTE(review): presumably it becomes the new namespace's owning user namespace
	// (uts_ns->user_ns is what sethostname() checks CAP_SYS_ADMIN against) -- confirm in clone_uts_ns().
	new_ns = clone_uts_ns(user_ns, old_ns);

	put_uts_ns(old_ns);
	return new_ns;
}

flags 用于确定是否需要创建一个新的 UTS 命名空间：如果 flags 参数中不包含 CLONE_NEWUTS 标志，则直接返回 old_ns（引用计数加一），即新进程与当前进程共用同一个 UTS 命名空间；否则调用 clone_uts_ns() 创建新的 UTS 命名空间。

函数的输入参数user_ns表示新的UTS命名空间所属的用户命名空间。

函数的输入参数old_ns表示需要复制的旧的UTS命名空间。该参数不能为空指针。

重要结构

看 struct uts_namespace 的结构：

struct uts_namespace {
	struct kref kref;	// reference count of this UTS namespace (per the original note, a kref wraps a single atomic refcount)
	struct new_utsname name;	// host name, domain name and the other uname strings
	struct user_namespace *user_ns;	// user namespace associated with this UTS namespace; sethostname() checks CAP_SYS_ADMIN against it
	struct ns_common ns;		// common namespace bookkeeping (NOTE(review): other namespaces in these notes use it for the inum and ops pointer -- confirm for UTS)
};

/* The strings stored per UTS namespace; each buffer has room for __NEW_UTS_LEN characters plus a NUL. */
struct new_utsname {
	char sysname[__NEW_UTS_LEN + 1];
	char nodename[__NEW_UTS_LEN + 1];	// host name (written by sethostname, read by gethostname)
	char release[__NEW_UTS_LEN + 1];
	char version[__NEW_UTS_LEN + 1];
	char machine[__NEW_UTS_LEN + 1];
	char domainname[__NEW_UTS_LEN + 1];	// domain name
};

在相关系统调用中看对于这个 namespace 的处理

再看看在 sethostname 函数里 UTS namespace 是怎么被处理的。这个函数在 Linux man page 中是这么描述的：sethostname() sets the hostname to the value given in the character array name. The len argument specifies the number of bytes in name. (Thus, name does not require a terminating null byte.)，

/*
 * sethostname(name, len): replace the host name of the caller's UTS
 * namespace. Returns 0, -EPERM (no CAP_SYS_ADMIN in the namespace's
 * user namespace), -EINVAL (len out of range) or -EFAULT (bad user
 * pointer).
 */
SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
{
	int errno;
	char tmp[__NEW_UTS_LEN];

    // the caller needs CAP_SYS_ADMIN in the user namespace tied to its UTS namespace
	if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
		return -EPERM;

    // len must fit in tmp[] (and leave room for the terminating NUL in nodename)
	if (len < 0 || len > __NEW_UTS_LEN)	
		return -EINVAL;
	down_write(&uts_sem);				// take uts_sem for writing
	errno = -EFAULT;
	if (!copy_from_user(tmp, name, len)) {	// copy the new name from user space into the kernel buffer
		struct new_utsname *u = utsname();	// per the original note: &current->nsproxy->uts_ns->name

		memcpy(u->nodename, tmp, len);		// store the new host name in current->nsproxy->uts_ns->name.nodename
		memset(u->nodename + len, 0, sizeof(u->nodename) - len);	// zero the rest so the string is NUL-terminated
		errno = 0;
		uts_proc_notify(UTS_PROC_HOSTNAME);
	}
	up_write(&uts_sem);	// release the write lock
	return errno;
}

由 current 找到关联的 nsproxy，然后找到 uts_ns，然后直接改 hostname 就行了，大部分进程关联的都是根 uts_ns，在容器内的进程关联的是子 uts_ns。

在 gethostname 函数中，流程也差不多，先以读方式获取读写信号量 uts_sem，然后用 utsname() 函数获取 current->nsproxy->uts_ns->name.nodename，然后复制到用户空间，

/*
 * gethostname(name, len): copy the host name of the caller's UTS
 * namespace to user space. At most len bytes are copied, so the result
 * is not NUL-terminated when the buffer is too small.
 * Returns 0, -EINVAL (negative len) or -EFAULT (bad user pointer).
 */
SYSCALL_DEFINE2(gethostname, char __user *, name, int, len)
{
	int i, errno;
	struct new_utsname *u;

	if (len < 0)
		return -EINVAL;
	down_read(&uts_sem);	/* readers only need the semaphore in read mode */
	u = utsname();
	i = 1 + strlen(u->nodename);	/* include the terminating NUL ... */
	if (i > len)
		i = len;		/* ... unless the user buffer is shorter */
	errno = 0;
	if (copy_to_user(name, u->nodename, i))
		errno = -EFAULT;
	up_read(&uts_sem);
	return errno;
}

IPC（Interprocess Communication）namespace

copy_ipcs() 函数

在 create_new_namespaces() 函数的流程中，再看复制 ipc_namespace 的 copy_ipcs 函数：若编译参数 CONFIG_IPC_NS 未被定义过，则 clone 时遇到 CLONE_NEWIPC 参数直接报错；若编译参数 CONFIG_IPC_NS 被定义过，且检查到 CLONE_NEWIPC 参数，则会调用 create_ipc_ns 函数来创建一个新的 namespace，

// include/linux/ipc_namespace.h

/* With CONFIG_IPC_NS: copy_ipcs() is a real function defined out of line. */
#if defined(CONFIG_IPC_NS)
extern struct ipc_namespace *copy_ipcs(unsigned long flags,
	struct user_namespace *user_ns, struct ipc_namespace *ns);

......
    
#else
// Without CONFIG_IPC_NS: requesting CLONE_NEWIPC fails with -EINVAL,
// otherwise the parent's namespace pointer is returned unchanged
static inline struct ipc_namespace *copy_ipcs(unsigned long flags,
	struct user_namespace *user_ns, struct ipc_namespace *ns)
{
	if (flags & CLONE_NEWIPC)
		return ERR_PTR(-EINVAL);

	return ns;
}

......
#endif


// ipc/namespace.c
    
// CONFIG_IPC_NS variant: share the existing namespace unless CLONE_NEWIPC asks for a new one
struct ipc_namespace *copy_ipcs(unsigned long flags,
 struct user_namespace *user_ns, struct ipc_namespace *ns)
{
 if (!(flags & CLONE_NEWIPC))
   return get_ipc_ns(ns);	// take a reference and return the existing namespace
 return create_ipc_ns(user_ns, ns);
}

看这里创建新 namespace 的核心函数 create_ipc_ns，

/*
 * Allocate and initialise a new IPC namespace tied to user_ns.
 * Returns the namespace with count == 1, or an ERR_PTR when the
 * allocation, the inode-number allocation or the mqueue setup fails.
 * old_ns is not referenced in the body shown here.
 */
static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
					   struct ipc_namespace *old_ns)
{
	struct ipc_namespace *ns;
	int err;

	ns = kmalloc(sizeof(struct ipc_namespace), GFP_KERNEL);	// allocate the namespace object
	if (ns == NULL)
		return ERR_PTR(-ENOMEM);

	err = ns_alloc_inum(&ns->ns);	// number for the embedded ns_common
	if (err) {
		kfree(ns);
		return ERR_PTR(err);
	}
	ns->ns.ops = &ipcns_operations;

	atomic_set(&ns->count, 1);	// the creator holds the first reference
	err = mq_init_ns(ns);		// set up the message-queue (mq) state of this namespace
	if (err) {
		ns_free_inum(&ns->ns);
		kfree(ns);
		return ERR_PTR(err);
	}
	atomic_inc(&nr_ipc_ns);		// global count of IPC namespaces

	sem_init_ns(ns);	// per the original note: initialise the semaphore idr pool and parameters
	msg_init_ns(ns);	// per the original note: initialise the message-queue idr pool and parameters
	shm_init_ns(ns);	// per the original note: initialise the shared-memory idr pool and parameters

	ns->user_ns = get_user_ns(user_ns);	// pin the owning user namespace

	return ns;
}

重要结构

ipc_namespace：

/*
 * Per-namespace IPC state: the id tables and limits that used to be
 * global now live here, one instance per IPC namespace.
 */
struct ipc_namespace {
	atomic_t	count;		/* reference count; set to 1 by create_ipc_ns() */
	struct ipc_ids	ids[3];		/* one id table per IPC class; msg_ids(ns) selects ids[IPC_MSG_IDS] */

	int		sem_ctls[4];
	int		used_sems;

	unsigned int	msg_ctlmax;
	unsigned int	msg_ctlmnb;	/* newque() uses it as a new queue's q_qbytes */
	unsigned int	msg_ctlmni;	/* passed to ipc_addid() as the limit on used ids */
	atomic_t	msg_bytes;
	atomic_t	msg_hdrs;

	size_t		shm_ctlmax;
	size_t		shm_ctlall;
	unsigned long	shm_tot;
	int		shm_ctlmni;
	/*
	 * Defines whether IPC_RMID is forced for _all_ shm segments regardless
	 * of shmctl()
	 */
	int		shm_rmid_forced;

	struct notifier_block ipcns_nb;

	/* The kern_mount of the mqueuefs sb.  We take a ref on it */
	struct vfsmount	*mq_mnt;

	/* # queues in this ns, protected by mq_lock */
	unsigned int    mq_queues_count;

	/* next fields are set through sysctl */
	unsigned int    mq_queues_max;   /* initialized to DFLT_QUEUESMAX */
	unsigned int    mq_msg_max;      /* initialized to DFLT_MSGMAX */
	unsigned int    mq_msgsize_max;  /* initialized to DFLT_MSGSIZEMAX */
	unsigned int    mq_msg_default;
	unsigned int    mq_msgsize_default;

	/* user_ns which owns the ipc ns */
	struct user_namespace *user_ns;

	struct ns_common ns;	/* inum and ops are set up in create_ipc_ns() */
};

// v2.6.17-rc4, ipc/msg.c
// Before IPC namespaces existed, the kernel kept three global ipc_ids instances
// (semaphores, message queues, shared memory); msg_ids was the one for message queues
static struct ipc_ids msg_ids;

// v4.1.19, ipc/msg.c
// With namespaces, the message-queue ipc_ids is looked up inside the given namespace,
// so each IPC namespace only reaches its own id table
#define msg_ids(ns)	((ns)->ids[IPC_MSG_IDS])

// Each struct ipc_ids describes one class of IPC resource; the concrete objects
// (e.g. message queues) are found through it
struct ipc_ids {
	int in_use;			// number of ids in use; ipc_addid() checks it against the limit and increments it
	unsigned short seq;		// sequence counter; ipc_addid() copies it into a new object's seq and bumps it, wrapping to 0 past IPCID_SEQ_MAX
	struct rw_semaphore rwsem;	// taken for writing by ipcget_public()
	struct idr ipcs_idr;		// idr holding the kern_ipc_perm of every object
	int next_id;			// id to use for the next object when >= 0; ipc_addid() resets it to -1 after use
};

要理清楚 IPC 全部的逻辑还需要大量的代码，就不放在这里了。看看有没有时间新写一个专门学习 IPC 的文档吧。

在相关系统调用中看对于这个 namespace 的处理

msgget()

/*
 * msgget(key, msgflg): look up or create a message queue in the
 * caller's IPC namespace; the result comes straight from ipcget().
 */
SYSCALL_DEFINE2(msgget, key_t, key, int, msgflg)
{
	struct ipc_namespace *ns;
    
    // callbacks ipcget() uses for message queues:
    // getnew creates a queue, associate is the hook for an already existing one
	static const struct ipc_ops msg_ops = {	
		.getnew = newque,
		.associate = msg_security,
	};			
	struct ipc_params msg_params;

	ns = current->nsproxy->ipc_ns;	// IPC namespace of the calling task

	msg_params.key = key;
	msg_params.flg = msgflg;

    // the generic IPC code does the rest, working on this namespace's message-queue id table
	return ipcget(ns, &msg_ids(ns), &msg_ops, &msg_params);	
}

/*
 * Common entry point for the *get() calls: dispatch on the key.
 * IPC_PRIVATE goes to ipcget_new(); any other key goes through the
 * lookup-or-create path in ipcget_public().
 */
int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids,
			const struct ipc_ops *ops, struct ipc_params *params)
{
	if (params->key == IPC_PRIVATE)
		return ipcget_new(ns, ids, ops, params);	// private key: handled by ipcget_new() (not shown in these notes)
	else
		return ipcget_public(ns, ids, ops, params);	// the path examined below
}

/*
 * Look up params->key in ids. If it is absent, create a new object via
 * ops->getnew() when IPC_CREAT is set, else fail with -ENOENT. If it
 * exists, fail with -EEXIST for IPC_CREAT|IPC_EXCL, otherwise run the
 * optional extra checks and the permission check.
 */
static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids,
		const struct ipc_ops *ops, struct ipc_params *params)
{
	struct kern_ipc_perm *ipcp;
	int flg = params->flg;
	int err;

	/*
	 * Take the lock as a writer since we are potentially going to add
	 * a new entry + read locks are not "upgradable"
	 */
	down_write(&ids->rwsem);
	ipcp = ipc_findkey(ids, params->key);
	if (ipcp == NULL) {
		/* key not used */
		if (!(flg & IPC_CREAT))	// key not found and creation not requested: error
			err = -ENOENT;
		else
			err = ops->getnew(ns, params);	// key not found, create a new object in this namespace
	} else {
		/* ipc object has been locked by ipc_findkey() */

		if (flg & IPC_CREAT && flg & IPC_EXCL)
			err = -EEXIST;	// the object already exists but the caller demanded exclusive creation
		else {
			err = 0;
			if (ops->more_checks)
				err = ops->more_checks(ipcp, params);
			if (!err)
				/*
				 * ipc_check_perms returns the IPC id on
				 * success
				 */
				err = ipc_check_perms(ns, ipcp, ops, params);
		}
		ipc_unlock(ipcp);
	}
	up_write(&ids->rwsem);

	return err;
}

接着看 ops->getnew(ns, params) 这一步，这里的 getnew 函数被设置为 newque 函数，

/**
 * newque - Create a new msg queue
 * @ns: namespace
 * @params: ptr to the structure that contains the key and msgflg
 *
 * Called with msg_ids.rwsem held (writer)
 *
 * Returns the id of the new queue (msq->q_perm.id) or a negative error.
 */
static int newque(struct ipc_namespace *ns, struct ipc_params *params)
{
	struct msg_queue *msq;
	int id, retval;
	key_t key = params->key;
	int msgflg = params->flg;

	msq = ipc_rcu_alloc(sizeof(*msq));
	if (!msq)
		return -ENOMEM;

	msq->q_perm.mode = msgflg & S_IRWXUGO;	/* keep only the permission bits */
	msq->q_perm.key = key;

	msq->q_perm.security = NULL;
	retval = security_msg_queue_alloc(msq);
	if (retval) {
		ipc_rcu_putref(msq, ipc_rcu_free);
		return retval;
	}

	msq->q_stime = msq->q_rtime = 0;
	msq->q_ctime = get_seconds();
	msq->q_cbytes = msq->q_qnum = 0;
	msq->q_qbytes = ns->msg_ctlmnb;		/* queue size limit comes from the namespace */
	msq->q_lspid = msq->q_lrpid = 0;
	INIT_LIST_HEAD(&msq->q_messages);
	INIT_LIST_HEAD(&msq->q_receivers);
	INIT_LIST_HEAD(&msq->q_senders);

	/* ipc_addid() locks msq upon success. */
	id = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni);	// register the queue in this namespace's id table, limited by the namespace's msg_ctlmni
	if (id < 0) {
		ipc_rcu_putref(msq, msg_rcu_free);
		return id;
	}

	/* undo the object lock and rcu_read_lock() that ipc_addid() left held */
	ipc_unlock_object(&msq->q_perm);
	rcu_read_unlock();

	return msq->q_perm.id;
}

/**
 * ipc_addid - add an ipc identifier
 * @ids: ipc identifier set
 * @new: new ipc permission set
 * @size: limit for the number of used ids
 *
 * Add an entry 'new' to the ipc ids idr. The permissions object is
 * initialised and the first free entry is set up and the id assigned
 * is returned. The 'new' entry is returned in a locked state on success.
 * On failure the entry is not locked and a negative err-code is returned.
 *
 * Called with writer ipc_ids.rwsem held.
 */
int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int size)
{
	kuid_t euid;
	kgid_t egid;
	int id;
	int next_id = ids->next_id;

	/* the caller's limit is capped at IPCMNI */
	if (size > IPCMNI)
		size = IPCMNI;

	if (ids->in_use >= size)
		return -ENOSPC;

	idr_preload(GFP_KERNEL);

	spin_lock_init(&new->lock);
	new->deleted = false;
	rcu_read_lock();
	spin_lock(&new->lock);

	/* creator and owner both start out as the caller's effective ids */
	current_euid_egid(&euid, &egid);
	new->cuid = new->uid = euid;
	new->gid = new->cgid = egid;

	/* start at index 0 unless a specific next_id was requested */
	id = idr_alloc(&ids->ipcs_idr, new,
		       (next_id < 0) ? 0 : ipcid_to_idx(next_id), 0,
		       GFP_NOWAIT);
	idr_preload_end();
	if (id < 0) {
		spin_unlock(&new->lock);
		rcu_read_unlock();
		return id;
	}

	ids->in_use++;

	if (next_id < 0) {
		/* normal case: take the running sequence number, wrapping past IPCID_SEQ_MAX */
		new->seq = ids->seq++;
		if (ids->seq > IPCID_SEQ_MAX)
			ids->seq = 0;
	} else {
		/* requested id: derive the sequence from it and clear the request */
		new->seq = ipcid_to_seqx(next_id);
		ids->next_id = -1;
	}

	/* the visible id combines the idr index with the sequence number */
	new->id = ipc_buildid(id, new->seq);
	return id;
}

ipc_addid 函数的作用：在 &msg_ids(ns) 管理的 idr 树中找到一个空闲节点，将 &msq->q_perm 存入这个节点中，并把该节点的序号（idr 下标）作为返回值；同时它用 ipc_buildid(id, new->seq) 把下标和序列号组合起来写入 new->id。newque 最后返回的是 msq->q_perm.id（即这个组合后的 id，而不是单纯的下标），msgget 返回给用户的句柄也就是这个 id，程序可以用它来收发消息。

这里其实并没有全都看懂，但关于 namespace 的主要思想就是把原有的全局变量 msg_ids 变成了各个 namespace 实例中的成员 (ns)->ids[IPC_MSG_IDS]。进程做 IPC 操作时，内核会把 current->nsproxy->ipc_ns 作为 ns 代入 msg_ids(ns)，也就是说，当前进程能查找到的消息队列只会是在同一个 ipc namespace 中创建的。

PID namespace

copy_pid_ns() 函数

再看复制 pid_namespace 的 copy_pid_ns 函数，

/*
 * Return the pid namespace for a new nsproxy: the old one (with an
 * extra reference) unless CLONE_NEWPID is set, otherwise a new child
 * namespace of old_ns.
 */
struct pid_namespace *copy_pid_ns(unsigned long flags,
	struct user_namespace *user_ns, struct pid_namespace *old_ns)
{
	if (!(flags & CLONE_NEWPID))	// no new pid namespace requested: share
		return get_pid_ns(old_ns);
	// a new pid namespace may only be created when old_ns is the namespace the caller
	// is actually running in; otherwise -EINVAL
	if (task_active_pid_ns(current) != old_ns)
		return ERR_PTR(-EINVAL);
	return create_pid_namespace(user_ns, old_ns);	// create the child pid namespace
}

看 create_pid_namespace 函数，基本就是新建一个 struct pid_namespace 变量，然后设置它的各变量并返回的过程，

/*
 * Allocate and initialise a pid namespace one level below
 * parent_pid_ns. Returns the namespace or an ERR_PTR: -EINVAL when the
 * nesting limit is exceeded, -ENOMEM for allocation failures, or the
 * error from ns_alloc_inum().
 */
static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
	struct pid_namespace *parent_pid_ns)
{
	struct pid_namespace *ns;
	unsigned int level = parent_pid_ns->level + 1;	// the new namespace sits one level deeper than its parent
	int i;
	int err;

    // nesting depth is bounded by MAX_PID_NS_LEVEL (32 according to the original note)
	if (level > MAX_PID_NS_LEVEL) {	
		err = -EINVAL;
		goto out;
	}

	err = -ENOMEM;
	ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);	// zeroed namespace object from its slab cache
	if (ns == NULL)
		goto out;

	// first page of the pid allocation bitmap
	ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
	if (!ns->pidmap[0].page)
		goto out_free;

	// cache for struct pid objects of this namespace
	// NOTE(review): level + 1 presumably sizes the numbers[] array, one upid per level -- confirm
	ns->pid_cachep = create_pid_cachep(level + 1);
	if (ns->pid_cachep == NULL)
		goto out_free_map;

	err = ns_alloc_inum(&ns->ns);
	if (err)
		goto out_free_map;
	ns->ns.ops = &pidns_operations;

	kref_init(&ns->kref);
	ns->level = level;
	ns->parent = get_pid_ns(parent_pid_ns);	// the child pins its parent
	ns->user_ns = get_user_ns(user_ns);
	ns->nr_hashed = PIDNS_HASH_ADDING;
	INIT_WORK(&ns->proc_work, proc_cleanup_work);

	// bit 0 is marked used up front, so the first page has one fewer free slot
	set_bit(0, ns->pidmap[0].page);
	atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);

	// the remaining bitmap entries start out completely free
	for (i = 1; i < PIDMAP_ENTRIES; i++)
		atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);

	return ns;

out_free_map:
	kfree(ns->pidmap[0].page);
out_free:
	kmem_cache_free(pid_ns_cachep, ns);
out:
	return ERR_PTR(err);
}

重要结构

struct pid_namespace 的结构：

struct pid_namespace {
	struct kref kref;	// reference count
	struct pidmap pidmap[PIDMAP_ENTRIES];	// pid allocation bitmap; a set bit means the pid is taken
	struct rcu_head rcu;
	int last_pid;		// last pid handed out (per the original note, allocation normally continues at last_pid + 1)
	unsigned int nr_hashed;
	struct task_struct *child_reaper;
	struct kmem_cache *pid_cachep;	// slab cache used to allocate struct pid
	unsigned int level;	// depth of this pid namespace (parent's level + 1)
	struct pid_namespace *parent;	// parent pid namespace
#ifdef CONFIG_PROC_FS
	struct vfsmount *proc_mnt;
	struct dentry *proc_self;
	struct dentry *proc_thread_self;
#endif
#ifdef CONFIG_BSD_PROCESS_ACCT
	struct fs_pin *bacct;
#endif
	struct user_namespace *user_ns;	// user namespace recorded by create_pid_namespace()
	struct work_struct proc_work;
	kgid_t pid_gid;
	int hide_pid;
	int reboot;	/* group exit code if this pidns was rebooted */
	struct ns_common ns;
};

理解 pid 还需要再了解其他几个重要结构。

一个进程对应一个 task struct，但是这个进程在多个 pidns 中可以看到不同的 pid，对于这些 pid 的管理，主要通过两个结构体来实现。

struct upid {	// one (pid number, pid namespace) pair: the pid value a process has at one namespace level
	/* Try to keep pid_chain in the same cacheline as nr for find_vpid */
	int nr;		// numeric pid value
	struct pid_namespace *ns;		// namespace in which nr is valid
	struct hlist_node pid_chain;	// hash-list linkage
};

struct pid	// per the original note: corresponds one-to-one with a process in the vast majority of cases
{
	atomic_t count;
	unsigned int level;		// deepest namespace level of this pid; pid_nr_ns() only indexes numbers[] up to it
	/* lists of tasks that use this pid */
	struct hlist_head tasks[PIDTYPE_MAX];	// tasks using this pid, one list per pid type
	struct rcu_head rcu;
	struct upid numbers[1];	// this pid's number in each namespace level; declared with size 1, but per the original note the allocation can be sized for more entries
};

在相关系统调用中看对于这个 namespace 的处理

看一下在 getpid 函数中，进程获取 pid 时是怎么处理 pid namespace 的：

/* getpid(): the caller's thread-group id as numbered in the caller's own pid namespace. */
SYSCALL_DEFINE0(getpid)
{
	return task_tgid_vnr(current);
}

static inline pid_t task_tgid_vnr(struct task_struct *tsk)
{
    // Per the original note, task_tgid(tsk) yields the struct pid of tsk's thread-group leader.
    // NOTE(review): using the group leader is presumably what makes every thread of a
    // process see the same getpid() value -- confirm against task_tgid().
	return pid_vnr(task_tgid(tsk));
}

/* Translate a struct pid into the number it has in the current task's active pid namespace. */
pid_t pid_vnr(struct pid *pid)
{
	return pid_nr_ns(pid, task_active_pid_ns(current));
}

/*
 * Return the numeric value of pid as seen from namespace ns, or 0 when
 * pid is NULL or has no number in ns.
 */
pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
{
	struct upid *upid;
	pid_t nr = 0;

	// a pid only has numbers for levels 0..pid->level, so a deeper ns cannot see it
	if (pid && ns->level <= pid->level) {
		upid = &pid->numbers[ns->level];	// the upid recorded for that level
		// The level matching is not enough: the namespace itself must match.
		// NOTE(review): presumably this fails when ns is a different namespace at the same
		// depth (not one this pid was numbered in); 0 is returned then -- confirm.
		if (upid->ns == ns)
			nr = upid->nr;
	}
	return nr;
}

Mount namespaces

copy_mnt_ns() 函数

/*
 * Return the mount namespace for a new nsproxy.
 *
 * Parameters:
 *   flags   - clone flags; only CLONE_NEWNS is examined here.
 *   ns      - the mount namespace to share or copy; must not be NULL.
 *   user_ns - user namespace passed to alloc_mnt_ns() for the new namespace.
 *   new_fs  - fs_struct whose root and pwd mounts are switched over to the
 *             corresponding mounts of the copy; may be NULL.
 *
 * Without CLONE_NEWNS, ns is returned with an extra reference. Otherwise
 * a new namespace holding a copy of the mount tree is returned, or an
 * ERR_PTR when the allocation or the tree copy fails.
 */
struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
		struct user_namespace *user_ns, struct fs_struct *new_fs)
{
	struct mnt_namespace *new_ns;
	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
	struct mount *p, *q;
	struct mount *old;
	struct mount *new;
	int copy_flags;

	BUG_ON(!ns);	// a NULL ns is a kernel bug

	if (likely(!(flags & CLONE_NEWNS))) {
		get_mnt_ns(ns);
		return ns;
	}

	old = ns->root;

	new_ns = alloc_mnt_ns(user_ns);	// allocate the new mount namespace; an ERR_PTR is passed straight back
	if (IS_ERR(new_ns))
		return new_ns;

	namespace_lock();	// held across both passes below
	/* First pass: copy the tree topology */
	copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
	// extra copy flags apply when the new namespace belongs to a different user namespace
	if (user_ns != ns->user_ns)
		copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED;
	new = copy_tree(old, old->mnt.mnt_root, copy_flags);	// copy the mount tree rooted at the old namespace's root, as directed by copy_flags
	if (IS_ERR(new)) {
		namespace_unlock();
		free_mnt_ns(new_ns);
		return ERR_CAST(new);
	}
	new_ns->root = new;
	list_add_tail(&new_ns->list, &new->mnt_list);

	/*
	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
	 * as belonging to new namespace.  We have already acquired a private
	 * fs_struct, so tsk->fs->lock is not needed.
	 */
    // walk the old tree (p) and its copy (q) in lockstep: stamp every copied mount with
    // new_ns, and where the old mount is new_fs's root or pwd, point new_fs at the copy
	p = old;
	q = new;
	while (p) {
		q->mnt_ns = new_ns;
		if (new_fs) {
			if (&p->mnt == new_fs->root.mnt) {
				new_fs->root.mnt = mntget(&q->mnt);
				rootmnt = &p->mnt;	// remember the old mount; its reference is dropped after unlocking
			}
			if (&p->mnt == new_fs->pwd.mnt) {
				new_fs->pwd.mnt = mntget(&q->mnt);
				pwdmnt = &p->mnt;
			}
		}
		p = next_mnt(p, old);
		q = next_mnt(q, new);
		if (!q)
			break;
		// NOTE(review): presumably copy_tree() can leave out some mounts, so p is
		// advanced until it lines up with q again -- confirm
		while (p->mnt.mnt_root != q->mnt.mnt_root)
			p = next_mnt(p, old);
	}
	namespace_unlock();

	// release the references new_fs held on the old root/pwd mounts
	if (rootmnt)
		mntput(rootmnt);
	if (pwdmnt)
		mntput(pwdmnt);

	return new_ns;	// the new mount namespace
}

重要结构

struct mnt_namespace {
	atomic_t		count;	// reference count (taken via get_mnt_ns(), dropped via put_mnt_ns())
	struct ns_common	ns;	// common namespace bookkeeping (struct ns_common)
	struct mount *	root;	// root mount of this namespace; copy_mnt_ns() sets it to the copied tree
	struct list_head	list;	// list of the mounts in this namespace; linked with each mount's mnt_list
	struct user_namespace	*user_ns;	// user namespace associated with this mount namespace
	u64			seq;		// sequence number; per the original note it guards against reference loops between namespaces -- TODO confirm
	wait_queue_head_t poll;	// wait queue; per the original note, for tasks waiting on mount events -- TODO confirm
	u64 event;	// per the original note, a mount-event marker shared between waiters -- TODO confirm
};

// Descriptor of a mounted filesystem.
// NOTE(review): this layout does not match the copy_mnt_ns() excerpt in these notes, which
// works on struct mount with an embedded vfsmount (p->mnt, q->mnt_ns); this definition looks
// like it was taken from a different (older / vendor) kernel -- confirm against the 4.1.19 tree.
struct vfsmount {
	struct list_head mnt_hash;
	struct vfsmount *mnt_parent;	/* fs we are mounted on */
	struct dentry *mnt_mountpoint;	/* dentry of mountpoint (the directory mounted on) */
	struct dentry *mnt_root;	/* root of the mounted tree (root directory of the filesystem) */
	struct super_block *mnt_sb;	/* pointer to superblock */
	struct list_head mnt_mounts;	/* list of children, anchored here (child mounts) */
	struct list_head mnt_child;	/* and going through their mnt_child (entry in the parent's mnt_mounts list) */
	int mnt_flags;
	__u32 rh_reserved;		/* for use with fanotify */
	struct hlist_head rh_reserved2;	/* for use with fanotify */
	const char *mnt_devname;	/* Name of device e.g. /dev/dsk/hda1 */
	struct list_head mnt_list;  /* entry in mnt_namespace->list */
	struct list_head mnt_expire;	/* link in fs-specific expiry list */
	struct list_head mnt_share;	/* circular list of shared mounts */
	struct list_head mnt_slave_list;/* list of slave mounts */
	struct list_head mnt_slave;	/* slave list entry */
	struct vfsmount *mnt_master;	/* slave is on master->mnt_slave_list */
	struct mnt_namespace *mnt_ns;	/* containing namespace */
	int mnt_id;			/* mount identifier */
	int mnt_group_id;		/* peer group identifier */
	/*
	 * We put mnt_count & mnt_expiry_mark at the end of struct vfsmount
	 * to let these frequently modified fields in a separate cache line
	 * (so that reads of mnt_flags wont ping-pong on SMP machines)
	 */
	atomic_t mnt_count;
	int mnt_expiry_mark;		/* true if marked for expiry */
	int mnt_pinned;
	int mnt_ghosts;
#ifdef CONFIG_SMP
	int *mnt_writers;
#else
	int mnt_writers;
#endif
};

在相关系统调用中看对于这个 namespace 的处理

mount() 里到底在哪处理 mnt namespace 的还没看到，这部分尚未完成。

// fs/namespace.c
// mount(dev_name, dir_name, type, flags, data): dev_name is the device to mount, dir_name the
// mount point, type the filesystem type, flags the mount flags, data the filesystem-specific options.
// The user-space arguments are copied into the kernel, do_mount() does the work, and the
// copies are released in reverse order.
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
		char __user *, type, unsigned long, flags, void __user *, data)
{
	int ret;
	char *kernel_type;
	char *kernel_dev;
	unsigned long data_page;

	kernel_type = copy_mount_string(type);	// kernel copy of the type string
	ret = PTR_ERR(kernel_type);
	if (IS_ERR(kernel_type))
		goto out_type;

	kernel_dev = copy_mount_string(dev_name);	// kernel copy of the device name
	ret = PTR_ERR(kernel_dev);
	if (IS_ERR(kernel_dev))
		goto out_dev;

	ret = copy_mount_options(data, &data_page);	// copy the option data into a kernel page; its address is returned in data_page
	if (ret < 0)
		goto out_data;

	ret = do_mount(kernel_dev, dir_name, kernel_type, flags,
		(void *) data_page);	// the actual mount operation

	free_page(data_page);
out_data:
	kfree(kernel_dev);
out_dev:
	kfree(kernel_type);
out_type:
	return ret;
}

/*
 * do_mount - validate the arguments, resolve the mount point and
 * dispatch to the handler selected by flags.
 *
 * dev_name:  device name; passed on to the bind / move / new-mount handlers.
 * dir_name:  path of the mount point; must be a non-empty string that is
 *            NUL-terminated within PAGE_SIZE bytes.
 * type_page: filesystem type string; only used for a new mount (and the
 *            security hook).
 * flags:     MS_* bit mask. Per-mountpoint bits are translated into
 *            mnt_flags; the rest selects remount / bind / propagation
 *            change / move / new mount.
 * data_page: page of filesystem-specific option data, or NULL.
 *
 * Returns 0 or a negative error code.
 */
long do_mount(char *dev_name, char *dir_name, char *type_page,
		  unsigned long flags, void *data_page)
{
	struct path path;
	int retval = 0;
	int mnt_flags = 0;

	/* Discard magic */
	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
		flags &= ~MS_MGC_MSK;

	/* Basic sanity checks */
	// reject a NULL or empty dir_name, or one with no NUL terminator within PAGE_SIZE bytes
	if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))
		return -EINVAL;
	// force the last byte of the option page to 0 so it is always terminated
	if (data_page)
		((char *)data_page)[PAGE_SIZE - 1] = 0;

	/* ... and get the mountpoint */
	retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
	if (retval)
		return retval;

	retval = security_sb_mount(dev_name, &path,
				   type_page, flags, data_page);
	if (retval)
		goto dput_out;

	/* Default to relatime unless overriden */
	if (!(flags & MS_NOATIME))
		mnt_flags |= MNT_RELATIME;

	/* Separate the per-mountpoint flags */
	if (flags & MS_NOSUID)
		mnt_flags |= MNT_NOSUID;
	if (flags & MS_NODEV)
		mnt_flags |= MNT_NODEV;
	if (flags & MS_NOEXEC)
		mnt_flags |= MNT_NOEXEC;
	if (flags & MS_NOATIME)
		mnt_flags |= MNT_NOATIME;
	if (flags & MS_NODIRATIME)
		mnt_flags |= MNT_NODIRATIME;
	if (flags & MS_STRICTATIME)
		mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
	if (flags & MS_RDONLY)
		mnt_flags |= MNT_READONLY;

	/* these bits have been consumed above (or are internal); strip them before dispatching */
	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
		   MS_STRICTATIME);

	/* exactly one handler runs; the first matching flag wins */
	if (flags & MS_REMOUNT)
		retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
				    data_page);
	else if (flags & MS_BIND)
		retval = do_loopback(&path, dev_name, flags & MS_REC);
	else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
		retval = do_change_type(&path, flags);
	else if (flags & MS_MOVE)
		retval = do_move_mount(&path, dev_name);
	else
		retval = do_new_mount(&path, type_page, flags, mnt_flags,
				      dev_name, data_page);
dput_out:
	path_put(&path);
	return retval;
}

/*
 * Abridged excerpt of do_new_mount(); "......" marks elided code
 * (including the step that produces mnt).
 */
static int do_new_mount(struct path *path, const char *fstype, int flags,
			int mnt_flags, const char *name, void *data)
{
	struct file_system_type *type;
	/* the mount namespace enters here: the user namespace of the caller's mount namespace */
	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
	struct vfsmount *mnt;
	int err;

	......

	/* attach the new mount at path; on failure drop the reference to it */
	err = do_add_mount(real_mount(mnt), path, mnt_flags);
	if (err)
		mntput(mnt);
	return err;
}

/*
 * Attach newmnt at path.
 * Returns 0, -EINVAL (the mount point's mount fails check_mnt() and the
 * exceptions below do not apply, or newmnt's root is a symlink), -EBUSY
 * (same filesystem already mounted at this exact mount point), or the
 * error from lock_mount() / graft_tree().
 */
static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
{
	struct mountpoint *mp;
	struct mount *parent;
	int err;

	mnt_flags &= ~MNT_INTERNAL_FLAGS;

	mp = lock_mount(path);
	if (IS_ERR(mp))
		return PTR_ERR(mp);

	parent = real_mount(path->mnt);
	err = -EINVAL;
	/* NOTE(review): check_mnt() presumably tests that parent belongs to the caller's mount namespace -- confirm */
	if (unlikely(!check_mnt(parent))) {
		/* that's acceptable only for automounts done in private ns */
		if (!(mnt_flags & MNT_SHRINKABLE))
			goto unlock;
		/* ... and for those we'd better have mountpoint still alive */
		if (!parent->mnt_ns)
			goto unlock;
	}

	/* Refuse the same filesystem on the same mount point */
	err = -EBUSY;
	if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
	    path->mnt->mnt_root == path->dentry)
		goto unlock;

	err = -EINVAL;
	if (d_is_symlink(newmnt->mnt.mnt_root))
		goto unlock;

	newmnt->mnt.mnt_flags = mnt_flags;
	err = graft_tree(newmnt, parent, mp);

unlock:
	unlock_mount(mp);
	return err;
}

Network namespace

copy_net_ns() 函数

/*
 * copy_net_ns: return the network namespace a new task should use.
 *
 * Without CLONE_NEWNET the old namespace is returned with an extra
 * reference.  With CLONE_NEWNET a new struct net is allocated, initialized
 * by setup_net() and, on success, linked into net_namespace_list.
 * Returns the namespace, or an ERR_PTR() on failure.
 */
struct net *copy_net_ns(unsigned long flags,
			struct user_namespace *user_ns, struct net *old_net)
{
	struct net *net;
	int rv;

	if (!(flags & CLONE_NEWNET))	// no CLONE_NEWNET: keep sharing the old net namespace
		return get_net(old_net);

	net = net_alloc();				// allocate the new struct net
	if (!net)
		return ERR_PTR(-ENOMEM);

	get_user_ns(user_ns);	// reference taken for the new net; dropped below if setup fails

	mutex_lock(&net_mutex);			// setup_net() must be called with net_mutex held (it walks pernet_list)
	rv = setup_net(net, user_ns);	// runs the initializers for the new namespace object
	if (rv == 0) {
		// publish the new namespace on the global list, under the rtnl lock
		rtnl_lock();
		list_add_tail_rcu(&net->list, &net_namespace_list);
		rtnl_unlock();
	}
	mutex_unlock(&net_mutex);		// release net_mutex
	if (rv < 0) {
		// setup failed: undo the user_ns reference and drop the new net
		put_user_ns(user_ns);
		net_drop_ns(net);
		return ERR_PTR(rv);
	}
	return net;
}

/*
 * setup_net runs the initializers for the network namespace object.
 */
static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
{
	/* Must be called with net_mutex held */
	const struct pernet_operations *ops, *saved_ops;
	int error = 0;
	LIST_HEAD(net_exit_list);

	atomic_set(&net->count, 1);		// reference count: decides when the namespace is shut down (see struct net)
	atomic_set(&net->passive, 1);	// second reference count: decides when the namespace is freed (see struct net)
	net->dev_base_seq = 1;			// initial value; NOTE(review): presumably a sequence counter, not a device index (struct net has a separate ifindex) - confirm
	net->user_ns = user_ns;			// remember the owning user namespace
	idr_init(&net->netns_ids);

    // Walk every entry on pernet_list and run its init via ops_init(); stop at the first failure
	list_for_each_entry(ops, &pernet_list, list) {
		error = ops_init(ops, net);	
		if (error < 0)
			goto out_undo;
	}
out:
	return error;

out_undo:
	/* Walk through the list backwards calling the exit functions
	 * for the pernet modules whose init functions did not fail.
	 */
	list_add(&net->exit_list, &net_exit_list);
	saved_ops = ops;
	list_for_each_entry_continue_reverse(ops, &pernet_list, list)
		ops_exit_list(ops, &net_exit_list);

	/* second backwards pass from the same starting point, this time freeing */
	ops = saved_ops;
	list_for_each_entry_continue_reverse(ops, &pernet_list, list)
		ops_free_list(ops, &net_exit_list);

	rcu_barrier();
	goto out;
}

重要结构

// file: include/linux/netdevice.h
// Abridged excerpt: only the two fields relevant to network namespaces are shown.
struct net_device{
 // interface name
 char   name[IFNAMSIZ];

 // network namespace this device belongs to; possible_net_t is embedded by
 // value (it wraps the struct net pointer) and is accessed by address,
 // the same way write_pnet(&sk->sk_net, net) accesses a socket's namespace
 possible_net_t  nd_net;

 /* ... remaining fields elided ... */
};

//file:include/net/net_namespace.h
// Per-network-namespace state; one instance is set up by setup_net() for each namespace.
struct net {
	atomic_t		passive;	/* To decided when the network
						 * namespace should be freed.
						 */
	atomic_t		count;		/* To decided when the network
						 *  namespace should be shut down.
						 */
	spinlock_t		rules_mod_lock;

	atomic64_t		cookie_gen;

	struct list_head	list;	/* list of network namespaces: links this net into the list of all namespaces */
	struct list_head	cleanup_list;	/* namespaces on death row: links namespaces that are about to be destroyed */
	struct list_head	exit_list;	/* Use only net_mutex */

	struct user_namespace   *user_ns;	/* Owning user namespace */
	struct idr		netns_ids;	// IDR, initialized with idr_init() in setup_net(); by its name it tracks network-namespace ids

	struct ns_common	ns;

	struct proc_dir_entry 	*proc_net;	// procfs entry for this namespace's /proc/net directory
	struct proc_dir_entry 	*proc_net_stat;	// procfs entry for this namespace's /proc/net/stat directory

#ifdef CONFIG_SYSCTL
	struct ctl_table_set	sysctls;
#endif

	struct sock 		*rtnl;			/* rtnetlink socket */
	struct sock		*genl_sock;

	struct list_head 	dev_base_head;
	struct hlist_head 	*dev_name_head;
	struct hlist_head	*dev_index_head;
	unsigned int		dev_base_seq;	/* protected by rtnl_mutex */
	int			ifindex;
	unsigned int		dev_unreg_count;

	/* core fib_rules */
	struct list_head	rules_ops;

	// every struct net carries a pointer to its own loopback device
	struct net_device       *loopback_dev;          /* The loopback */
	struct netns_core	core;
	struct netns_mib	mib;
	struct netns_packet	packet;
	struct netns_unix	unx;
    // per-namespace IPv4 state; NOTE(review): said to hold the namespace's own routing tables, packet-filter tables and kernel parameters - struct netns_ipv4 is not shown here, confirm
	struct netns_ipv4	ipv4;	
#if IS_ENABLED(CONFIG_IPV6)
	struct netns_ipv6	ipv6;
#endif
#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
	struct netns_ieee802154_lowpan	ieee802154_lowpan;
#endif
#if defined(CONFIG_IP_SCTP) || defined(CONFIG_IP_SCTP_MODULE)
	struct netns_sctp	sctp;
#endif
#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE)
	struct netns_dccp	dccp;
#endif
#ifdef CONFIG_NETFILTER
	struct netns_nf		nf;
	struct netns_xt		xt;
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
	struct netns_ct		ct;
#endif
#if defined(CONFIG_NF_TABLES) || defined(CONFIG_NF_TABLES_MODULE)
	struct netns_nftables	nft;
#endif
#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
	struct netns_nf_frag	nf_frag;
#endif
	struct sock		*nfnl;
	struct sock		*nfnl_stash;
#endif
#ifdef CONFIG_WEXT_CORE
	struct sk_buff_head	wext_nlevents;
#endif
	struct net_generic __rcu	*gen;

	/* Note : following structs are cache line aligned */
#ifdef CONFIG_XFRM
	struct netns_xfrm	xfrm;
#endif
#if IS_ENABLED(CONFIG_IP_VS)
	struct netns_ipvs	*ipvs;
#endif
#if IS_ENABLED(CONFIG_MPLS)
	struct netns_mpls	mpls;
#endif
	struct sock		*diag_nlsk;
	atomic_t		fnhe_genid;
};

在相关系统调用中看对于这个 namespace 的处理

socket 创建：

// net/socket.c
// Socket creation: forwards to __sock_create() with the calling task's network namespace
int sock_create(int family, int type, int protocol, struct socket **res)
{
 // the namespace is taken from current->nsproxy->net_ns; the trailing 0 is passed through unchanged
 return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
}

// include/net/sock.h
// Call chain: __sock_create => inet_create => sk_alloc
// Records @net as the network namespace of socket @sk.
static inline
void sock_net_set(struct sock *sk, struct net *net)
{
	write_pnet(&sk->sk_net, net);	// associate the new socket with its network namespace
}

// include/net/net_namespace.h
// Stores @net into the possible_net_t wrapper at @pnet.
// Without CONFIG_NET_NS the body is empty and nothing is stored.
static inline void write_pnet(possible_net_t *pnet, struct net *net)
{
#ifdef CONFIG_NET_NS
	pnet->net = net;
#endif
}

User namespaces

User namespace 不和其他 namespace 放在一起可能是因为 user namespace 会影响其他 namespace 的执行结果。具体的例子还没看到。

user namespace 的内容和其他5个 namespace 比起来有点特殊，所以很多东西就放在这里单独看。先来一张结构图：

一个 task_struct 结构内不止有一个 user_namespace：

本进程所在的 user namespace 为 task->real_cred->user_ns，调用 getuid()/getgid() 所获得的返回值就是由这个 user_namespace 决定的（getpid() 的返回值由 pid namespace 决定，与 user namespace 无关）。
其他类型的 namespace 中会有指针成员链接到 user_namespace（即 xxx_ns->user_ns），这个 user namespace 就是创建相应 namespace 时进程所属的 user namespace，相当于每个 namespace 都有一个 owner(user namespace)。

这两个 user_namespace 不一定相等。猜测：如果一个进程继承了父进程的 user_namespace，但某个 namespace 用的是别的进程的（也即该进程并非所有 namespace 都是用的父进程的），那是不是就有可能出现自己的 user_namespace 和某个 namespace 的 user_namespace 不相同的情况？

user_namespace 是呈现树状结构的，上述的两种 user_namespace，每一个都是其中的节点。

create_user_ns()

创建 user namespace 的地方和创建其他 namespace 的不一样，调用路径为 copy_process() → copy_creds() → create_user_ns()，

/*
 * Create a new user namespace, deriving the creator from the user in the
 * passed credentials, and replacing that user with the new root user for the
 * new namespace.
 *
 * This is called by copy_creds(), which will finish setting the target task's
 * credentials.
 */
int create_user_ns(struct cred *new)
{
	struct user_namespace *ns, *parent_ns = new->user_ns;
	kuid_t owner = new->euid;
	kgid_t group = new->egid;
	int ret;

	if (parent_ns->level > 32)	// nesting depth of user namespaces is limited
		return -EUSERS;

	/*
	 * Verify that we can not violate the policy of which files
	 * may be accessed that is specified by the root directory,
	 * by verifing that the root directory is at the root of the
	 * mount namespace which allows all files to be accessed.
	 */
	if (current_chrooted())
		return -EPERM;

	/* The creator needs a mapping in the parent user namespace
	 * or else we won't be able to reasonably tell userspace who
	 * created a user_namespace.
	 */
	if (!kuid_has_mapping(parent_ns, owner) ||
	    !kgid_has_mapping(parent_ns, group))
		return -EPERM;

	ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL);	// allocate the new, zeroed user namespace
	if (!ns)
		return -ENOMEM;

	ret = ns_alloc_inum(&ns->ns);
	if (ret) {
		/* could not get an inode number: free the namespace again */
		kmem_cache_free(user_ns_cachep, ns);
		return ret;
	}
	ns->ns.ops = &userns_operations;

	atomic_set(&ns->count, 1);
    // fill in the members of the new user namespace
	/* Leave the new->user_ns reference with the new user namespace. */
	ns->parent = parent_ns;				// parent node in the namespace tree
	ns->level = parent_ns->level + 1;	// one level deeper than the parent
	ns->owner = owner;					// owner uid: the creator's euid
	ns->group = group;					// owner gid: the creator's egid

	/* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
	mutex_lock(&userns_state_mutex);
	ns->flags = parent_ns->flags;
	mutex_unlock(&userns_state_mutex);

	set_cred_user_ns(new, ns);	// install the new user namespace in cred->user_ns and reset capabilities

#ifdef CONFIG_PERSISTENT_KEYRINGS
	init_rwsem(&ns->persistent_keyring_register_sem);
#endif
	return 0;
}

/* Point @cred at @user_ns and reset its securebits and capability sets. */
static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
{
	/* Start with the same capabilities as init but useless for doing
	 * anything as the capabilities are bound to the new user namespace.
	 */
    // the credentials get full permitted/effective/bounding sets and an empty inheritable set
	cred->securebits = SECUREBITS_DEFAULT;
	cred->cap_inheritable = CAP_EMPTY_SET;
	cred->cap_permitted = CAP_FULL_SET;
	cred->cap_effective = CAP_FULL_SET;
	cred->cap_bset = CAP_FULL_SET;
#ifdef CONFIG_KEYS
	key_put(cred->request_key_auth);
	cred->request_key_auth = NULL;
#endif
	/* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
	cred->user_ns = user_ns;	// replace the user_ns recorded in cred with the new one
}

get_user_ns()

user namespace 在 create_new_namespaces() 中是没有相应的 copy_xxx 函数的：它本身是经由 copy_creds() → create_user_ns() 创建的，而其他5个 namespace 各自保存了一个指向它的指针成员（xxx_ns->user_ns）。

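在 clone_uts_ns()、create_ipc_ns() 和 create_pid_namespace() 中，都有下面这个语句（net namespace 的写法略有不同：copy_net_ns() 先调用 get_user_ns(user_ns)，再由 setup_net() 执行 net->user_ns = user_ns；mnt namespace 还没看）：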

ns->user_ns = get_user_ns(user_ns);

/* Take a reference on @ns (if it is non-NULL) and return the same pointer. */
static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
{
	if (ns)	// a NULL namespace is tolerated and passed through
		atomic_inc(&ns->count);	// bump the reference count
	return ns;	// hand back the pointer unchanged
}

也就是说，如果需要新建某个 namespace 的话，新 namespace 的 user_ns 成员就指向创建时传入的 user_ns（不会新建 user namespace），同时把该 user_ns 的引用计数加一。

重要结构

// A user namespace: holds the uid/gid/projid mappings, a reference count,
// the link to the parent namespace with the nesting level, and the uid/gid of its owner.
struct user_namespace {
	struct uid_gid_map	uid_map;
	struct uid_gid_map	gid_map;
	struct uid_gid_map	projid_map;
	atomic_t		count;	// reference count; get_user_ns() increments it for every holder, so it is not just a count of processes
	struct user_namespace	*parent;	// parent user_namespace
	int			level;		// nesting depth (parent's level + 1, see create_user_ns())
	kuid_t			owner;
	kgid_t			group;
	struct ns_common	ns;	// common namespace part (inode number and ops are set in create_user_ns())
	unsigned long		flags;

	/* Register of per-UID persistent keyrings for this namespace */
#ifdef CONFIG_PERSISTENT_KEYRINGS
	struct key		*persistent_keyring_register;
	struct rw_semaphore	persistent_keyring_register_sem;
#endif
};

/*
 * The security context of a task
 *
 * The parts of the context break down into two categories:
 *
 *  (1) The objective context of a task.  These parts are used when some other
 *	task is attempting to affect this one.
 *
 *  (2) The subjective context.  These details are used when the task is acting
 *	upon another object, be that a file, a task, a key or whatever.
 *
 * Note that some members of this structure belong to both categories - the
 * LSM security pointer for instance.
 *
 * A task has two security pointers.  task->real_cred points to the objective
 * context that defines that task's actual details.  The objective part of this
 * context is used whenever that task is acted upon.
 *
 * task->cred points to the subjective context that defines the details of how
 * that task is going to act upon another object.  This may be overridden
 * temporarily to point to another security context, but normally points to the
 * same context as task->real_cred.
 */
struct cred {	// credentials of a task: user and group IDs, capability sets, keyrings, LSM data, owning user namespace
	atomic_t	usage;
#ifdef CONFIG_DEBUG_CREDENTIALS
	atomic_t	subscribers;	/* number of processes subscribed */
	void		*put_addr;
	unsigned	magic;
#define CRED_MAGIC	0x43736564
#define CRED_MAGIC_DEAD	0x44656144
#endif
	kuid_t		uid;		// real user ID of the task
	kgid_t		gid;		// real group ID of the task
	kuid_t		suid;		/* saved UID of the task */
	kgid_t		sgid;		/* saved GID of the task */
	kuid_t		euid;		// effective user ID of the task
	kgid_t		egid;		// effective group ID of the task
	kuid_t		fsuid;		// filesystem user ID, used for permission checks on filesystem operations
	kgid_t		fsgid;		// filesystem group ID, used for permission checks on filesystem operations
	unsigned	securebits;	/* SUID-less security management */
	kernel_cap_t	cap_inheritable; /* caps our children can inherit */
	kernel_cap_t	cap_permitted;	/* caps we're permitted */
	kernel_cap_t	cap_effective;	/* caps we can actually use */
	kernel_cap_t	cap_bset;	/* capability bounding set */
#ifdef CONFIG_KEYS
	unsigned char	jit_keyring;	/* default keyring to attach requested
					 * keys to */
	struct key __rcu *session_keyring; /* keyring inherited over fork */
	struct key	*process_keyring; /* keyring private to this process */
	struct key	*thread_keyring; /* keyring private to this thread */
	struct key	*request_key_auth; /* assumed request_key authority */
#endif
#ifdef CONFIG_SECURITY
	void		*security;	/* subjective LSM security */
#endif
	struct user_struct *user;	/* real user ID subscription */
	struct user_namespace *user_ns;	// user namespace these credentials belong to
	struct group_info *group_info;	/* supplementary groups for euid/fsgid */
	struct rcu_head	rcu;		/* RCU deletion hook */
};

在相关系统调用中看对于这个 namespace 的处理

getuid()

/* getuid(): report the caller's real UID as seen from its own user namespace. */
SYSCALL_DEFINE0(getuid)
{
	/* Only we change this so SMP safe */
    // current_uid() yields the kernel-internal (global) kuid and current_user_ns() the caller's user namespace; the kuid is translated into that namespace's uid
	return from_kuid_munged(current_user_ns(), current_uid());
}

/*
 * Translate @kuid into a uid as seen from user namespace @targ.
 * Unlike a plain from_kuid() this never returns (uid_t)-1: when the
 * translation fails, overflowuid is returned instead.
 */
uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid)
{
	uid_t uid;
	uid = from_kuid(targ, kuid); // map the global kuid into @targ (the original note says this is done via uid_map)

	if (uid == (uid_t) -1)	// no mapping for this kuid in @targ
		uid = overflowuid;
	return uid;
}

其他

map_write() 中有过一个漏洞，CVE-2018-18955。

参考

4.1.19版本的Linux内核源码

Pid_namespace分析

Linux ns 5. IPC Namespace 详解

Linux内核情景分析之消息队列

User Namespace 详解

Mnt Namespace 详解 - 泰晓科技 (tinylab.org)

Namespace 在 Kernel 里是怎么实现的？以 mount namespace 为例 – 肥叉烧 feichashao.com

Linux Namespace分析——mnt namespace的实现与应用 (hustcat.github.io)

动手实验+源码分析，彻底弄懂 Linux 网络命名空间 - 腾讯云开发者社区-腾讯云 (tencent.com)

Module	Syscall	Descript
sem	semget()	创建信号量
-	semctl()	初始化信号量
-	semop()	信号量的PV操作
msg	msgget()	创建消息队列
-	msgctl()	获取和设置消息队列的属性
-	msgsnd()	将消息写入到消息队列
-	msgrcv()	从消息队列读取消息
shm	shmget()	创建共享内存对象
-	shmctl()	共享内存管理
-	shmat()	把共享内存区对象映射到调用进程的地址空间
-	shmdt()	断开共享内存连接

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

在内核源码中的实现.md

在内核源码中的实现.md

目录

总述

UTS namespace

copy_utsname() 函数

重要结构

在相关系统调用中看对于这个 namespace 的处理

IPC（Interprocess Communication）namespace

copy_ipcs() 函数

重要结构

相关系统调用

在相关系统调用中看对于这个 namespace 的处理

semget()

PID namespace

copy_pid_ns() 函数

重要结构

在相关系统调用中看对于这个 namespace 的处理

Mount namespaces

copy_mnt_ns() 函数

重要结构

在相关系统调用中看对于这个 namespace 的处理

Network namespace

copy_net_ns() 函数

重要结构

在相关系统调用中看对于这个 namespace 的处理

User namespaces

create_user_ns()

get_user_ns()

重要结构

在相关系统调用中看对于这个 namespace 的处理

getuid()

其他

参考

Files

在内核源码中的实现.md

Latest commit

History

在内核源码中的实现.md

File metadata and controls

目录

总述

UTS namespace

copy_utsname() 函数

重要结构

在相关系统调用中看对于这个 namespace 的处理

IPC（Interprocess Communication）namespace

copy_ipcs() 函数

重要结构

相关系统调用

在相关系统调用中看对于这个 namespace 的处理

semget()

PID namespace

copy_pid_ns() 函数

重要结构

在相关系统调用中看对于这个 namespace 的处理

Mount namespaces

copy_mnt_ns() 函数

重要结构

在相关系统调用中看对于这个 namespace 的处理

Network namespace

copy_net_ns() 函数

重要结构

在相关系统调用中看对于这个 namespace 的处理

User namespaces

create_user_ns()

get_user_ns()

重要结构

在相关系统调用中看对于这个 namespace 的处理

getuid()

其他

参考