diff --git a/dcgan/README.md b/dcgan/README.md
index 3f7bdef6b1..e1e9bd4398 100644
--- a/dcgan/README.md
+++ b/dcgan/README.md
@@ -24,7 +24,7 @@ usage: main.py [-h] --dataset DATASET --dataroot DATAROOT [--workers WORKERS]
                [--batchSize BATCHSIZE] [--imageSize IMAGESIZE] [--nz NZ]
                [--ngf NGF] [--ndf NDF] [--niter NITER] [--lr LR]
                [--beta1 BETA1] [--cuda] [--ngpu NGPU] [--netG NETG]
-               [--netD NETD] [--mps]
+               [--netD NETD]
 
 optional arguments:
   -h, --help     show this help message and exit
@@ -40,7 +40,6 @@ optional arguments:
   --lr LR        learning rate, default=0.0002
   --beta1 BETA1  beta1 for adam. default=0.5
   --cuda         enables cuda
-  --mps          enables macOS GPU
   --ngpu NGPU    number of GPUs to use
   --netG NETG    path to netG (to continue training)
   --netD NETD    path to netD (to continue training)
diff --git a/dcgan/main.py b/dcgan/main.py
index ec05cb9518..96d1feb90b 100644
--- a/dcgan/main.py
+++ b/dcgan/main.py
@@ -25,7 +25,7 @@
 parser.add_argument('--niter', type=int, default=25, help='number of epochs to train for')
 parser.add_argument('--lr', type=float, default=0.0002, help='learning rate, default=0.0002')
 parser.add_argument('--beta1', type=float, default=0.5, help='beta1 for adam. default=0.5')
-parser.add_argument('--cuda', action='store_true', default=False, help='enables cuda')
+parser.add_argument('--cuda', action='store_true', help='enables cuda')
 parser.add_argument('--dry-run', action='store_true', help='check a single training cycle works')
 parser.add_argument('--ngpu', type=int, default=1, help='number of GPUs to use')
 parser.add_argument('--netG', default='', help="path to netG (to continue training)")
@@ -33,7 +33,6 @@
 parser.add_argument('--outf', default='.', help='folder to output images and model checkpoints')
 parser.add_argument('--manualSeed', type=int, help='manual seed')
 parser.add_argument('--classes', default='bedroom', help='comma separated list of classes for the lsun data set')
-parser.add_argument('--mps', action='store_true', default=False, help='enables macOS GPU training')
 
 opt = parser.parse_args()
 print(opt)
@@ -53,9 +52,6 @@
 
 if torch.cuda.is_available() and not opt.cuda:
     print("WARNING: You have a CUDA device, so you should probably run with --cuda")
-
-if torch.backends.mps.is_available() and not opt.mps:
-    print("WARNING: You have mps device, to enable macOS GPU run with --mps")
 
 if opt.dataroot is None and str(opt.dataset).lower() != 'fake':
     raise ValueError("`dataroot` parameter is required for dataset \"%s\"" % opt.dataset)
@@ -107,13 +103,7 @@
 dataloader = torch.utils.data.DataLoader(dataset, batch_size=opt.batchSize,
                                          shuffle=True, num_workers=int(opt.workers))
 
-if opt.cuda:
-    device = torch.device("cuda:0")
-elif opt.mps:
-    device = torch.device("mps")
-else:
-    device = torch.device("cpu")
-
+device = torch.device("cuda:0" if opt.cuda else "cpu")
 ngpu = int(opt.ngpu)
 nz = int(opt.nz)
 ngf = int(opt.ngf)
diff --git a/fast_neural_style/README.md b/fast_neural_style/README.md
index 8057847214..93198570f3 100644
--- a/fast_neural_style/README.md
+++ b/fast_neural_style/README.md
@@ -27,7 +27,6 @@ python neural_style/neural_style.py eval --content-image
diff --git a/imagenet/main.py b/imagenet/main.py
--- a/imagenet/main.py
+++ b/imagenet/main.py
     args.distributed = args.world_size > 1 or args.multiprocessing_distributed
 
-    if torch.cuda.is_available():
-        ngpus_per_node = torch.cuda.device_count()
-    else:
-        ngpus_per_node = 1
+    ngpus_per_node = torch.cuda.device_count()
     if args.multiprocessing_distributed:
         # Since we have ngpus_per_node processes per node, the total world_size
         # needs to be adjusted accordingly
@@ -144,33 +141,29 @@ def main_worker(gpu, ngpus_per_node, args):
         print("=> creating model '{}'".format(args.arch))
'{}'".format(args.arch)) model = models.__dict__[args.arch]() - if not torch.cuda.is_available() and not torch.backends.mps.is_available(): + if not torch.cuda.is_available(): print('using CPU, this will be slow') elif args.distributed: # For multiprocessing distributed, DistributedDataParallel constructor # should always set the single device scope, otherwise, # DistributedDataParallel will use all available devices. - if torch.cuda.is_available(): - if args.gpu is not None: - torch.cuda.set_device(args.gpu) - model.cuda(args.gpu) - # When using a single GPU per process and per - # DistributedDataParallel, we need to divide the batch size - # ourselves based on the total number of GPUs of the current node. - args.batch_size = int(args.batch_size / ngpus_per_node) - args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) - else: - model.cuda() - # DistributedDataParallel will divide and allocate batch_size to all - # available GPUs if device_ids are not set - model = torch.nn.parallel.DistributedDataParallel(model) - elif args.gpu is not None and torch.cuda.is_available(): + if args.gpu is not None: + torch.cuda.set_device(args.gpu) + model.cuda(args.gpu) + # When using a single GPU per process and per + # DistributedDataParallel, we need to divide the batch size + # ourselves based on the total number of GPUs of the current node. + args.batch_size = int(args.batch_size / ngpus_per_node) + args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + else: + model.cuda() + # DistributedDataParallel will divide and allocate batch_size to all + # available GPUs if device_ids are not set + model = torch.nn.parallel.DistributedDataParallel(model) + elif args.gpu is not None: torch.cuda.set_device(args.gpu) model = model.cuda(args.gpu) - elif torch.backends.mps.is_available(): - device = torch.device("mps") - model = model.to(device) else: # DataParallel will divide and allocate batch_size to all available GPUs if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): @@ -179,17 +172,8 @@ def main_worker(gpu, ngpus_per_node, args): else: model = torch.nn.DataParallel(model).cuda() - if torch.cuda.is_available(): - if args.gpu: - device = torch.device('cuda:{}'.format(args.gpu)) - else: - device = torch.device("cuda") - elif torch.backends.mps.is_available(): - device = torch.device("mps") - else: - device = torch.device("cpu") # define loss function (criterion), optimizer, and learning rate scheduler - criterion = nn.CrossEntropyLoss().to(device) + criterion = nn.CrossEntropyLoss().cuda(args.gpu) optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, @@ -204,7 +188,7 @@ def main_worker(gpu, ngpus_per_node, args): print("=> loading checkpoint '{}'".format(args.resume)) if args.gpu is None: checkpoint = torch.load(args.resume) - elif torch.cuda.is_available(): + else: # Map model to be loaded to specified single gpu. 
                 loc = 'cuda:{}'.format(args.gpu)
                 checkpoint = torch.load(args.resume, map_location=loc)
@@ -318,13 +302,10 @@ def train(train_loader, model, criterion, optimizer, epoch, args):
         # measure data loading time
         data_time.update(time.time() - end)
 
-        if args.gpu is not None and torch.cuda.is_available():
+        if args.gpu is not None:
             images = images.cuda(args.gpu, non_blocking=True)
-        elif not args.gpu and torch.cuda.is_available():
+        if torch.cuda.is_available():
             target = target.cuda(args.gpu, non_blocking=True)
-        elif torch.backends.mps.is_available():
-            images = images.to('mps')
-            target = target.to('mps')
 
         # compute output
         output = model(images)
@@ -356,11 +337,8 @@ def run_validate(loader, base_progress=0):
             end = time.time()
             for i, (images, target) in enumerate(loader):
                 i = base_progress + i
-                if args.gpu is not None and torch.cuda.is_available():
+                if args.gpu is not None:
                     images = images.cuda(args.gpu, non_blocking=True)
-                if torch.backends.mps.is_available():
-                    images = images.to('mps')
-                    target = target.to('mps')
                 if torch.cuda.is_available():
                     target = target.cuda(args.gpu, non_blocking=True)
 
@@ -443,12 +421,7 @@ def update(self, val, n=1):
         self.avg = self.sum / self.count
 
     def all_reduce(self):
-        if torch.cuda.is_available():
-            device = torch.device("cuda")
-        elif torch.backends.mps.is_available():
-            device = torch.device("mps")
-        else:
-            device = torch.device("cpu")
+        device = "cuda" if torch.cuda.is_available() else "cpu"
         total = torch.tensor([self.sum, self.count], dtype=torch.float32, device=device)
         dist.all_reduce(total, dist.ReduceOp.SUM, async_op=False)
         self.sum, self.count = total.tolist()
diff --git a/legacy/snli/train.py b/legacy/snli/train.py
index aa70aef45a..27ed176cb0 100644
--- a/legacy/snli/train.py
+++ b/legacy/snli/train.py
@@ -17,8 +17,6 @@
 if torch.cuda.is_available():
     torch.cuda.set_device(args.gpu)
     device = torch.device('cuda:{}'.format(args.gpu))
-elif torch.backends.mps.is_available():
-    device = torch.device('mps')
 else:
     device = torch.device('cpu')
 
diff --git a/mnist/main.py b/mnist/main.py
index 29d81d60a2..378b5b730b 100644
--- a/mnist/main.py
+++ b/mnist/main.py
@@ -85,8 +85,6 @@ def main():
                         help='Learning rate step gamma (default: 0.7)')
     parser.add_argument('--no-cuda', action='store_true', default=False,
                         help='disables CUDA training')
-    parser.add_argument('--no-mps', action='store_true', default=False,
-                        help='disables macOS GPU training')
     parser.add_argument('--dry-run', action='store_true', default=False,
                         help='quickly check a single pass')
     parser.add_argument('--seed', type=int, default=1, metavar='S',
@@ -97,16 +95,10 @@ def main():
                         help='For Saving the current Model')
     args = parser.parse_args()
     use_cuda = not args.no_cuda and torch.cuda.is_available()
-    use_mps = not args.no_mps and torch.backends.mps.is_available()
 
     torch.manual_seed(args.seed)
 
-    if use_cuda:
-        device = torch.device("cuda")
-    elif use_mps:
-        device = torch.device("mps")
-    else:
-        device = torch.device("cpu")
+    device = torch.device("cuda" if use_cuda else "cpu")
 
     train_kwargs = {'batch_size': args.batch_size}
     test_kwargs = {'batch_size': args.test_batch_size}
diff --git a/mnist_hogwild/main.py b/mnist_hogwild/main.py
index 82c820ed79..8e50ddb1b1 100644
--- a/mnist_hogwild/main.py
+++ b/mnist_hogwild/main.py
@@ -29,8 +29,6 @@
                     help='how many training processes to use (default: 2)')
 parser.add_argument('--cuda', action='store_true', default=False,
                     help='enables CUDA training')
-parser.add_argument('--mps', action='store_true', default=False,
-                    help='enables macOS GPU training')
 parser.add_argument('--dry-run', action='store_true', default=False,
                     help='quickly check a single pass')
@@ -57,14 +55,7 @@ def forward(self, x):
     args = parser.parse_args()
 
     use_cuda = args.cuda and torch.cuda.is_available()
-    use_mps = args.mps and torch.backends.mps.is_available()
-    if use_cuda:
-        device = torch.device("cuda")
-    elif use_mps:
-        device = torch.device("mps")
-    else:
-        device = torch.device("cpu")
-
+    device = torch.device("cuda" if use_cuda else "cpu")
     transform=transforms.Compose([
         transforms.ToTensor(),
         transforms.Normalize((0.1307,), (0.3081,))
diff --git a/run_python_examples.sh b/run_python_examples.sh
index 8f0777b27e..1d236ca94d 100755
--- a/run_python_examples.sh
+++ b/run_python_examples.sh
@@ -56,7 +56,7 @@ function start() {
 
 function dcgan() {
   start
-  python main.py --dataset fake $CUDA_FLAG --mps --dry-run || error "dcgan failed"
+  python main.py --dataset fake $CUDA_FLAG --dry-run || error "dcgan failed"
 }
 
 function distributed() {
@@ -74,7 +74,7 @@ function fast_neural_style() {
   test -d "saved_models" || { error "saved models not found"; return; }
 
   echo "running fast neural style model"
-  python neural_style/neural_style.py eval --content-image images/content-images/amber.jpg --model saved_models/candy.pth --output-image images/output-images/amber-candy.jpg --cuda $CUDA --mps || error "neural_style.py failed"
+  python neural_style/neural_style.py eval --content-image images/content-images/amber.jpg --model saved_models/candy.pth --output-image images/output-images/amber-candy.jpg --cuda $CUDA || error "neural_style.py failed"
 }
 
 function imagenet() {
@@ -82,7 +82,7 @@ function imagenet() {
   if [[ ! -d "sample/val" || ! -d "sample/train" ]]; then
     mkdir -p sample/val/n
     mkdir -p sample/train/n
-    curl -O "https://upload.wikimedia.org/wikipedia/commons/5/5a/Socks-clinton.jpg" || { error "couldn't download sample image for imagenet"; return; }
+    wget "https://upload.wikimedia.org/wikipedia/commons/5/5a/Socks-clinton.jpg" || { error "couldn't download sample image for imagenet"; return; }
     mv Socks-clinton.jpg sample/train/n
     cp sample/train/n/* sample/val/n/
   fi
@@ -137,7 +137,7 @@ function fx() {
 
 function super_resolution() {
   start
-  python main.py --upscale_factor 3 --batchSize 4 --testBatchSize 100 --nEpochs 1 --lr 0.001 --mps || error "super resolution failed"
+  python main.py --upscale_factor 3 --batchSize 4 --testBatchSize 100 --nEpochs 1 --lr 0.001 || error "super resolution failed"
 }
 
 function time_sequence_prediction() {
@@ -153,7 +153,7 @@ function vae() {
 
 function word_language_model() {
   start
-  python main.py --epochs 1 --dry-run $CUDA_FLAG --mps || error "word_language_model failed"
+  python main.py --epochs 1 --dry-run $CUDA_FLAG || error "word_language_model failed"
 }
 
 function clean() {
diff --git a/siamese_network/main.py b/siamese_network/main.py
index 37767dffc4..33a5f71517 100644
--- a/siamese_network/main.py
+++ b/siamese_network/main.py
@@ -249,8 +249,6 @@ def main():
                         help='Learning rate step gamma (default: 0.7)')
     parser.add_argument('--no-cuda', action='store_true', default=False,
                         help='disables CUDA training')
-    parser.add_argument('--no-mps', action='store_true', default=False,
-                        help='disables macOS GPU training')
     parser.add_argument('--dry-run', action='store_true', default=False,
                         help='quickly check a single pass')
     parser.add_argument('--seed', type=int, default=1, metavar='S',
@@ -262,16 +260,10 @@ def main():
 
     args = parser.parse_args()
     use_cuda = not args.no_cuda and torch.cuda.is_available()
-    use_mps = not args.no_mps and torch.backends.mps.is_available()
 
     torch.manual_seed(args.seed)
 
-    if use_cuda:
-        device = torch.device("cuda")
-    elif use_mps:
-        device = torch.device("mps")
-    else:
-        device = torch.device("cpu")
+    device = torch.device("cuda" if use_cuda else "cpu")
 
     train_kwargs = {'batch_size': args.batch_size}
     test_kwargs = {'batch_size': args.test_batch_size}
diff --git a/super_resolution/README.md b/super_resolution/README.md
index 6b5fe831d9..d3f7cab359 100644
--- a/super_resolution/README.md
+++ b/super_resolution/README.md
@@ -17,7 +17,6 @@ optional arguments:
   --nEpochs             number of epochs to train for
   --lr                  Learning Rate. Default=0.01
   --cuda                use cuda
-  --mps                 enable GPU on macOS
   --threads             number of threads for data loader to use Default=4
   --seed                random seed to use. Default=123
 ```
diff --git a/super_resolution/main.py b/super_resolution/main.py
index 8c5519582a..f9f5f8b190 100644
--- a/super_resolution/main.py
+++ b/super_resolution/main.py
@@ -17,7 +17,6 @@
 parser.add_argument('--nEpochs', type=int, default=2, help='number of epochs to train for')
 parser.add_argument('--lr', type=float, default=0.01, help='Learning Rate. Default=0.01')
 parser.add_argument('--cuda', action='store_true', help='use cuda?')
-parser.add_argument('--mps', action='store_true', default=False, help='enables macOS GPU training')
 parser.add_argument('--threads', type=int, default=4, help='number of threads for data loader to use')
 parser.add_argument('--seed', type=int, default=123, help='random seed to use. Default=123')
 opt = parser.parse_args()
@@ -26,18 +25,10 @@
 
 if opt.cuda and not torch.cuda.is_available():
     raise Exception("No GPU found, please run without --cuda")
-if not opt.mps and torch.backends.mps.is_available():
-    raise Exception("Found mps device, please run with --mps to enable macOS GPU")
 
 torch.manual_seed(opt.seed)
-use_mps = opt.mps and torch.backends.mps.is_available()
-
-if opt.cuda:
-    device = torch.device("cuda")
-elif use_mps:
-    device = torch.device("mps")
-else:
-    device = torch.device("cpu")
+
+device = torch.device("cuda" if opt.cuda else "cpu")
 
 print('===> Loading datasets')
 train_set = get_training_set(opt.upscale_factor)
diff --git a/vae/README.md b/vae/README.md
index cda6a33672..edf463a606 100644
--- a/vae/README.md
+++ b/vae/README.md
@@ -15,7 +15,6 @@ optional arguments:
   --batch-size          input batch size for training (default: 128)
   --epochs              number of epochs to train (default: 10)
   --no-cuda             enables CUDA training
-  --mps                 enables GPU on macOS
   --seed                random seed (default: 1)
   --log-interval        how many batches to wait before logging training status
-```
\ No newline at end of file
+```
diff --git a/vae/main.py b/vae/main.py
index f9b2bfbb99..d7df3360dd 100644
--- a/vae/main.py
+++ b/vae/main.py
@@ -15,24 +15,16 @@
                     help='number of epochs to train (default: 10)')
 parser.add_argument('--no-cuda', action='store_true', default=False,
                     help='disables CUDA training')
-parser.add_argument('--no-mps', action='store_true', default=False,
-                    help='disables macOS GPU training')
 parser.add_argument('--seed', type=int, default=1, metavar='S',
                     help='random seed (default: 1)')
 parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                     help='how many batches to wait before logging training status')
 args = parser.parse_args()
 args.cuda = not args.no_cuda and torch.cuda.is_available()
-use_mps = not args.no_mps and torch.backends.mps.is_available()
 
 torch.manual_seed(args.seed)
 
-if args.cuda:
-    device = torch.device("cuda")
-elif use_mps:
-    device = torch.device("mps")
-else:
-    device = torch.device("cpu")
+device = torch.device("cuda" if args.cuda else "cpu")
 
 kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
 train_loader = torch.utils.data.DataLoader(
diff --git a/word_language_model/README.md b/word_language_model/README.md
index 18341e84c9..b809f5098b 100644
--- a/word_language_model/README.md
+++ b/word_language_model/README.md
@@ -38,7 +38,6 @@ optional arguments:
   --tied                tie the word embedding and softmax weights
   --seed SEED           random seed
   --cuda                use CUDA
-  --mps                 enable GPU on macOS
   --log-interval N      report interval
   --save SAVE           path to save the final model
   --onnx-export ONNX_EXPORT
diff --git a/word_language_model/main.py b/word_language_model/main.py
index 15310cea09..9fff5c15a9 100644
--- a/word_language_model/main.py
+++ b/word_language_model/main.py
@@ -37,10 +37,8 @@
                     help='tie the word embedding and softmax weights')
 parser.add_argument('--seed', type=int, default=1111,
                     help='random seed')
-parser.add_argument('--cuda', action='store_true', default=False,
+parser.add_argument('--cuda', action='store_true',
                     help='use CUDA')
-parser.add_argument('--mps', action='store_true', default=False,
-                    help='enables macOS GPU training')
 parser.add_argument('--log-interval', type=int, default=200, metavar='N',
                     help='report interval')
 parser.add_argument('--save', type=str, default='model.pt',
@@ -58,16 +56,8 @@
 if torch.cuda.is_available():
     if not args.cuda:
         print("WARNING: You have a CUDA device, so you should probably run with --cuda.")
-if torch.backends.mps.is_available():
-    if not args.mps:
-        print("WARNING: You have mps device, to enable macOS GPU run with --mps.")
-
-if args.cuda:
-    device = torch.device("cuda")
-elif args.mps:
-    device = torch.device("mps")
-else:
-    device = torch.device("cpu")
+
+device = torch.device("cuda" if args.cuda else "cpu")
 
 ###############################################################################
 # Load data
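
Taken together, these hunks collapse each example's three-way CUDA/MPS/CPU device pick into the two-way `torch.device("cuda" if use_cuda else "cpu")` idiom. A minimal self-contained sketch of that pattern, for reference (the `--no-cuda` flag mirrors the mnist and vae examples; the toy tensor is purely illustrative):

```python
import argparse

import torch

parser = argparse.ArgumentParser(description='CUDA-or-CPU device selection sketch')
parser.add_argument('--no-cuda', action='store_true', default=False,
                    help='disables CUDA training')
args = parser.parse_args()

# Fall back to CPU whenever CUDA is absent or explicitly disabled.
use_cuda = not args.no_cuda and torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Everything downstream moves to the chosen device with .to(device),
# so the same training loop runs unchanged on either backend.
x = torch.randn(8, 3).to(device)
print(device, x.device)
```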
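The imagenet hunks also keep the comments about dividing the node-level batch size among single-GPU DistributedDataParallel processes. A quick worked example of that arithmetic (the node size and totals below are illustrative assumptions, not values from the diff):

```python
# Assumed node-level totals, as they would be passed on the command line.
batch_size = 256       # images per step for the whole node
workers = 10           # data-loading workers for the whole node
ngpus_per_node = 4     # one DistributedDataParallel process per GPU

# Each process takes an even share of the batch and a ceiling-divided
# share of the workers, matching the arithmetic kept in main_worker().
per_process_batch = int(batch_size / ngpus_per_node)
per_process_workers = int((workers + ngpus_per_node - 1) / ngpus_per_node)

print(per_process_batch, per_process_workers)  # 64 3
```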