# Querying using regular expressions


----
### Connecting to MongoDB

----

In [1]:
# Importing the required libraries
import os
import pprint as pp

import pymongo
def _keep_given_order(x, key=None):
    """Stand-in for ``sorted`` that hands ``x`` back unchanged (``key`` is ignored)."""
    return x


# Install the no-op as `sorted` on the pprint module, presumably so that
# pp.pprint() shows document fields in their stored order instead of sorted by key.
# NOTE(review): on Python 3.8+ `pp.pprint(obj, sort_dicts=False)` gives the same
# result without patching the module - confirm the minimum Python version.
pp.sorted = _keep_given_order

In [2]:
# Connect to the MongoDB deployment.
# The connection string is read from the MONGODB_URI environment variable so
# that credentials are never stored in the notebook. Set it before launching
# Jupyter, for example:
#   export MONGODB_URI='<connection_string>'
# A missing variable raises KeyError here, rather than a confusing NameError
# on `client` in a later cell.
client = pymongo.MongoClient(os.environ['MONGODB_URI'])

In [3]:
# List the names of all databases on the connected deployment
client.list_database_names()

['sample_airbnb',
 'sample_analytics',
 'sample_geospatial',
 'sample_mflix',
 'sample_restaurants',
 'sample_supplies',
 'sample_training',
 'sample_weatherdata',
 'admin',
 'local']

In [4]:
# Select the sample_mflix database (subscript access names the database explicitly)
db = client['sample_mflix']

In [5]:
# List the names of the collections in the selected database
db.list_collection_names()

['sessions', 'theaters', 'comments', 'movies', 'users']

In [6]:
# Fetch a single movie document and pretty-print it to see the available fields
sample_movie = db.movies.find_one()
pp.pprint(sample_movie)

{'_id': ObjectId('573a1390f29313caabcd4135'),
 'plot': 'Three men hammer on an anvil and pass a bottle of beer around.',
 'genres': ['Short'],
 'runtime': 1,
 'cast': ['Charles Kayser', 'John Ott'],
 'num_mflix_comments': 1,
 'title': 'Blacksmith Scene',
 'fullplot': 'A stationary camera looks at a large anvil with a blacksmith '
             'behind it and one on either side. The smith in the middle draws '
             'a heated metal rod from the fire, places it on the anvil, and '
             'all three begin a rhythmic hammering. After several blows, the '
             'metal goes back in the fire. One smith pulls out a bottle of '
             'beer, and they each take a swig. Then, out comes the glowing '
             'metal and the hammering resumes.',
 'countries': ['USA'],
 'released': datetime.datetime(1893, 5, 9, 0, 0),
 'directors': ['William K.L. Dickson'],
 'rated': 'UNRATED',
 'awards': {'wins': 1, 'nominations': 0, 'text': '1 win.'},
 'lastupdated': '2015-08-26 00:03:

---
---
### `$regex` operator

The [$regex](https://docs.mongodb.com/manual/reference/operator/query/regex/#-regex) operator matches string fields against a regular expression in a query.

---
**^ pattern** 

For example, retrieve movies where the `title` starts with `A`.


----

In [7]:
# Movies whose title starts with the letter 'A'

# query: '^' anchors the match to the beginning of the title
starts_with_a = {'title': {'$regex': '^A'}}
# projection: keep only the title and drop the _id field
title_only = {'title': 1, '_id': 0}

cur = db.movies.find(starts_with_a, title_only)

# pretty-print every matching document
for doc in cur:
    pp.pprint(doc)

{'title': 'A Corner in Wheat'}
{'title': 'A Woman of Paris: A Drama of Fate'}
{'title': 'Applause'}
{'title': 'Asphalt'}
{'title': 'All Quiet on the Western Front'}
{'title': 'A Free Soul'}
{'title': 'A Farewell to Arms'}
{'title': 'A Story of Floating Weeds'}
{'title': 'Anna Karenina'}
{'title': 'Alice Adams'}
{'title': "A Midsummer Night's Dream"}
{'title': 'A Night at the Opera'}
{'title': 'Anthony Adverse'}
{'title': 'A Day in the Country'}
{'title': 'A Damsel in Distress'}
{'title': 'A Star Is Born'}
{'title': 'Alexander Nevsky'}
{'title': 'Algiers'}
{'title': 'Angels with Dirty Faces'}
{'title': "Alexander's Ragtime Band"}
{'title': "A Woman's Face"}
{'title': 'A Man to Remember'}
{'title': 'Arise, My Love'}
{'title': 'Across the Pacific'}
{'title': 'Air Force'}
{'title': 'Arsenic and Old Lace'}
{'title': 'Anchors Aweigh'}
{'title': 'And Then There Were None'}
{'title': 'A Song to Remember'}
{'title': 'A Tree Grows in Brooklyn'}
{'title': 'A Walk in the Sun'}
{'title': 'Anna and 

---
Distinct movies that start with `A`.

----

In [8]:
# Distinct titles of movies starting with the letter 'A'

starts_with_a = {'title': {'$regex': '^A'}}

# Collection.distinct accepts the filter directly, so no cursor is needed
db.movies.distinct('title', starts_with_a)

['A Alma do Osso',
 'A Bag of Hammers',
 'A Band Called Death',
 'A Barefoot Dream',
 'A Bay of Blood',
 'A Beautiful Mind',
 "A Beginner's Guide to Endings",
 'A Better Life',
 'A Better Place',
 'A Better Tomorrow',
 'A Better Tomorrow II',
 'A Better Tomorrow III: Love and Death in Saigon',
 'A Big Hand for the Little Lady',
 'A Big Love Story',
 'A Bigger Splash',
 "A Birder's Guide to Everything",
 'A Bit of Bad Luck',
 'A Bittersweet Life',
 'A Blast',
 'A Borrowed Identity',
 'A Borrowed Life',
 'A Bottle in the Gaza Sea',
 'A Boy Called Hate',
 'A Boy Named Charlie Brown',
 'A Boy and His Dog',
 'A Boy from Calabria',
 'A Boyfriend for My Wife',
 'A Brand New Life',
 'A Brave Heart: The Lizzie Velasquez Story',
 'A Bridge Too Far',
 'A Brief History of Time',
 'A Brief Vacation',
 'A Bright Shining Lie',
 'A Brighter Summer Day',
 'A Brilliant Young Mind',
 'A Bronx Tale',
 'A Brony Tale',
 'A Brooklyn State of Mind',
 'A Buddha',
 "A Bug's Life",
 'A Burning Hot Summer',
 'A C

---
**\w pattern**

Distinct movie titles that start with a word character (`\w` matches a letter, a digit, or an underscore).

----

In [9]:
# Distinct titles that start with a word character
# (\w matches a letter, a digit or an underscore)

# Raw string: '\w' is not a valid Python string escape, so the pattern is
# written as r'...' to pass the backslash through to MongoDB unchanged.
starts_with_word_char = {'title': {'$regex': r'^\w'}}

db.movies.distinct('title', starts_with_word_char)

['009 Re: Cyborg',
 '06/05',
 '1',
 '1, 2, 3, Freeze',
 '1,000 Times Good Night',
 '1-900',
 '1. Mai',
 '10',
 '10 Cent Pistol',
 '10 Items or Less',
 '10 MPH',
 '10 Minutes',
 '10 Mountains 10 Years',
 '10 Questions for the Dalai Lama',
 '10 Things I Hate About You',
 '10%: What Makes a Hero?',
 '10,000 Black Men Named George',
 '10.000 Km',
 '10.5',
 '10.5: Apocalypse',
 '100 Bloody Acres',
 '100 Girls',
 '100% Love',
 '1001 Grams',
 '101 Dalmatians',
 "101 Dalmatians II: Patch's London Adventure",
 '101 Reykjavèk',
 '102 Dalmatians',
 '102 Minutes That Changed America',
 '1066: The Battle for Middle Earth',
 '11 Flowers',
 '11 Minutes',
 '11 Minutes Ago',
 '11.6',
 '11:11',
 '11:14',
 '12',
 '12 Angry Men',
 '12 Dates of Christmas',
 "12 O'Clock Boys",
 '12 Storeys',
 '12 Years a Slave',
 '12 and Holding',
 '12 in a Box',
 '127 Hours',
 '12:01',
 '12:08 East of Bucharest',
 '12th & Delaware',
 '13',
 '13 Assassins',
 '13 Going on 30',
 '13 Hours in a Warehouse',
 '13 Lakes',
 '13 Mi

---
Distinct movie titles that start with an alphanumeric character, followed by `A` and then one or more alphanumeric characters.

----

In [10]:
# Distinct titles that start with an alphanumeric character, followed by 'A'
# and then one or more alphanumeric characters

# Raw string: '\w' is not a valid Python string escape, so the pattern is
# written as r'...' to pass the backslashes through to MongoDB unchanged.
second_char_is_a = {'title': {'$regex': r'^\wA\w+'}}

db.movies.distinct('title', second_char_is_a)

['BASEketball', 'MASH', 'SAGA: Curse of the Shadow', 'WALLèE']

---
**\d pattern**

Distinct movies that start with digits.

---

In [11]:
# Distinct titles that start with one or more digits

# Raw string: '\d' is not a valid Python string escape, so the pattern is
# written as r'...' to pass the backslash through to MongoDB unchanged.
starts_with_digits = {'title': {'$regex': r'^\d+'}}

db.movies.distinct('title', starts_with_digits)

['009 Re: Cyborg',
 '06/05',
 '1',
 '1, 2, 3, Freeze',
 '1,000 Times Good Night',
 '1-900',
 '1. Mai',
 '10',
 '10 Cent Pistol',
 '10 Items or Less',
 '10 MPH',
 '10 Minutes',
 '10 Mountains 10 Years',
 '10 Questions for the Dalai Lama',
 '10 Things I Hate About You',
 '10%: What Makes a Hero?',
 '10,000 Black Men Named George',
 '10.000 Km',
 '10.5',
 '10.5: Apocalypse',
 '100 Bloody Acres',
 '100 Girls',
 '100% Love',
 '1001 Grams',
 '101 Dalmatians',
 "101 Dalmatians II: Patch's London Adventure",
 '101 Reykjavèk',
 '102 Dalmatians',
 '102 Minutes That Changed America',
 '1066: The Battle for Middle Earth',
 '11 Flowers',
 '11 Minutes',
 '11 Minutes Ago',
 '11.6',
 '11:11',
 '11:14',
 '12',
 '12 Angry Men',
 '12 Dates of Christmas',
 "12 O'Clock Boys",
 '12 Storeys',
 '12 Years a Slave',
 '12 and Holding',
 '12 in a Box',
 '127 Hours',
 '12:01',
 '12:08 East of Bucharest',
 '12th & Delaware',
 '13',
 '13 Assassins',
 '13 Going on 30',
 '13 Hours in a Warehouse',
 '13 Lakes',
 '13 Mi

----
Distinct movies that start with two digits.

---

In [12]:
# Distinct titles whose first two characters are digits

# Raw string: '\d' is not a valid Python string escape, so the pattern is
# written as r'...' to pass the backslash through to MongoDB unchanged.
# {2} requires exactly two digits at the start; more digits may follow.
starts_with_two_digits = {'title': {'$regex': r'^\d{2}'}}

db.movies.distinct('title', starts_with_two_digits)

['009 Re: Cyborg',
 '06/05',
 '10',
 '10 Cent Pistol',
 '10 Items or Less',
 '10 MPH',
 '10 Minutes',
 '10 Mountains 10 Years',
 '10 Questions for the Dalai Lama',
 '10 Things I Hate About You',
 '10%: What Makes a Hero?',
 '10,000 Black Men Named George',
 '10.000 Km',
 '10.5',
 '10.5: Apocalypse',
 '100 Bloody Acres',
 '100 Girls',
 '100% Love',
 '1001 Grams',
 '101 Dalmatians',
 "101 Dalmatians II: Patch's London Adventure",
 '101 Reykjavèk',
 '102 Dalmatians',
 '102 Minutes That Changed America',
 '1066: The Battle for Middle Earth',
 '11 Flowers',
 '11 Minutes',
 '11 Minutes Ago',
 '11.6',
 '11:11',
 '11:14',
 '12',
 '12 Angry Men',
 '12 Dates of Christmas',
 "12 O'Clock Boys",
 '12 Storeys',
 '12 Years a Slave',
 '12 and Holding',
 '12 in a Box',
 '127 Hours',
 '12:01',
 '12:08 East of Bucharest',
 '12th & Delaware',
 '13',
 '13 Assassins',
 '13 Going on 30',
 '13 Hours in a Warehouse',
 '13 Lakes',
 '13 Minutes',
 '13 Sins',
 '13 Tzameti',
 '13/13/13',
 '13: Game of Death',
 '14

---
**$ pattern**

Distinct movies that end with `Man`.

----

In [13]:
# Distinct titles that end with 'Man' (case-sensitive)

# '$' anchors the match to the end of the title
ends_with_man = {'title': {'$regex': 'Man$'}}

db.movies.distinct('title', ends_with_man)

['9-Man',
 '9/11: The Falling Man',
 'A Common Man',
 'A Fighting Man',
 'A Funny Man',
 'A Guide for the Married Man',
 'A Most Wanted Man',
 'A Patriotic Man',
 'A Screaming Man',
 'A Serious Man',
 'A Single Man',
 'A Somewhat Gentle Man',
 'An Average Little Man',
 'An Unreasonable Man',
 'Ant-Man',
 'Bag Man',
 'Bicentennial Man',
 'Big River Man',
 'Burning Man',
 'Business Man',
 'Cemetery Man',
 'Choking Man',
 'Cinderella Man',
 'Dead Man',
 'Demolition Man',
 'Encino Man',
 'Far Out Man',
 'Fate of a Man',
 'Grizzly Man',
 'Harley Davidson and the Marlboro Man',
 'He Was a Quiet Man',
 'Holding the Man',
 'Hollow Man',
 'Holy Man',
 'Honkytonk Man',
 'I Love You, Man',
 'If I Were a Rich Man',
 'Inside Man',
 'Ip Man',
 'Iron Man',
 "Leonard Cohen: I'm Your Man",
 'Letters from the Big Man',
 'Little Big Man',
 'Little Man',
 'Lovely Man',
 'Man to Man',
 'Marathon Man',
 'Medicine Man',
 'Memoirs of an Invisible Man',
 'Mercury Man',
 "Momma's Man",
 'Moon Man',
 'My Man',
 

---
Distinct movies starting with `A` and ending with `Man`

----

In [14]:
# Distinct titles that start with 'A' and end with 'Man'

# both conditions apply to the same field, so they are combined with $and
starts_with_a = {'title': {'$regex': '^A'}}
ends_with_man = {'title': {'$regex': 'Man$'}}

db.movies.distinct('title', {'$and': [starts_with_a, ends_with_man]})

['A Common Man',
 'A Fighting Man',
 'A Funny Man',
 'A Guide for the Married Man',
 'A Most Wanted Man',
 'A Patriotic Man',
 'A Screaming Man',
 'A Serious Man',
 'A Single Man',
 'A Somewhat Gentle Man',
 'An Average Little Man',
 'An Unreasonable Man',
 'Ant-Man']

---
----
**MongoDB regex $options**

The [$options](https://docs.mongodb.com/manual/reference/operator/query/regex/#mongodb-query-op.-options) field modifies how a `$regex` pattern is matched.

The `i` option makes the match case-insensitive, so upper- and lower-case letters are treated as the same.

---
For example, find the distinct movie titles that start with `A` and end with `Man`, ignoring case for the ending: titles ending in 'man', 'MAN', 'WOMAN', 'woman', etc. will all match.

---

In [15]:
# Distinct titles that start with 'A' and end with 'man' in any letter case

starts_with_a = {'title': {'$regex': '^A'}}
# the 'i' option switches off case sensitivity for this pattern only
ends_with_man_any_case = {'title': {'$regex': 'Man$', '$options': 'i'}}

db.movies.distinct('title', {'$and': [starts_with_a, ends_with_man_any_case]})

['A Common Man',
 'A Dangerous Woman',
 'A Fighting Man',
 'A Funny Man',
 'A Gentle Woman',
 'A Guide for the Married Man',
 'A Man and a Woman',
 'A Most Wanted Man',
 'A Patriotic Man',
 'A Screaming Man',
 'A Serious Man',
 'A Single Man',
 'A Somewhat Gentle Man',
 'A Taxing Woman',
 'A Woman Is a Woman',
 'Adanggaman',
 'All-Star Superman',
 'Altman',
 'An Average Little Man',
 'An Officer and a Gentleman',
 'An Unmarried Woman',
 'An Unreasonable Man',
 'And God Created Woman',
 'Another Woman',
 'Ant-Man',
 'Attack of the 50 Ft. Woman']

---
### Question -

Distinct movie titles starting with `A`, ending with case-insensitive `Man`, and not containing any digit.


----

In [16]:
# Question: distinct titles that start with 'A', end with 'man' in any letter
# case, and contain no digit

starts_with_a = {'title': {'$regex': '^A'}}
# the 'i' option switches off case sensitivity for this pattern only
ends_with_man_any_case = {'title': {'$regex': 'Man$', '$options': 'i'}}
# $not inverts the match, keeping only titles in which no digit is found.
# Raw string: '\d' is not a valid Python string escape, so the pattern is
# written as r'...' to pass the backslash through to MongoDB unchanged.
has_no_digit = {'title': {'$not': {'$regex': r'\d+'}}}

db.movies.distinct(
    'title',
    {'$and': [starts_with_a, ends_with_man_any_case, has_no_digit]},
)

['A Common Man',
 'A Dangerous Woman',
 'A Fighting Man',
 'A Funny Man',
 'A Gentle Woman',
 'A Guide for the Married Man',
 'A Man and a Woman',
 'A Most Wanted Man',
 'A Patriotic Man',
 'A Screaming Man',
 'A Serious Man',
 'A Single Man',
 'A Somewhat Gentle Man',
 'A Taxing Woman',
 'A Woman Is a Woman',
 'Adanggaman',
 'All-Star Superman',
 'Altman',
 'An Average Little Man',
 'An Officer and a Gentleman',
 'An Unmarried Woman',
 'An Unreasonable Man',
 'And God Created Woman',
 'Another Woman',
 'Ant-Man']

----
### Question - 

Movies whose title starts with the letter `T` but does not start with `The` (the `The` check is case-insensitive).

---

In [17]:
# Question: distinct titles that start with 'T' but not with 'The' in any letter case

starts_with_t = {'title': {'$regex': '^T'}}
# NOTE(review): '^the' also rejects longer words such as 'Them' or 'Theatre';
# confirm whether only the article 'The' was meant to be excluded.
not_starting_with_the = {'title': {'$not': {'$regex': '^the', '$options': 'i'}}}

db.movies.distinct('title', {'$and': [starts_with_t, not_starting_with_the]})

['T-Rex: Back to the Cretaceous',
 'T2 3-D: Battle Across Time',
 'THX 1138',
 'TINY: A Story About Living Small',
 'TMNT',
 'TRON',
 'TRON: Legacy',
 'TT3D: Closer to the Edge',
 'TV Junkie',
 'Taal',
 'Taarzan: The Wonder Car',
 'Tab Hunter Confidential',
 'Table for Five',
 'Tabloid',
 'Tabu',
 'Tabu: A Story of the South Seas',
 'Tad, the Lost Explorer',
 'Tadpole',
 'Tae Guk Gi: The Brotherhood of War',
 'Tag: The Assassination Game',
 'Tagebuch eines Liebenden',
 'Tai Chi Hero',
 'Tai Chi Zero',
 'Tai-Chi Master',
 'Tai-Pan',
 'Tail Gunner Joe',
 'Tainah, an Amazon Adventure',
 'Taipei Exchanges',
 'Taira Clan Saga',
 'Tajja: sineui son',
 'Take Care',
 'Take Care of My Cat',
 'Take Care of Your Scarf, Tatiana',
 "Take Her, She's Mine",
 'Take Me Home',
 'Take Me Home Tonight',
 'Take Me Out to the Ball Game',
 'Take Me to the River',
 'Take My Eyes',
 'Take Out',
 'Take Shelter',
 'Take This Waltz',
 'Take a Giant Step',
 'Take the Lead',
 'Take the Money and Run',
 'Take the Tr