In [1]:
import re

### RegEx 

https://docs.python.org/3/library/re.html

Functions:

- `findall`	Returns a list containing all matches
- `search`	Returns a Match object if there is a match anywhere in the string. If there is more than one match, only the first occurrence of the match will be returned.
- `split`	Returns a list where the string has been split at each match
- `sub`	Replaces one or many matches with a string

In [2]:
my_string = "Raul is a great TA, he deserves some holidays, maybe in Poland?"

In [4]:
# return all occurrances of 'Pol' using re.findall()

re.findall("Raul", my_string)

['Raul']

In [5]:
# use re.split() to split my_string whenever there's a comma
re.split(",", my_string)

['Raul is a great TA', ' he deserves some holidays', ' maybe in Poland?']

In [6]:
# use re.sub() to replace "TA" by "Work Alcoholic"

my_string = re.sub("TA", "Work Alcoholic", my_string)

In [7]:
my_string

'Raul is a great Work Alcoholic, he deserves some holidays, maybe in Poland?'

In [None]:
# use re.search() to find "TA" in my_string. Explore the "match object" returned

In [8]:
TA_match = re.search("TA", my_string)

In [9]:
TA_match.string

AttributeError: ignored

In [11]:
TA_match = re.search("great", my_string)
TA_match

<re.Match object; span=(10, 15), match='great'>

In [12]:
my_string

'Raul is a great Work Alcoholic, he deserves some holidays, maybe in Poland?'

In [13]:
TA_match.group()

'great'

In [14]:
TA_match.start()

10

In [15]:
TA_match.end()

15

In [16]:
TA_match.span()

(10, 15)

In [17]:
my_string[TA_match.start():]

'great Work Alcoholic, he deserves some holidays, maybe in Poland?'

In [18]:
TA_match.string

'Raul is a great Work Alcoholic, he deserves some holidays, maybe in Poland?'

**The Match object** has properties and methods used to retrieve information about the search, and the result:

- `.span()` returns a tuple containing the start-, and end positions of the match.
- `.string` returns the string passed into the function
- `.group()` returns the part of the string where there was a match

### Metacharacters

Some characters are special metacharacters, and don’t match themselves. Instead, they signal that some out-of-the-ordinary thing should be matched, or they affect other portions of the RE by repeating them or changing their meaning.

` . ^ $ * + ? { } [ ] \ | ( )`

 #### `[]` means set of characters:
 
 - `[abc]` will match any of the characters a, b, or c
 - `[a-c]` will do the same
 - `[a-z]` will match any lowercase letter

In [19]:
alphanumeric = "4298fsfsDFGHv012rvv21v9"

In [20]:
re.findall("[A-z]", alphanumeric)

['f', 's', 'f', 's', 'D', 'F', 'G', 'H', 'v', 'r', 'v', 'v', 'v']

In [27]:
re.findall("\d", alphanumeric)

['4', '2', '9', '8', '0', '1', '2', '2', '1', '9']

In [28]:
re.findall("\d{4}", alphanumeric)

['4298']

In [29]:
re.findall("\d+", alphanumeric)

['4298', '012', '21', '9']

In [30]:
re.findall("\d*", alphanumeric)

['4298',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '012',
 '',
 '',
 '',
 '21',
 '',
 '9',
 '']

In [26]:
re.findall("\d{1,3}", alphanumeric)

['429', '8', '012', '21', '9']

#### `\` Signals a special sequence (can also be used to escape special characters)

In [31]:
string_with_specials = "fea8b21[a-m]39f fewv02"
re.findall("[a-m]", string_with_specials)

['f', 'e', 'a', 'b', 'a', 'm', 'f', 'f', 'e']

In [32]:
# use \ to escape the square brackets
re.findall("\[a-m]", string_with_specials)

['[a-m]']

#### Some special sequences:

- `\A` or `^`- Returns a match if the specified characters are at the beginning of the string
- `\b` - Returns a match where the specified characters are at the beginning or at the end of a word
- `\d` - 	Returns a match where the string contains digits (numbers from 0-9) (`\D` for where the string DOES NOT contain digits)
- `\s`- Returns a match where the string contains a white space character (`\S` for where the string DOES NOT contain a white space)

In [33]:
strings = ["there is this rat and there is that cat", 
           "if you capitalize this string you die",
           "this is the end"]

In [37]:
# Use a special sequence to capitalize the strings above without dying
for string in strings:
    print(re.sub("^t", "T", string))

There is this rat and there is that cat
if you capitalize this string you die
This is the end


In [56]:
# Use a special sequence to capitalize the strings above without dying
for string in strings:
    print(re.sub("^(\w & ![i])", "T", string))

there is this rat and there is that cat
if you capitalize this string you die
this is the end


In [41]:
# Use a special sequence to capitalize the strings above without dying
for string in strings:
    print(re.sub("^\w", "T", string))

There is this rat and there is that cat
Tf you capitalize this string you die
This is the end


In [38]:
quotes = ["work hard all day, all days", 
          "There are 3 types of people: those who can count and those who can't",
          "Nice to be nice",
          "Some people feel the rain, others just get wet",
          "could you complete the exercise? wow"
         ]

In [39]:
for i in range(len(quotes)):
    quotes[i]= re.sub("\sw"," W", quotes[i])

In [40]:
quotes

['work hard all day, all days',
 "There are 3 types of people: those Who can count and those Who can't",
 'Nice to be nice',
 'Some people feel the rain, others just get Wet',
 'could you complete the exercise? Wow']

In [50]:
for quote in quotes:
    print(re.sub(r"(^w|w$)","W",quote))

Work hard all day, all days
There are 3 types of people: those Who can count and those Who can't
Nice to be nice
Some people feel the rain, others just get Wet
could you complete the exercise? WoW


In [57]:
some_nums = "I have had 3 coffees this morning and I plan to drink 7 more"

In [None]:
# use a special sequence to find the numbers in the string above

In [58]:
re.findall("\D", some_nums)

['I',
 ' ',
 'h',
 'a',
 'v',
 'e',
 ' ',
 'h',
 'a',
 'd',
 ' ',
 ' ',
 'c',
 'o',
 'f',
 'f',
 'e',
 'e',
 's',
 ' ',
 't',
 'h',
 'i',
 's',
 ' ',
 'm',
 'o',
 'r',
 'n',
 'i',
 'n',
 'g',
 ' ',
 'a',
 'n',
 'd',
 ' ',
 'I',
 ' ',
 'p',
 'l',
 'a',
 'n',
 ' ',
 't',
 'o',
 ' ',
 'd',
 'r',
 'i',
 'n',
 'k',
 ' ',
 ' ',
 'm',
 'o',
 'r',
 'e']

### `.`	Any character (except newline character)
### `*`	Zero or more occurences


In [59]:
similar_words = ["hey", "hay", "how", "h i j k", "h", "ha", "oops"]


In [60]:
# use "." and "*" to return everything following an "h" (including the "h")
for word in similar_words:
    print(re.findall(r"h.*", word))

['hey']
['hay']
['how']
['h i j k']
['h']
['ha']
[]


In [61]:
stupid_string = ["yessss", "yes", "yeah", "yep", "ye", "no", "nothing"]

In [62]:
for string in stupid_string:
    print(re.findall(r"ye.*", string))

['yessss']
['yes']
['yeah']
['yep']
['ye']
[]
[]


In [63]:
re.findall(r'ye.*',"nothing")

[]

In [64]:
a = []
for word in stupid_string:
    a.append(re.findall(r'ye.*',word))
print(a)

[['yessss'], ['yes'], ['yeah'], ['yep'], ['ye'], [], []]


In [65]:
new_stupid_string = []

for string in stupid_string:
    new_stupid_string.append(re.sub(r"ye.*", "yes", string))

In [66]:
new_stupid_string

['yes', 'yes', 'yes', 'yes', 'yes', 'no', 'nothing']

In [68]:
re.findall("h*", stupid_string)

TypeError: ignored

### `+`One or more occurrences

In [69]:
# use re.sub() together with + to fix the occurrance of too many whitespaces

spaces = "I   have too   many     spaces"
re.sub(" +", " ", spaces)

'I have too many spaces'

### `{}`- Exactly the specified number of occurrences

In [70]:
# we only want to fix spaces if there are between 2 and 5
spaces2 = "I hhhave too many spaces hhhhhhhere but this is ok"
re.sub(r"h{4,}", "H", spaces2)

'I hhhave too many spaces Here but this is ok'

In [71]:
# we only want to fix spaces if there are between 2 and 5
spaces2 = "I hhhave too many spaces hhhhhhhere but this is ok"
re.sub(r"h{4}", "H", spaces2)

'I hhhave too many spaces Hhhhere but this is ok'

In [72]:
# we only want to fix spaces if there are between 2 and 5
spaces2 = "I hhhave too many spaces hhhhhhhere but this is ok"
re.sub(r"h+", "H", spaces2)

'I Have too many spaces Here but tHis is ok'

In [73]:
# we only want to fix spaces if there are between 2 and 5
spaces2 = "I hhhave too many spaces hhhhhhhere but this is ok"
re.sub(r"h{1,}", "H", spaces2)

'I Have too many spaces Here but tHis is ok'

### `^`- Starts with

In [75]:
# print all veggies that start with a
veggies = ["tomato", "potato", "apple juice",
           "pear", "asparagus are tasty", "peach"]
           
for veg in veggies:
    print(re.findall(r"^a\S*", veg))

[]
[]
['apple']
[]
['asparagus']
[]


In [90]:
# print all veggies that start with a
veggies = ["tomato", "potato", "apple juice",
           "pear", "asparagus are tasty", "peach","a potato"]
           
for veg in veggies:
    print(re.findall(r"^a.+", veg)) # 'a' at the beggining of the string followed by anything

[]
[]
['apple juice']
[]
['asparagus are tasty']
[]
['a potato']


In [87]:
# print all veggies that start with a
veggies = ["tomato", "potato", "apple juice",
           "pear", "asparagus are tasty", "peach","a potato"]
           
for veg in veggies:
    print(re.findall(r"^a\S+", veg)) # 'a' at the beggining and anything which i not an space

[]
[]
['apple']
[]
['asparagus']
[]
[]


### `()`Group a regular expression, so that you can use regex operators on it...

In [99]:
ratsandcats = "there is this rat and there is that rat cat"

In [98]:
# find all words that either start with t 
re.findall(r"\bt\S*", ratsandcats)

['there', 'this', 'there', 'that']

In [93]:
re.findall(r"\S*t\b", ratsandcats)

['rat', 'that', 'cat']

In [105]:
t_words = re.findall(r"(\bt\S*)|(\S*t\b)", ratsandcats)

In [106]:
t_words

[('there', ''),
 ('this', ''),
 ('', 'rat'),
 ('there', ''),
 ('that', ''),
 ('', 'rat'),
 ('', 'cat')]

In [102]:
t_words2 = re.findall(r"(\bt\S+)|(\S+t\b)", ratsandcats)
t_words2

[('there', ''),
 ('this', ''),
 ('', 'rat'),
 ('there', ''),
 ('that', ''),
 ('', 'rat'),
 ('', 'cat')]

In [None]:
ratsandcats = "there is this rat and there is that rat cat"

In [108]:
t_words3 = re.findall(r"(\bt\w+|\w+t\b)", ratsandcats)
t_words3

['there', 'this', 'rat', 'there', 'that', 'rat', 'cat']

In [112]:
[i[0]+i[1] for i in t_words]

['there', 'this', 'rat', 'there', 'that', 'rat', 'cat']

### Pokemon exercise

In [113]:
poke = "VenusaurMega Venusaur"

In [114]:
re.sub("^Venusaur", "" , poke)

'Mega Venusaur'

In [115]:
re.sub("\AVenusaur", "" , poke)

'Mega Venusaur'

### `?` Makes an expression optional

In [116]:
color = "British people say colour but Americans say color"

In [118]:
re.sub("B\w*","American", color)

'American people say colour but Americans say color'

#### Raw text

Prevents built-in python especial characters to mess with regular expressions

In [119]:
print("\tbefore this sentence there is a large whitespace, known as tab")

	before this sentence there is a large whitespace, known as tab


In [120]:
print(r"\bI don't have a tab anymore")

\bI don't have a tab anymore


#### Compiling the regex expression to reuse it

In [122]:
pattern = re.compile(r"abc")
pattern

re.compile(r'abc', re.UNICODE)

In [123]:
re.findall(pattern, "oiwejabcoiqwefj")

['abc']

In [124]:
re.findall(r"abc","oiwejabcoiqwefj")

['abc']

### Using regex in pandas dataframes

In [125]:
import pandas as pd

In [127]:
name = ["YIDI CHI","Nick Deitmers","Erik Termes","Carla Feriche","Víctor López","Marc Sarrau","Reka Varga","Oscar Tomás","Anna Fonte" ,"Pol Serramalera","André Santa Clara","Miguel Simon","Miguel Chacón","Áine Gates","Alex Gómez","Daria Gavrilova","Raul Castrillo","Sergio Monge","Filipe Santos","Sara Peña","Maria Gonzalez","Pau Sancho","Arnau Angerri","Jorge 'Yuyu' Gonzalez","Toni Espadas"]
favourite_animal = ["dog", "Dolphin", "Cats", "Elephant", "Tiger", "I love the tropical fish when scubbadiving", "Panda", "Wolf", "Fish", "Otter", "Orca", "Tiger", "Nan", "Nan", "Squirrel", "None", "ants", "Chimp", "tiger", "Dog", "Panda", "cat", "Eagle", "Dog", "Hippogriff"]
coronavirus = [0,1,0,0,0,0,0,0,0,0,0,0,"NaN","NaN",0,0,0,0,0,0,0,0,0,0,0]
drive_license = ['B','B','B','B','None','B','B','B','B','A1,B','B','B','NaN','NaN','AM, B','B','A','B','B','None','B','B','B','B','B']
Heights = [163, 185, 182, 170, 171, 183, 161, 178, 168, 178, 179, 185, "N/A", "N/A", 187, 165, 178, 181, 169, 170, 164 ,178, 172, 175, 174]
Own_vehicle=[0,1,1,1,0,0,0,0,0,1,1,1,None,None,1,1,1,1,0,0,0,1,1,0,1]
favourite_food = ["Japanese", "Indian", "Spanish", "Pasta", "Everything", "Everything", "Curry", "burger", "Cheese", "Mediterranean", "Mexican", "Mexican", "Nan", "Nan", "Seafood pasta", "Burger", "Steak", "Sushi", "Picanha", "Ajiaco", "sushi", "sushi", "Catalan", "Octopus", "Burger"]
nicest_TA = ["Mar","Pol","Javi","Mar","Mar","Can't decide","Mar","Javi","Pol","Mar","Javi","Javi", " "," ","Mar","Mar","Sea","Pol","trying to find out","Mar","Javi","Pol","Javi","Javi","Mar"] 
beers_consumed_last_Ironbeers = [0, 4, 4-5, 0, 1, 0, 0, 0, 4, 0, 4, 3, 0, 0, 1, 0, 2, 5, "too many to be counted", 3, 4, 0, "NaN", 4, 2]
pets=[0,1,1,1,1,0,0,0,0,0,0,0,0,"none",0,0,1,0,0,1,0,1,0,0,0]
film_genre = ["Comedy","Action","Thriller","Comedy","Thriller","Horror, Suspense, Comedy","Comedy","Drama","Drama","Thriller","Comedy","Horror","Horror","Null","Comedy","Action","Thriller","Action","Action","Drama","Action","Thriller","Cience Fiction","Action","Adventure"]
sex_week = ["NaN","No","Wrote some sexy code.","Nan","Yes","No","Yes","No","No","Yes","Yes, with code. Python love","yeah baby","Nan","NaN","Yes","yeap","Yes","NaN","NaN","lots", "plenty", "not yet","Kind of","No","NaN"]
Siblings= [1, 2, 1, 2, 1, 3, 2, 1 ,0, 3, 2, 1, None, None, 1, 2, 2, 1, 1, 2, 1, 0, 2, 0, 1]
favourite_season = ["Spring", "Summer", "Summer", "Spring", "Autumn", "Spring", "Spring", "Winter", "Spring", "Spring", "Summer", "Spring", "", "", "Summer", "Winter", "Spring", "Summer", "Summer", "Spring", "Summer", "Autumn", "Spring", "Summer", "Spring"]
coffee = [1, 0, 3, 0, 0, 0, 2, 1, 3, 0, 3, 0, "NaN", "NaN", 0, 1, 1, 1, 4, 0, 2, 2, 3, 2, 0]
crush_javi = ["nan", "Yeah", "nan", "nan", "Pythonically","That accent tho, talk dirty to me", "No", "nan","Encoded answer", "Too young for me", "Yes", "Since the first day", "nan", "nan", "Not yet", "nan", "Of course", "nan", "nan", "can't help it","Of course", "No", "Platonicaly", "nan", "nan"]
sports_per_week = [3, 3, "6 usually, 4 in bootcamp", 2, 1, "It depends", 4, 4, 0, 3, 3, 3, "NaN", "NaN", 3, 0, 0, 3, 3,3, 5, 2, 3, 4, 3]
ironbeers = ['no', 'Yes', 'Yes', 'No', 'Virtual', 'Nothing', 'nothing', 'no', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'NaN', 'Virtual', 'no', 'Virtual', 'Yes', '0', 'Yes', 'Yes', 'No', 'YES, I cleaned the mess', 'Yes', 'Yes']
football_team = ['None','Real Madrid','FCB','RCDE','Sant Andreu','None','FCB','None','FCB','RCD ESPANYOL','Benfica','Sabadell','Steelers','','FC Barcelona','None','Atletico de Madrid','Fc Barcelona','Benfica','None','None','F.C. Barcelona','FC Barcelona','FC Barcelona','Referee VAR Team']
sex_partners = ["N/A", 12, 99, 1, 1, 1, 99, 31, 10, 20, 69, 40, 69, "N/A", 4, 1, "N/A", 7, 31, 6, 1, 2, 3245, 1, "N/A"]
Favourite_sport = ['swim', 'Kitesurf', 'Muay Thai & MMA', 'Basket', 'Football', 'Artistic Rollerskating', 'Cycling', 'Ski', 'Swimming', 'Surf', 'Surf', 'Mountain hike', None, None, 'Padel', 'Mountain Ski', 'Football', 'Football', 'Football', 'Cycling', 'Yoga', 'quidditch', 'Football', 'Hiking', 'Quidditch']
music_genre= ["Pop","Hip Hop","Techno, Rock, Rap","Deep house","Hip Hop","EDM, Tropical House, Pop","NaN","Techno","Flamenco","Hip Hop","Disney","Trap","Hip Hop","NaN","Pop","Electronic","NaN","Punk-rock,Hip Hop","Reggaeton","Gospel","Pop" ,"Jazz","Techno,Hard-rock","Pop-rock","Classical Music"]

In [128]:
data_analysts = pd.DataFrame({
    "names":name,
    "favourite_animal":favourite_animal,
    "coronavirus":coronavirus,
    "drive_license":drive_license,
    "Heights":Heights,
    "Own_vehicle":Own_vehicle,
    "favourite_food":favourite_food,
    "nicest_TA":nicest_TA,
    "beers_consumed":beers_consumed_last_Ironbeers,
    "pets":pets,
    "film_genre":film_genre,
    "sex_week":sex_week,
    "Siblings":Siblings,
    "favourite_season":favourite_season,
    "coffee":coffee,
    "crush_javi":crush_javi,
    "sports_per_week":sports_per_week,
    "ironbeers":ironbeers,
    "football_team":football_team,
    "sex_partners":sex_partners,
    "Favourite_sport":Favourite_sport,
    "music_genre":music_genre
})

In [129]:
data_analysts.head()

Unnamed: 0,names,favourite_animal,coronavirus,drive_license,Heights,Own_vehicle,favourite_food,nicest_TA,beers_consumed,pets,film_genre,sex_week,Siblings,favourite_season,coffee,crush_javi,sports_per_week,ironbeers,football_team,sex_partners,Favourite_sport,music_genre
0,YIDI CHI,dog,0,B,163,0.0,Japanese,Mar,0,0,Comedy,,1.0,Spring,1,,3,no,,,swim,Pop
1,Nick Deitmers,Dolphin,1,B,185,1.0,Indian,Pol,4,1,Action,No,2.0,Summer,0,Yeah,3,Yes,Real Madrid,12.0,Kitesurf,Hip Hop
2,Erik Termes,Cats,0,B,182,1.0,Spanish,Javi,-1,1,Thriller,Wrote some sexy code.,1.0,Summer,3,,"6 usually, 4 in bootcamp",Yes,FCB,99.0,Muay Thai & MMA,"Techno, Rock, Rap"
3,Carla Feriche,Elephant,0,B,170,1.0,Pasta,Mar,0,1,Comedy,Nan,2.0,Spring,0,,2,No,RCDE,1.0,Basket,Deep house
4,Víctor López,Tiger,0,,171,0.0,Everything,Mar,1,1,Thriller,Yes,1.0,Autumn,0,Pythonically,1,Virtual,Sant Andreu,1.0,Football,Hip Hop


In [None]:
# We will replace anything starting with "no" (case insensitive) to "No"
# "No", "no", "Nothing" and "nothing" should become "No"

In [130]:
data_analysts["ironbeers"].value_counts()

Yes                        11
No                          3
no                          3
Virtual                     3
Nothing                     1
NaN                         1
nothing                     1
YES, I cleaned the mess     1
0                           1
Name: ironbeers, dtype: int64

In [131]:
no_regex = re.compile(r"^[nN][oO].*")

# ^     ---> starting with
# [nN]  ---> either "n" or "N"
# o     ---> followed by either "o" or "O"
# .*    ---> followed by either nothing or any character/s

In [132]:
# a compiled regex can be used within the replace function
data_analysts["ironbeers"].replace(to_replace=no_regex, value="No").value_counts()

Yes                        11
No                          8
Virtual                     3
YES, I cleaned the mess     1
0                           1
NaN                         1
Name: ironbeers, dtype: int64

In [133]:
yes_regex = re.compile("^ye.*", flags=re.IGNORECASE)

# ^     ---> starting with
# ye    ---> "ye"
# .*    ---> followed by either nothing or any character/s

In [134]:
data_analysts["ironbeers"].replace(to_replace=yes_regex, value="yes").value_counts()

yes        12
No          3
no          3
Virtual     3
Nothing     1
NaN         1
nothing     1
0           1
Name: ironbeers, dtype: int64

In [135]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [136]:
# load the cars dataset!
cars = pd.read_csv("/content/drive/MyDrive/OLD_CURR/UNIT2/DAY2/data/vehicles.csv")

In [137]:
# the "Transmission" column is a mess. These are its unique values:
cars.Transmission.unique()

array(['Automatic 3-spd', 'Automatic 4-spd', 'Manual 5-spd',
       'Automatic (S5)', 'Manual 6-spd', 'Automatic 5-spd', 'Auto(AM8)',
       'Auto(AM-S8)', 'Auto(AV-S7)', 'Automatic (S6)', 'Automatic (S9)',
       'Automatic (S4)', 'Auto(AM-S9)', 'Automatic (S7)', 'Auto(AM7)',
       'Auto(AM-S7)', 'Auto(AM6)', 'Automatic 6-spd', 'Manual 4-spd',
       'Automatic (S8)', 'Manual(M7)', 'Auto(AM-S6)',
       'Automatic (variable gear ratios)', 'Automatic (AV)',
       'Auto(AV-S8)', 'Automatic (AM6)', 'Automatic 8-spd', 'Auto(A1)',
       'Automatic (A1)', 'Automatic (A6)', 'Auto(AV-S6)', 'Manual 3-spd',
       'Manual 7-spd', 'Automatic 9-spd', 'Auto (AV)', 'Automatic 6spd',
       'Auto(L4)', 'Auto(L3)', 'Auto (AV-S6)', 'Auto (AV-S8)',
       'Automatic (AV-S6)', 'Automatic 7-spd', 'Manual 5 spd',
       'Auto(AM5)', 'Automatic (AM5)'], dtype=object)

In [None]:
re.compiler(r"\A   [Aa]   uto  .8")

In [139]:
# the "Transmission" column is a mess. These are its unique values:
cars['Transmission'].unique()

array(['Automatic 3-spd', 'Automatic 4-spd', 'Manual 5-spd',
       'Automatic (S5)', 'Manual 6-spd', 'Automatic 5-spd', 'Auto(AM8)',
       'Auto(AM-S8)', 'Auto(AV-S7)', 'Automatic (S6)', 'Automatic (S9)',
       'Automatic (S4)', 'Auto(AM-S9)', 'Automatic (S7)', 'Auto(AM7)',
       'Auto(AM-S7)', 'Auto(AM6)', 'Automatic 6-spd', 'Manual 4-spd',
       'Automatic (S8)', 'Manual(M7)', 'Auto(AM-S6)',
       'Automatic (variable gear ratios)', 'Automatic (AV)',
       'Auto(AV-S8)', 'Automatic (AM6)', 'Automatic 8-spd', 'Auto(A1)',
       'Automatic (A1)', 'Automatic (A6)', 'Auto(AV-S6)', 'Manual 3-spd',
       'Manual 7-spd', 'Automatic 9-spd', 'Auto (AV)', 'Automatic 6spd',
       'Auto(L4)', 'Auto(L3)', 'Auto (AV-S6)', 'Auto (AV-S8)',
       'Automatic (AV-S6)', 'Automatic 7-spd', 'Manual 5 spd',
       'Auto(AM5)', 'Automatic (AM5)'], dtype=object)

In [157]:
regex = re.compile(r"^Auto\s*\(\w{1}")

for string in cars['Transmission'].unique().tolist():
    if ( re.findall(regex,string) ):
        print(re.findall(regex,string))

['Auto(A']
['Auto(A']
['Auto(A']
['Auto(A']
['Auto(A']
['Auto(A']
['Auto(A']
['Auto(A']
['Auto(A']
['Auto(A']
['Auto(A']
['Auto (A']
['Auto(L']
['Auto(L']
['Auto (A']
['Auto (A']
['Auto(A']


In [None]:
regex = re.compile(r"^A\w+\s8")

for string in cars['Transmission'].unique().tolist():
    if ( re.findall(regex,string) ):
        print(re.findall(regex,string))

In [140]:
# We just wanna distinguis cars with automatic & manual transmission

auto = re.compile(r"\A[Aa]uto.*")

re.compiler(r"\A\s{3}[Aa]\s{3}uto\s{2}.8")

# \A    ---> at the beginning of a string...
# [Aa]  ---> either "A" or "a"
# uto   ---> followed by "uto"
# .*    ---> followed by either nothing or any character/s

In [141]:
# We will create a new column, "Automatic"
# and set "True" to the values that match the "auto" regex we created
cars["Automatic"] = cars["Transmission"].str.replace(auto, "True")

In [142]:
# We will set everything else to "False"
cars.loc[cars["Automatic"]!="True", "Automatic"]="False"

In [143]:
# And we will convert the column to boolean
boolean = {"True":True, "False":False}
cars["Automatic"] = cars["Automatic"].map(boolean)

In [144]:
cars["Automatic"].value_counts()

True     24290
False    11662
Name: Automatic, dtype: int64