In [1]:
import re

### RegEx 

https://docs.python.org/3/library/re.html

Functions:

- `findall`	Returns a list containing all matches
- `search`	Returns a Match object if there is a match anywhere in the string. If there is more than one match, only the first occurrence of the match will be returned.
- `split`	Returns a list where the string has been split at each match
- `sub`	Replaces one or many matches with a string

In [2]:
my_string = "Raul is a great TA, he deserves some holidays. Maybe in Raul would like to travel to Bahamas?"

In [3]:
# return all occurrances of 'Raul' using re.findall()

re.findall("Raul", my_string)

['Raul', 'Raul']

In [4]:
# use re.split() to split my_string whenever there's a comma
re.split(",", my_string)

['Raul is a great TA',
 ' he deserves some holidays. Maybe in Raul would like to travel to Bahamas?']

In [5]:
my_string.split(",")

['Raul is a great TA',
 ' he deserves some holidays. Maybe in Raul would like to travel to Bahamas?']

In [8]:
# use re.sub() to replace "TA" by "Teacher Assistant"

my_string = re.sub("TA", "Teacher Assistant", my_string)

In [7]:
my_string.replace("TA","Teacher Assistant")

'Raul is a great Teacher Assistant, he deserves some holidays. Maybe in Raul would like to travel to Bahamas?'

In [9]:
my_string

'Raul is a great Teacher Assistant, he deserves some holidays. Maybe in Raul would like to travel to Bahamas?'

In [None]:
# use re.search() to find "e" in my_string. Explore the "match object" returned

In [10]:
"e" in my_string

True

In [12]:
TA_match = re.search("e", my_string)
TA_match

<re.Match object; span=(12, 13), match='e'>

**The Match object** has properties and methods used to retrieve information about the search, and the result:

- `.span()` returns a tuple containing the start-, and end positions of the match.
- `.string` returns the string passed into the function
- `.group()` returns the part of the string where there was a match

In [13]:
TA_match.group()

'e'

### Metacharacters

Some characters are special metacharacters, and don’t match themselves. Instead, they signal that some out-of-the-ordinary thing should be matched, or they affect other portions of the RE by repeating them or changing their meaning.

` . ^ $ * + ? { } [ ] \ | ( )`

 #### `[]` means set of characters:
 
 - `[abc]` will match any of the characters a, b, or c
 - `[a-c]` will do the same
 - `[a-z]` will match any lowercase letter

In [14]:
alphanumeric = "4298fsfsDFGHv012rvv21v9"

In [15]:
re.findall("[A-z]", alphanumeric)

['f', 's', 'f', 's', 'D', 'F', 'G', 'H', 'v', 'r', 'v', 'v', 'v']

In [16]:
re.search("[A-z]", alphanumeric)

<re.Match object; span=(4, 5), match='f'>

#### `\` Signals a special sequence (can also be used to escape special characters)

In [17]:
string_with_specials = "fea8b21[a-m]39f fewv02"
re.findall("[a-m]", string_with_specials)

['f', 'e', 'a', 'b', 'a', 'm', 'f', 'f', 'e']

In [None]:
#[ letter in string_with_specials for letter in ['a','b','c','d','e']]

In [18]:
# use \ to escape the square brackets
re.findall("\[a-m]", string_with_specials)

['[a-m]']

#### Some special sequences:

- `\A`- Returns a match if the specified characters are at the beginning of the string
- `\b` - Returns a match where the specified characters are at the beginning or at the end of a word
- `\d` - 	Returns a match where the string contains digits (numbers from 0-9) (`\D` for where the string DOES NOT contain digits)
- `\s`- Returns a match where the string contains a white space character (`\S` for where the string DOES NOT contain a white space)

In [19]:
strings = ["there is this rat and there is that cat", 
           "if you capitalize this string you die",
           "this is the end"]

In [24]:
# Use a special sequence to capitalize the strings above without dying
for string in strings:
    print(re.sub("^t", "T", string)) 

There is this rat and there is that cat
if you capitalize this string you die
This is the end


In [25]:
some_nums = "I have had 3 coffees this morning and I plan to drink 7 more"

In [26]:
# use a special sequence to find the numbers in the string above
re.findall("\d", some_nums)

['3', '7']

### `.`	Any character (except newline character)
### `*`	Zero or more occurences


In [27]:
similar_words = ["hey", "hay", "how", "h i j k", "h", "ha", "oops"]


In [31]:
# use "." and "*" to return everything following an "h" (including the "h")
for string in similar_words:
    print(string, re.search("h.",string))

hey <re.Match object; span=(0, 2), match='he'>
hay <re.Match object; span=(0, 2), match='ha'>
how <re.Match object; span=(0, 2), match='ho'>
h i j k <re.Match object; span=(0, 2), match='h '>
h None
ha <re.Match object; span=(0, 2), match='ha'>
oops None


In [32]:
for string in similar_words:
    print(string, re.search("h*",string))

hey <re.Match object; span=(0, 1), match='h'>
hay <re.Match object; span=(0, 1), match='h'>
how <re.Match object; span=(0, 1), match='h'>
h i j k <re.Match object; span=(0, 1), match='h'>
h <re.Match object; span=(0, 1), match='h'>
ha <re.Match object; span=(0, 1), match='h'>
oops <re.Match object; span=(0, 0), match=''>


In [33]:
stupid_string = ["yessss", "yes", "yeah", "yep", "ye", "no", "nothing"]

In [42]:
# replace all affirmative strings with "yes"
for string in stupid_string:
    print(re.sub("y.*","yes",string))

yes
yes
yes
yes
yes
no
nothing


In [None]:
new_stupid_string

### `+`One or more occurrences

In [44]:
# use re.sub() together with + to fix the occurrance of too many whitespaces
spaces = "I   have too   many     spaces"
#re.sub(" {1,}"," ",spaces)
re.sub(" +"," ",spaces)

'I have too many spaces'

### `{}`- Exactly the specified number of occurrences

In [45]:
# we only want to fix spaces if there are more than 4
spaces2 = "I   have a  lot   of     spaces here   but                this is too much"
re.sub(" {1,}"," ",spaces2)

'I have a lot of spaces here but this is too much'

In [46]:
re.sub(" {1,3}"," ",spaces2)

'I have a lot of  spaces here but      this is too much'

In [48]:
re.sub("( {1,3}| {1,4})"," ",spaces2)

'I have a lot of  spaces here but      this is too much'

### `^`- Starts with

In [58]:
# print all veggies that start with a
veggies = ["tomato", "potato", "apple juice",
           "pear", "asparagus are tasty", "peach"]

for veggi in veggies:
    print(re.findall("^a.* ",veggi))

[]
[]
['apple ']
[]
['asparagus are ']
[]


### `()`Group a regular expression, so that you can use regex operators on it...

In [55]:
ratsandcats = "there is this rat and there is that cat"

In [62]:
# find all words that either start with t or end with t
re.findall(r"\bt\S*", ratsandcats)

['there', 'this', 'there', 'that']

In [63]:
re.findall(r"\S*t\b", ratsandcats)

['rat', 'that', 'cat']

In [64]:
t_words = re.findall(r"(\bt\S*)|(\S*t\b)", ratsandcats)

In [65]:
t_words

[('there', ''),
 ('this', ''),
 ('', 'rat'),
 ('there', ''),
 ('that', ''),
 ('', 'cat')]

In [66]:
[i[0]+i[1] for i in t_words]

['there', 'this', 'rat', 'there', 'that', 'cat']

### Some exercises

In [67]:
poke = "VenusaurMega Venusaur"

In [69]:
# Capitalize all strings and words that come after special characters like : or ?

quotes = ["work hard all day, all days", 
          "There are 3 types of people: those who can count and those who can't",
          "Nice to be nice",
          "Some people feel the rain, others just get wet",
          "could you complete the exercise? wow"
         ]

In [91]:
# Look for a match and if there is any, look in which index of the string it is and then capitalize.
# Split
# Capitalize
# Join

for quote in quotes:
    if ( re.search("(\:|\?) \w.*",quote) ):
        our_list = re.split("(\:|\?) ",quote) 
        #our_list.pop(1)
        our_list[2] = our_list[2].capitalize()
        #our_list[2] = our_list[2].upper()
        our_string = " ".join(our_list)
        print(our_string)
        #print(re.search("(\:|\?) \w.*",quote))
    else:
        print(quote)

work hard all day, all days
There are 3 types of people : THOSE WHO CAN COUNT AND THOSE WHO CAN'T
Nice to be nice
Some people feel the rain, others just get wet
could you complete the exercise ? WOW


### `?` Makes the preceeding expression optional

In [None]:
color = "British people say colour but Americans say color"

#### Raw text

Prevents built-in python especial characters to mess with regular expressions

In [None]:
print("\tbefore this sentence there is a large whitespace, known as tab")

In [None]:
print(r"\bI don't have a tab anymore")

#### Compiling the regex expression to reuse it

In [92]:
pattern = re.compile(r"abc")

In [93]:
re.findall(pattern, "oiwejabcoiqwefj")

['abc']

### Using regex in pandas dataframes

In [94]:
import pandas as pd

In [96]:
name = ["YIDI CHI","Nick Deitmers","Erik Termes","Carla Feriche","Víctor López","Marc Sarrau","Reka Varga","Oscar Tomás","Anna Fonte" ,"Pol Serramalera","André Santa Clara","Miguel Simon","Miguel Chacón","Áine Gates","Alex Gómez","Daria Gavrilova","Raul Castrillo","Sergio Monge","Filipe Santos","Sara Peña","Maria Gonzalez","Pau Sancho","Arnau Angerri","Jorge 'Yuyu' Gonzalez","Toni Espadas"]
favourite_animal = ["dog", "Dolphin", "Cats", "Elephant", "Tiger", "I love the tropical fish when scubbadiving", "Panda", "Wolf", "Fish", "Otter", "Orca", "Tiger", "Nan", "Nan", "Squirrel", "None", "ants", "Chimp", "tiger", "Dog", "Panda", "cat", "Eagle", "Dog", "Hippogriff"]
coronavirus = [0,1,0,0,0,0,0,0,0,0,0,0,"NaN","NaN",0,0,0,0,0,0,0,0,0,0,0]
drive_license = ['B','B','B','B','None','B','B','B','B','A1,B','B','B','NaN','NaN','AM, B','B','A','B','B','None','B','B','B','B','B']
Heights = [163, 185, 182, 170, 171, 183, 161, 178, 168, 178, 179, 185, "N/A", "N/A", 187, 165, 178, 181, 169, 170, 164 ,178, 172, 175, 174]
Own_vehicle=[0,1,1,1,0,0,0,0,0,1,1,1,None,None,1,1,1,1,0,0,0,1,1,0,1]
favourite_food = ["Japanese", "Indian", "Spanish", "Pasta", "Everything", "Everything", "Curry", "burger", "Cheese", "Mediterranean", "Mexican", "Mexican", "Nan", "Nan", "Seafood pasta", "Burger", "Steak", "Sushi", "Picanha", "Ajiaco", "sushi", "sushi", "Catalan", "Octopus", "Burger"]
nicest_TA = ["Mar","Pol","Javi","Mar","Mar","Can't decide","Mar","Javi","Pol","Mar","Javi","Javi", " "," ","Mar","Mar","Sea","Pol","trying to find out","Mar","Javi","Pol","Javi","Javi","Mar"] 
beers_consumed_last_Ironbeers = [0, 4, 4-5, 0, 1, 0, 0, 0, 4, 0, 4, 3, 0, 0, 1, 0, 2, 5, "too many to be counted", 3, 4, 0, "NaN", 4, 2]
pets=[0,1,1,1,1,0,0,0,0,0,0,0,0,"none",0,0,1,0,0,1,0,1,0,0,0]
film_genre = ["Comedy","Action","Thriller","Comedy","Thriller","Horror, Suspense, Comedy","Comedy","Drama","Drama","Thriller","Comedy","Horror","Horror","Null","Comedy","Action","Thriller","Action","Action","Drama","Action","Thriller","Cience Fiction","Action","Adventure"]
sex_week = ["NaN","No","Wrote some sexy code.","Nan","Yes","No","Yes","No","No","Yes","Yes, with code. Python love","yeah baby","Nan","NaN","Yes","yeap","Yes","NaN","NaN","lots", "plenty", "not yet","Kind of","No","NaN"]
Siblings= [1, 2, 1, 2, 1, 3, 2, 1 ,0, 3, 2, 1, None, None, 1, 2, 2, 1, 1, 2, 1, 0, 2, 0, 1]
favourite_season = ["Spring", "Summer", "Summer", "Spring", "Autumn", "Spring", "Spring", "Winter", "Spring", "Spring", "Summer", "Spring", "", "", "Summer", "Winter", "Spring", "Summer", "Summer", "Spring", "Summer", "Autumn", "Spring", "Summer", "Spring"]
coffee = [1, 0, 3, 0, 0, 0, 2, 1, 3, 0, 3, 0, "NaN", "NaN", 0, 1, 1, 1, 4, 0, 2, 2, 3, 2, 0]
crush_javi = ["nan", "Yeah", "nan", "nan", "Pythonically","That accent tho, talk dirty to me", "No", "nan","Encoded answer", "Too young for me", "Yes", "Since the first day", "nan", "nan", "Not yet", "nan", "Of course", "nan", "nan", "can't help it","Of course", "No", "Platonicaly", "nan", "nan"]
sports_per_week = [3, 3, "6 usually, 4 in bootcamp", 2, 1, "It depends", 4, 4, 0, 3, 3, 3, "NaN", "NaN", 3, 0, 0, 3, 3,3, 5, 2, 3, 4, 3]
ironbeers = ['no', 'Yes', 'Yes', 'No', 'Virtual', 'Nothing', 'nothing', 'no', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'NaN', 'Virtual', 'no', 'Virtual', 'Yes', '0', 'Yes', 'Yes', 'No', 'YES, I cleaned the mess', 'Yes', 'Yes']
football_team = ['None','Real Madrid','FCB','RCDE','Sant Andreu','None','FCB','None','FCB','RCD ESPANYOL','Benfica','Sabadell','Steelers','','FC Barcelona','None','Atletico de Madrid','Fc Barcelona','Benfica','None','None','F.C. Barcelona','FC Barcelona','FC Barcelona','Referee VAR Team']
sex_partners = ["N/A", 12, 99, 1, 1, 1, 99, 31, 10, 20, 69, 40, 69, "N/A", 4, 1, "N/A", 7, 31, 6, 1, 2, 3245, 1, "N/A"]
Favourite_sport = ['swim', 'Kitesurf', 'Muay Thai & MMA', 'Basket', 'Football', 'Artistic Rollerskating', 'Cycling', 'Ski', 'Swimming', 'Surf', 'Surf', 'Mountain hike', None, None, 'Padel', 'Mountain Ski', 'Football', 'Football', 'Football', 'Cycling', 'Yoga', 'quidditch', 'Football', 'Hiking', 'Quidditch']
music_genre= ["Pop","Hip Hop","Techno, Rock, Rap","Deep house","Hip Hop","EDM, Tropical House, Pop","NaN","Techno","Flamenco","Hip Hop","Disney","Trap","Hip Hop","NaN","Pop","Electronic","NaN","Punk-rock,Hip Hop","Reggaeton","Gospel","Pop" ,"Jazz","Techno,Hard-rock","Pop-rock","Classical Music"]

In [107]:
data_analysts = pd.DataFrame({
    "names":name,
    "favourite_animal":favourite_animal,
    "coronavirus":coronavirus,
    "drive_license":drive_license,
    "Heights":Heights,
    "Own_vehicle":Own_vehicle,
    "favourite_food":favourite_food,
    "nicest_TA":nicest_TA,
    "beers_consumed":beers_consumed_last_Ironbeers,
    "pets":pets,
    "film_genre":film_genre,
    "sex_week":sex_week,
    "Siblings":Siblings,
    "favourite_season":favourite_season,
    "coffee":coffee,
    "crush_javi":crush_javi,
    "sports_per_week":sports_per_week,
    "ironbeers":ironbeers,
    "football_team":football_team,
    "sex_partners":sex_partners,
    "Favourite_sport":Favourite_sport,
    "music_genre":music_genre
})

In [108]:
data_analysts.head()

Unnamed: 0,names,favourite_animal,coronavirus,drive_license,Heights,Own_vehicle,favourite_food,nicest_TA,beers_consumed,pets,film_genre,sex_week,Siblings,favourite_season,coffee,crush_javi,sports_per_week,ironbeers,football_team,sex_partners,Favourite_sport,music_genre
0,YIDI CHI,dog,0,B,163,0.0,Japanese,Mar,0,0,Comedy,,1.0,Spring,1,,3,no,,,swim,Pop
1,Nick Deitmers,Dolphin,1,B,185,1.0,Indian,Pol,4,1,Action,No,2.0,Summer,0,Yeah,3,Yes,Real Madrid,12.0,Kitesurf,Hip Hop
2,Erik Termes,Cats,0,B,182,1.0,Spanish,Javi,-1,1,Thriller,Wrote some sexy code.,1.0,Summer,3,,"6 usually, 4 in bootcamp",Yes,FCB,99.0,Muay Thai & MMA,"Techno, Rock, Rap"
3,Carla Feriche,Elephant,0,B,170,1.0,Pasta,Mar,0,1,Comedy,Nan,2.0,Spring,0,,2,No,RCDE,1.0,Basket,Deep house
4,Víctor López,Tiger,0,,171,0.0,Everything,Mar,1,1,Thriller,Yes,1.0,Autumn,0,Pythonically,1,Virtual,Sant Andreu,1.0,Football,Hip Hop


In [None]:
# Replace anything starting with "no" (case insensitive) to "No"
# "No", "no", "Nothing" and "nothing" should become "No"

In [109]:
data_analysts["ironbeers"].value_counts()

Yes                        11
Virtual                     3
No                          3
no                          3
YES, I cleaned the mess     1
nothing                     1
NaN                         1
Nothing                     1
0                           1
Name: ironbeers, dtype: int64

In [103]:
# "nothing"
re.sub("[Nn].*","No","YES")

'YES'

In [114]:
import numpy as np

In [115]:
np.unique(list( map(lambda x: re.sub("^[Nn].*","No",x) ,data_analysts["ironbeers"]) ) )

array(['0', 'No', 'Virtual', 'YES, I cleaned the mess', 'Yes'],
      dtype='<U23')

In [110]:
data_analysts["ironbeers"] = list( map(lambda x: re.sub("^[Nn].*","No",x) ,data_analysts["ironbeers"]) )

In [111]:
data_analysts["ironbeers"].value_counts()

Yes                        11
No                          9
Virtual                     3
YES, I cleaned the mess     1
0                           1
Name: ironbeers, dtype: int64

In [None]:
# a compiled regex can be used within pandas' replace function
my_regex = re.compile(r"^[Nn].")

In [116]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [130]:
# load the cars dataset!

cars = pd.read_csv("/content/drive/MyDrive/OLD_CURR/UNIT2/DAY2/data/vehicles.csv")

In [131]:
# the "Transmission" column is a mess. These are its unique values:
cars.Transmission.unique()
cars['Transmission'].unique()

array(['Automatic 3-spd', 'Automatic 4-spd', 'Manual 5-spd',
       'Automatic (S5)', 'Manual 6-spd', 'Automatic 5-spd', 'Auto(AM8)',
       'Auto(AM-S8)', 'Auto(AV-S7)', 'Automatic (S6)', 'Automatic (S9)',
       'Automatic (S4)', 'Auto(AM-S9)', 'Automatic (S7)', 'Auto(AM7)',
       'Auto(AM-S7)', 'Auto(AM6)', 'Automatic 6-spd', 'Manual 4-spd',
       'Automatic (S8)', 'Manual(M7)', 'Auto(AM-S6)',
       'Automatic (variable gear ratios)', 'Automatic (AV)',
       'Auto(AV-S8)', 'Automatic (AM6)', 'Automatic 8-spd', 'Auto(A1)',
       'Automatic (A1)', 'Automatic (A6)', 'Auto(AV-S6)', 'Manual 3-spd',
       'Manual 7-spd', 'Automatic 9-spd', 'Auto (AV)', 'Automatic 6spd',
       'Auto(L4)', 'Auto(L3)', 'Auto (AV-S6)', 'Auto (AV-S8)',
       'Automatic (AV-S6)', 'Automatic 7-spd', 'Manual 5 spd',
       'Auto(AM5)', 'Automatic (AM5)'], dtype=object)

In [120]:
re.sub("Auto.*","Automatic",'Auto(AM-S7)')

'Automatic'

In [121]:
y = lambda x: re.sub("Auto.*","Automatic",x) if ( "Auto" in x ) else "Manual"

In [122]:
y("Auto(AM-S7)")

'Automatic'

In [123]:
y("Manual 3-spd")

'Manual'

In [124]:
np.unique(list( map(lambda x: re.sub("Auto.*","Automatic",x) if ( "Auto" in x ) else "Manual",cars['Transmission'] )))

array(['Automatic', 'Manual'], dtype='<U9')

In [133]:
cars['Transmission_new'] = list( map(lambda x: re.sub("Auto.*","Automatic",x) if ( "Auto" in x ) else "Manual",cars['Transmission']) )

In [134]:
cars['Transmission_new'].unique()

array(['Automatic', 'Manual'], dtype=object)

In [None]:
# create a column called "speeds" where you store 
# the number of speeds in a car if known or NaN if unknown

In [135]:
cars.head()

Unnamed: 0,Make,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year,Transmission_new
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950,Automatic
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550,Automatic
2,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,555.4375,2100,Automatic
3,AM General,Post Office DJ8 2WD,1985,4.2,6.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550,Automatic
4,ASC Incorporated,GNX,1987,3.8,6.0,Automatic 4-spd,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,14,21,16,555.4375,2550,Automatic


In [137]:
cars['Transmission'].unique()

array(['Automatic 3-spd', 'Automatic 4-spd', 'Manual 5-spd',
       'Automatic (S5)', 'Manual 6-spd', 'Automatic 5-spd', 'Auto(AM8)',
       'Auto(AM-S8)', 'Auto(AV-S7)', 'Automatic (S6)', 'Automatic (S9)',
       'Automatic (S4)', 'Auto(AM-S9)', 'Automatic (S7)', 'Auto(AM7)',
       'Auto(AM-S7)', 'Auto(AM6)', 'Automatic 6-spd', 'Manual 4-spd',
       'Automatic (S8)', 'Manual(M7)', 'Auto(AM-S6)',
       'Automatic (variable gear ratios)', 'Automatic (AV)',
       'Auto(AV-S8)', 'Automatic (AM6)', 'Automatic 8-spd', 'Auto(A1)',
       'Automatic (A1)', 'Automatic (A6)', 'Auto(AV-S6)', 'Manual 3-spd',
       'Manual 7-spd', 'Automatic 9-spd', 'Auto (AV)', 'Automatic 6spd',
       'Auto(L4)', 'Auto(L3)', 'Auto (AV-S6)', 'Auto (AV-S8)',
       'Automatic (AV-S6)', 'Automatic 7-spd', 'Manual 5 spd',
       'Auto(AM5)', 'Automatic (AM5)'], dtype=object)

In [145]:
re.search("\d(-| )?spd","Automatic 4spd")

<re.Match object; span=(10, 14), match='4spd'>

In [148]:
int("Automatic 4spd"[re.search("\d(-| )?spd","Automatic 4spd").span()[0]])

4

In [None]:
for car in cars['Transmission']:
    if ( re.search("\d(-| )?spd", car ) ):
        speed = int(car[re.search("\d(-| )?spd",car).span()[0]])
        print(speed)
    else:
        speed = np.nan
        print(speed)


In [152]:
y1 = lambda x: int(x[re.search("\d(-| )?spd",x).span()[0]]) if ( re.search("\d(-| )?spd", x ) ) else np.nan

In [154]:
y1("Automatic 4spd")

4

In [155]:
np.unique(list( map(y1,cars['Transmission'] )))

array([ 3.,  4.,  5., ..., nan, nan, nan])

In [156]:
cars['speeds'] = list( map(y1,cars['Transmission'] ))

In [157]:
cars.head()

Unnamed: 0,Make,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year,Transmission_new,speeds
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950,Automatic,3.0
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550,Automatic,3.0
2,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,555.4375,2100,Automatic,3.0
3,AM General,Post Office DJ8 2WD,1985,4.2,6.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550,Automatic,3.0
4,ASC Incorporated,GNX,1987,3.8,6.0,Automatic 4-spd,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,14,21,16,555.4375,2550,Automatic,4.0
