## 24. Pandas: Check If Value of Column Is Contained in Another Column in the Same Row

In [1]:
import pandas as pd
df = pd.read_csv("../csv/movie_metadata.csv")

In [2]:
df[['movie_title', 'plot_keywords', 'country']].head(10)

Unnamed: 0,movie_title,plot_keywords,country
0,Avatar,avatar|future|marine|native|paraplegic,USA
1,Pirates of the Caribbean: At World's End,goddess|marriage ceremony|marriage proposal|pi...,USA
2,Spectre,bomb|espionage|sequel|spy|terrorist,UK
3,The Dark Knight Rises,deception|imprisonment|lawlessness|police offi...,USA
4,Star Wars: Episode VII - The Force Awakens ...,,
5,John Carter,alien|american civil war|male nipple|mars|prin...,USA
6,Spider-Man 3,sandman|spider man|symbiote|venom|villain,USA
7,Tangled,17th century|based on fairy tale|disney|flower...,USA
8,Avengers: Age of Ultron,artificial intelligence|based on comic book|ca...,USA
9,Harry Potter and the Half-Blood Prince,blood|book|love|potion|professor,UK


## Step 1: Check If String Column Contains Substring of Another with Function

In [3]:
def find_value_column(row):
    return row.country in row.movie_title

df.country.fillna('_', inplace=True)
df[df.apply(find_value_column, axis=1)][['movie_title', 'country']].head(10)

Unnamed: 0,movie_title,country
196,Australia,Australia
2504,"McFarland, USA",USA


In [4]:
for row in df.loc[df.plot_keywords.isnull(), 'plot_keywords'].index:
    df.at[row, 'plot_keywords'] = []

In [5]:
def find_value_column(row):
    return row.movie_title.lower().strip() in row.plot_keywords

df[df.apply(find_value_column, axis=1)][['movie_title', 'plot_keywords']].head(10)

Unnamed: 0,movie_title,plot_keywords
0,Avatar,avatar|future|marine|native|paraplegic
22,Robin Hood,1190s|archer|england|king of england|robin hood
25,King Kong,animal name in title|ape abducts a woman|goril...
26,Titanic,artist|love|ship|titanic|wet
33,Alice in Wonderland,alice in wonderland|mistaking reality for drea...
130,Thor,battle|marvel cinematic universe|scientist|tho...
145,Pan,1940s|child hero|fantasy world|orphan|referenc...
147,Troy,greek|mythology|prince|trojan|troy
150,Ghostbusters,ghost|ghostbuster|ghostbusters|male objectific...
160,Star Trek,box office hit|future|lifted by the throat|sta...


## Step 2: Check If Column contains another column with lambda

In [6]:
df[df.apply(lambda x: x.country in x.movie_title, axis=1)][['movie_title', 'country']].head(10)

Unnamed: 0,movie_title,country
196,Australia,Australia
2504,"McFarland, USA",USA


In [7]:
# Warning for common error
df.apply(lambda row: df.country in df.movie_title, axis=1)

TypeError: ("'Series' objects are mutable, thus they cannot be hashed", 'occurred at index 0')

## Step 3: Fastest Way to Check If One Column Contains Another

In [8]:
df['country'].fillna('Uknown', inplace=True)

In [9]:
df[[x[0] in x[1] for x in zip(df['country'], df['movie_title'])]][['movie_title', 'country']]

Unnamed: 0,movie_title,country
196,Australia,Australia
2504,"McFarland, USA",USA


## Step 4: For Loop and df.iterrows() Version

In [10]:
for i, row in df.iterrows():
    if row.country in row.movie_title:
        print(row.country, row.movie_title)

Australia Australia 
USA McFarland, USA 


## Bonus Step: Check If List Column Contains Substring of Another with Function

In [15]:
df['keywords'] = df.plot_keywords.str.split('|')

In [16]:
df['keywords']

0            [avatar, future, marine, native, paraplegic]
1       [goddess, marriage ceremony, marriage proposal...
2               [bomb, espionage, sequel, spy, terrorist]
3       [deception, imprisonment, lawlessness, police ...
4                                                     NaN
                              ...                        
5038         [fraud, postal worker, prison, theft, trial]
5039    [cult, fbi, hideout, prison escape, serial kil...
5040                                                  NaN
5041                                                  NaN
5042    [actress name in title, crush, date, four word...
Name: keywords, Length: 5043, dtype: object

In [17]:
def find_value_column(row):
    if isinstance(row['keywords'], list):
        for keyword in row['keywords']:
            return keyword in row.movie_title.lower()
    else:
        return False

df[df.apply(find_value_column, axis=1)][['movie_title', 'keywords']].head()

Unnamed: 0,movie_title,keywords
0,Avatar,"[avatar, future, marine, native, paraplegic]"
9,Harry Potter and the Half-Blood Prince,"[blood, book, love, potion, professor]"
33,Alice in Wonderland,"[alice in wonderland, mistaking reality for dr..."
68,Monsters vs. Aliens,"[alien, alien invasion, alien space craft, gia..."
77,G.I. Joe: The Rise of Cobra,"[cobra, gi joe, snake, train, warhead]"


In [18]:
df['keywords'] = df['keywords'].apply(lambda d: d if isinstance(d, list) else [])

In [19]:
def find_value_column(row):
    for keyword in row['keywords']:
        return keyword in row.movie_title.lower()
    return False

df[df.apply(find_value_column, axis=1)][['movie_title', 'keywords']].head()

Unnamed: 0,movie_title,keywords
0,Avatar,"[avatar, future, marine, native, paraplegic]"
9,Harry Potter and the Half-Blood Prince,"[blood, book, love, potion, professor]"
33,Alice in Wonderland,"[alice in wonderland, mistaking reality for dr..."
68,Monsters vs. Aliens,"[alien, alien invasion, alien space craft, gia..."
77,G.I. Joe: The Rise of Cobra,"[cobra, gi joe, snake, train, warhead]"


## Performance

In [11]:
%%timeit
def find_value_column(row):
    return row.country in row.movie_title

df[df.apply(find_value_column, axis=1)]

10 loops, best of 3: 154 ms per loop


In [12]:
%%timeit
df[df.apply(lambda x: x.country in x.movie_title, axis=1)]

10 loops, best of 3: 155 ms per loop


In [13]:
%%timeit
df[[x[0] in x[1] for x in zip(df['country'], df['movie_title'])]]

1000 loops, best of 3: 1.76 ms per loop


In [14]:
%%timeit
for i, row in df.iterrows():
    if row.country in row.movie_title:
        pass

1 loop, best of 3: 599 ms per loop
