# How to Compare Titles and URLs in Pandas

In [1]:
import pandas as pd
# Load the Medium articles dataset; pandas reads the zipped CSV directly.
df = pd.read_csv(filepath_or_buffer='../data/medium_data.csv.zip')
df

Unnamed: 0,id,url,title,subtitle,image,claps,responses,reading_time,publication,date
0,1,https://towardsdatascience.com/a-beginners-gui...,A Beginner’s Guide to Word Embedding with Gens...,,1.png,850,8,8,Towards Data Science,2019-05-30
1,2,https://towardsdatascience.com/hands-on-graph-...,Hands-on Graph Neural Networks with PyTorch & ...,,2.png,1100,11,9,Towards Data Science,2019-05-30
2,3,https://towardsdatascience.com/how-to-use-ggpl...,How to Use ggplot2 in Python,A Grammar of Graphics for Python,3.png,767,1,5,Towards Data Science,2019-05-30
3,4,https://towardsdatascience.com/databricks-how-...,Databricks: How to Save Files in CSV on Your L...,When I work on Python projects dealing…,4.jpeg,354,0,4,Towards Data Science,2019-05-30
4,5,https://towardsdatascience.com/a-step-by-step-...,A Step-by-Step Implementation of Gradient Desc...,One example of building neural…,5.jpeg,211,3,4,Towards Data Science,2019-05-30
...,...,...,...,...,...,...,...,...,...,...
6503,6504,https://medium.com/better-marketing/we-vs-i-ho...,“We” vs “I” — How Should You Talk About Yourse...,Basic copywriting choices with a big…,6504.jpg,661,6,6,Better Marketing,2019-12-05
6504,6505,https://medium.com/better-marketing/how-donald...,How Donald Trump Markets Himself,Lessons from who might be the most popular bra...,6505.jpeg,189,1,5,Better Marketing,2019-12-05
6505,6506,https://medium.com/better-marketing/content-an...,Content and Marketing Beyond Mass Consumption,How to acquire customers without wasting money...,6506.jpg,207,1,8,Better Marketing,2019-12-05
6506,6507,https://medium.com/better-marketing/5-question...,5 Questions All Copywriters Should Ask Clients...,Save time and effort by…,6507.jpg,253,2,5,Better Marketing,2019-12-05


## Step 2: Convert title to slug with slugify

In [2]:
from slugify import slugify

# Slugify every title; missing titles become an empty slug.
df['title_to_url'] = df['title'].fillna('').apply(slugify)
df['title_to_url']

0       a-beginners-guide-to-word-embedding-with-gensi...
1       hands-on-graph-neural-networks-with-pytorch-py...
2                            how-to-use-ggplot2-in-python
3       databricks-how-to-save-files-in-csv-on-your-lo...
4       a-step-by-step-implementation-of-gradient-desc...
                              ...                        
6503    we-vs-i-how-should-you-talk-about-yourself-on-...
6504                     how-donald-trump-markets-himself
6505        content-and-marketing-beyond-mass-consumption
6506    5-questions-all-copywriters-should-ask-clients...
6507               how-to-write-a-good-business-blog-post
Name: title_to_url, Length: 6508, dtype: object

In [3]:
# Columns needed to compare each URL with the slug derived from its title.
cols = ['url', 'title', 'title_to_url']
df.loc[:, cols]

Unnamed: 0,url,title,title_to_url
0,https://towardsdatascience.com/a-beginners-gui...,A Beginner’s Guide to Word Embedding with Gens...,a-beginners-guide-to-word-embedding-with-gensi...
1,https://towardsdatascience.com/hands-on-graph-...,Hands-on Graph Neural Networks with PyTorch & ...,hands-on-graph-neural-networks-with-pytorch-py...
2,https://towardsdatascience.com/how-to-use-ggpl...,How to Use ggplot2 in Python,how-to-use-ggplot2-in-python
3,https://towardsdatascience.com/databricks-how-...,Databricks: How to Save Files in CSV on Your L...,databricks-how-to-save-files-in-csv-on-your-lo...
4,https://towardsdatascience.com/a-step-by-step-...,A Step-by-Step Implementation of Gradient Desc...,a-step-by-step-implementation-of-gradient-desc...
...,...,...,...
6503,https://medium.com/better-marketing/we-vs-i-ho...,“We” vs “I” — How Should You Talk About Yourse...,we-vs-i-how-should-you-talk-about-yourself-on-...
6504,https://medium.com/better-marketing/how-donald...,How Donald Trump Markets Himself,how-donald-trump-markets-himself
6505,https://medium.com/better-marketing/content-an...,Content and Marketing Beyond Mass Consumption,content-and-marketing-beyond-mass-consumption
6506,https://medium.com/better-marketing/5-question...,5 Questions All Copywriters Should Ask Clients...,5-questions-all-copywriters-should-ask-clients...


## Step 3: Find all rows where the slugified title is not in the URL

In [4]:
# Flag every row whose slugified title does not occur anywhere in its URL.
# Rows without a title stay unflagged: there is nothing to compare against.
has_title = df['title'].notna()

df['bad_title'] = False
df.loc[has_title, 'bad_title'] = [
    slug.lower() not in url
    for slug, url in zip(df.loc[has_title, 'title_to_url'], df.loc[has_title, 'url'])
]

In [5]:
# Show the flagged rows with wide columns so complete URLs and titles are readable.
with pd.option_context(
    "display.min_rows", 50,
    "display.max_rows", 200,
    "display.max_columns", 5,
    "display.max_colwidth", 200,
):
    flagged = df.loc[df['bad_title'], cols]
    display(flagged.drop_duplicates())

Unnamed: 0,url,title,title_to_url
10,https://towardsdatascience.com/what-i-learned-from-abhishek-thakur-4b905ac0fd55,"<em class=""markup--em markup--h3-em"">What I Learned from (Two-time) Kaggle Grandmaster Abhishek Thakur</em>",em-class-markup-em-markup-h3-em-what-i-learned-from-two-time-kaggle-grandmaster-abhishek-thakur-em
19,https://towardsdatascience.com/faster-training-of-efficient-cnns-657953aa080,Faster Training for Efficient CNNs,faster-training-for-efficient-cnns
20,https://towardsdatascience.com/buyers-beware-fake-product-reviews-are-plaguing-the-internet-cfc599c42b6b,"Buyers beware, fake product reviews are plaguing the internet. How Machine Learning can help to spot them.",buyers-beware-fake-product-reviews-are-plaguing-the-internet-how-machine-learning-can-help-to-spot-them
21,https://towardsdatascience.com/objects-of-desire-a-talk-on-40-gigabytes-of-4chan-pol-r-braincels-and-r-theredpill-a4f4e36a4fad,"Objects of Desire: 40 gigabytes of 4chan/pol, r/Braincels, and r/TheRedPill",objects-of-desire-40-gigabytes-of-4chan-pol-r-braincels-and-r-theredpill
22,https://towardsdatascience.com/how-data-management-practice-enables-a-successful-implementation-of-single-customer-view-part-1-f9f508bddc85,"<strong class=""markup--strong markup--h3-strong"">How data management practice enables a successful implementation of single customer view?</strong>",strong-class-markup-strong-markup-h3-strong-how-data-management-practice-enables-a-successful-implementation-of-single-customer-view-strong
23,https://uxdesign.cc/white-on-black-or-black-on-white-the-pros-and-cons-of-dark-mode-3439e37d6c6c,"<strong class=""markup--strong markup--h3-strong"">White on black or black on white? The pros and cons of Dark Mode</strong>",strong-class-markup-strong-markup-h3-strong-white-on-black-or-black-on-white-the-pros-and-cons-of-dark-mode-strong
24,https://uxdesign.cc/uxdesign-for-social-change-459203b5d3c1,UX for social change: a 5-step app design approach,ux-for-social-change-a-5-step-app-design-approach
25,https://uxdesign.cc/music-is-an-experience-not-a-science-b5c303d51a70,"“Music is an experience, not a science.” — a UX case study",music-is-an-experience-not-a-science-a-ux-case-study
31,https://uxdesign.cc/reflection-point-usability-accessibility-and-ethics-in-user-experience-design-7f95e27b8273,"Reflection point: usability, accessibility, and ethics in UX",reflection-point-usability-accessibility-and-ethics-in-ux
32,https://uxdesign.cc/building-high-quality-hypothesis-for-better-design-decision-3a18a8d00038,Building high-quality hypotheses for better design decisions,building-high-quality-hypotheses-for-better-design-decisions


In [6]:
# Why row 19 is flagged: the title slug says "for" where the URL slug says "of",
# so the substring test fails even though the two are nearly identical.
'faster-training-for-efficient-cnns' in 'tascience.com/faster-training-of-efficient-cnns-657953aa08'

False

## Step 4: Extract the slug from the URL and compare to title

In [7]:
# Peek at how URLs break apart on '/': the slug is always the last non-empty part,
# but some URLs carry an extra publication segment before it.
(
    df['url']
    .str.split('/', expand=True)
    .sample(5)
)

Unnamed: 0,0,1,2,3,4
604,https:,,uxdesign.cc,empathy-a-key-ux-design-skill-that-no-job-desc...,
6272,https:,,medium.com,swlh,income-report-i-getting-my-bearings-on-medium-...
5245,https:,,towardsdatascience.com,u-nets-with-resnet-encoders-and-cross-connecti...,
1732,https:,,medium.com,datadriveninvestor,ddi-weekly-selection-october-28-2019-934113e785c7
3859,https:,,towardsdatascience.com,the-craft-of-intelligent-design-an-attempt-to-...,


In [8]:
# Take the last path segment of each URL and drop everything after its final '-'
# (the trailing id suffix), leaving the slug part of the URL.
# `n` must be passed by keyword: pandas 2.x rejects it as a positional argument.
df['url'].str.rsplit('/').str[-1].str.rsplit('-', n=1, expand=True)[0]

0       a-beginners-guide-to-word-embedding-with-gensi...
1       hands-on-graph-neural-networks-with-pytorch-py...
2                            how-to-use-ggplot2-in-python
3       databricks-how-to-save-files-in-csv-on-your-lo...
4       a-step-by-step-implementation-of-gradient-desc...
                              ...                        
6503    we-vs-i-how-should-you-talk-about-yourself-on-...
6504                     how-donald-trump-markets-himself
6505        content-and-marketing-beyond-mass-consumption
6506    5-questions-all-copywriters-should-ask-clients...
6507    how-to-write-a-damn-good-blog-post-for-your-bu...
Name: 0, Length: 6508, dtype: object

In [9]:
# Intermediate step: each URL split into its '/'-separated parts (one list per row).
df['url'].str.rsplit(pat='/')

0       [https:, , towardsdatascience.com, a-beginners...
1       [https:, , towardsdatascience.com, hands-on-gr...
2       [https:, , towardsdatascience.com, how-to-use-...
3       [https:, , towardsdatascience.com, databricks-...
4       [https:, , towardsdatascience.com, a-step-by-s...
                              ...                        
6503    [https:, , medium.com, better-marketing, we-vs...
6504    [https:, , medium.com, better-marketing, how-d...
6505    [https:, , medium.com, better-marketing, conte...
6506    [https:, , medium.com, better-marketing, 5-que...
6507    [https:, , medium.com, better-marketing, how-t...
Name: url, Length: 6508, dtype: object

In [10]:
# Intermediate step: keep only the final path segment (slug plus trailing id).
df['url'].str.rsplit('/').str.get(-1)

0       a-beginners-guide-to-word-embedding-with-gensi...
1       hands-on-graph-neural-networks-with-pytorch-py...
2               how-to-use-ggplot2-in-python-74ab8adec129
3       databricks-how-to-save-files-in-csv-on-your-lo...
4       a-step-by-step-implementation-of-gradient-desc...
                              ...                        
6503    we-vs-i-how-should-you-talk-about-yourself-on-...
6504        how-donald-trump-markets-himself-3bb9f65520be
6505    content-and-marketing-beyond-mass-consumption-...
6506    5-questions-all-copywriters-should-ask-clients...
6507    how-to-write-a-damn-good-blog-post-for-your-bu...
Name: url, Length: 6508, dtype: object

In [11]:
# Final step: split the last path segment once from the right on '-' and keep the
# left part, which removes the trailing id suffix and leaves the URL slug.
# `n` must be passed by keyword: pandas 2.x rejects it as a positional argument.
df['url'].str.rsplit('/').str[-1].str.rsplit('-', n=1, expand=True)[0]

0       a-beginners-guide-to-word-embedding-with-gensi...
1       hands-on-graph-neural-networks-with-pytorch-py...
2                            how-to-use-ggplot2-in-python
3       databricks-how-to-save-files-in-csv-on-your-lo...
4       a-step-by-step-implementation-of-gradient-desc...
                              ...                        
6503    we-vs-i-how-should-you-talk-about-yourself-on-...
6504                     how-donald-trump-markets-himself
6505        content-and-marketing-beyond-mass-consumption
6506    5-questions-all-copywriters-should-ask-clients...
6507    how-to-write-a-damn-good-blog-post-for-your-bu...
Name: 0, Length: 6508, dtype: object

In [14]:
# Slug taken from the URL: last path segment with its trailing id suffix removed.
# `n` must be passed by keyword: pandas 2.x rejects it as a positional argument.
url_slug = df['url'].str.rsplit('/').str[-1].str.rsplit('-', n=1, expand=True)[0]

# Keep the rows where the URL slug is not exactly the slugified title.
df_temp = df[url_slug != df['title_to_url']]

with pd.option_context(
    "display.min_rows", 20,
    "display.max_rows", 50,
    "display.max_columns", 5,
    "display.max_colwidth", 200,
):
    display(df_temp[cols])

Unnamed: 0,url,title,title_to_url
10,https://towardsdatascience.com/what-i-learned-from-abhishek-thakur-4b905ac0fd55,"<em class=""markup--em markup--h3-em"">What I Learned from (Two-time) Kaggle Grandmaster Abhishek Thakur</em>",em-class-markup-em-markup-h3-em-what-i-learned-from-two-time-kaggle-grandmaster-abhishek-thakur-em
19,https://towardsdatascience.com/faster-training-of-efficient-cnns-657953aa080,Faster Training for Efficient CNNs,faster-training-for-efficient-cnns
20,https://towardsdatascience.com/buyers-beware-fake-product-reviews-are-plaguing-the-internet-cfc599c42b6b,"Buyers beware, fake product reviews are plaguing the internet. How Machine Learning can help to spot them.",buyers-beware-fake-product-reviews-are-plaguing-the-internet-how-machine-learning-can-help-to-spot-them
21,https://towardsdatascience.com/objects-of-desire-a-talk-on-40-gigabytes-of-4chan-pol-r-braincels-and-r-theredpill-a4f4e36a4fad,"Objects of Desire: 40 gigabytes of 4chan/pol, r/Braincels, and r/TheRedPill",objects-of-desire-40-gigabytes-of-4chan-pol-r-braincels-and-r-theredpill
22,https://towardsdatascience.com/how-data-management-practice-enables-a-successful-implementation-of-single-customer-view-part-1-f9f508bddc85,"<strong class=""markup--strong markup--h3-strong"">How data management practice enables a successful implementation of single customer view?</strong>",strong-class-markup-strong-markup-h3-strong-how-data-management-practice-enables-a-successful-implementation-of-single-customer-view-strong
23,https://uxdesign.cc/white-on-black-or-black-on-white-the-pros-and-cons-of-dark-mode-3439e37d6c6c,"<strong class=""markup--strong markup--h3-strong"">White on black or black on white? The pros and cons of Dark Mode</strong>",strong-class-markup-strong-markup-h3-strong-white-on-black-or-black-on-white-the-pros-and-cons-of-dark-mode-strong
24,https://uxdesign.cc/uxdesign-for-social-change-459203b5d3c1,UX for social change: a 5-step app design approach,ux-for-social-change-a-5-step-app-design-approach
25,https://uxdesign.cc/music-is-an-experience-not-a-science-b5c303d51a70,"“Music is an experience, not a science.” — a UX case study",music-is-an-experience-not-a-science-a-ux-case-study
31,https://uxdesign.cc/reflection-point-usability-accessibility-and-ethics-in-user-experience-design-7f95e27b8273,"Reflection point: usability, accessibility, and ethics in UX",reflection-point-usability-accessibility-and-ethics-in-ux
32,https://uxdesign.cc/building-high-quality-hypothesis-for-better-design-decision-3a18a8d00038,Building high-quality hypotheses for better design decisions,building-high-quality-hypotheses-for-better-design-decisions


In [13]:
# (rows, columns) of the mismatch frame: how many articles have a URL slug
# that differs from the slugified title.
df_temp.shape

(1119, 12)