In [2]:
import pandas as pd

# Read the episode table from disk, report its dimensions, then preview the first rows.
df = pd.read_csv(filepath_or_buffer="total_df.csv")

print(f'data shape: {df.shape}')
df.head(n=5)

data shape: (14127, 5)


Unnamed: 0,ep_id,ep_name,show_name,description,label
0,2QwU0zzueDDoi0fN3Kf0HR,#57 - Prof. Melanie Mitchell - Why AI is harde...,Machine Learning Street Talk,"Since its beginning in the 1950s, the field of...",machine learning
1,1ir7MrWUBwUI6AE7Na5diX,"#56 - Dr. Walid Saba, Gadi Singer, Prof. J. Ma...",Machine Learning Street Talk,It has been over three decades since the stati...,machine learning
2,1ACeGKFk4syDdcRuuuSZP7,#55 Self-Supervised Vision Models (Dr. Ishan M...,Machine Learning Street Talk,Dr. Ishan Misra is a Research Scientist at Fac...,machine learning
3,1vuc4azX8Mb0sl0lKOmxZi,#54 Gary Marcus and Luis Lamb - Neurosymbolic ...,Machine Learning Street Talk,"Professor Gary Marcus is a scientist, best-sel...",machine learning
4,6DrRmtpKKpH3yYJ6yPoMR2,#53 Quantum Natural Language Processing - Prof...,Machine Learning Street Talk,"Bob Coercke is a celebrated physicist, he's be...",machine learning


# Clean data
- Check for null values
- Check for duplicates
- Check data stats

In [3]:
# Count the missing values in every column.
df.isna().sum()

ep_id            0
ep_name          0
show_name        0
description    152
label            0
dtype: int64

In [4]:
# Keep only the rows that have a description.
df_notnull = df.dropna(axis='index', subset=['description'])

# keep=False flags every member of a duplicate group (not just the repeats),
# so this is the number of rows involved in any duplication.
df_notnull.duplicated(keep=False).sum()

520

In [5]:
# Remove fully duplicated rows; the first occurrence of each is kept (keep='first' is the default).
df_notnull_notdup = df_notnull.drop_duplicates()

In [6]:
# Class balance: number of episodes per label after cleaning.
df_notnull_notdup['label'].value_counts()

kid                 1893
lifestyle           1653
business            1516
crime               1483
politics            1401
comedy              1348
sport               1238
culture             1183
machine learning    1037
cooking              963
Name: label, dtype: int64

# Extract URLs
> 🤔 **Question:** Can the tokenizers in Sklearn, TensorFlow, or a pre-trained 🤗 Transformers model tokenize a URL as `<oov>`?<br>
> → If they can, the steps below in this section can be skipped.

**Todos**:
1. Extract full url
    - *Test*: check whether out-of-the-box tokenizers can mask a URL as `<oov>` or `[unk]`. We need to mask URLs because splitting a URL into tokens would not make sense. *Add them to the special tokens set to ensure they won’t be split by the tokenization process.*
    - If ✔ , use them
    - If ✖ , we need to mask the URLs ourselves
2. Extract web domain (optional)
    - Just to see which websites are mentioned most often in the descriptions
3. Filter out meaningless descriptions *(Is this step needed?)*

## Test tokenizers
- Check whether they can mask a URL as an unknown token.
    - Sklearn
    - Tensorflow
    - 🤗 Transformer

### 0) Detect rows containing url

In [7]:
import re
import numpy as np

# URL pattern used for both extraction and masking.
# Written as a raw string so that `\.`, `\d` and `\w` reach the regex engine
# untouched instead of being (invalid) Python string escapes.
#
# Shape of the pattern: optional `http://` / `https://` scheme, then a host
# (a dotted name, or what look like IPv4 / IPv6 literals), an optional port,
# and zero or more `/segment` path parts made of `[\w.-]` characters.
# Known limitations visible in the pattern itself:
#   * the host character class is lowercase-only, so a host containing
#     uppercase letters is matched only from its first lowercase run;
#   * query strings and fragments (`?a=b`, `#frag`) are not part of the match;
#   * the scheme is optional, so any `word.word` text can match.
regex_url = r"((?:https?://)?(?:(?:www\.)?(?:[\da-z\.-]+)\.(?:[a-z]{2,6})|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|(?:(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))(?::[0-9]{1,4}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])?(?:/[\w\.-]*)*/?)"


def extract_url_v2(desc: str):
    '''
    Extract every URL-like substring from a description.

    Parameters
    ----------
    desc : str
        Free-text episode description.

    Returns
    -------
    list of str or float
        All non-overlapping matches of ``regex_url`` in order of appearance,
        or ``np.nan`` when the description contains no match (NaN lets the
        caller use ``isnull`` / ``dropna`` on the resulting Series).
    '''
    found = re.findall(regex_url, desc)
    return found if found else np.nan


# Superseded by extract_url_v2 and kept for reference only: the first version
# matched just the URLs that carry an explicit http(s):// scheme.
#
# def extract_url(desc: str):
#     '''
#         Extract full url from description
#     '''
#     extracted = re.findall(r"https?://[^\s]+", desc)
#     if len(extracted) !=0 :
#         return extracted
#     return np.nan  # If not have url, return NaN
def extract_web(desc:str):
    '''
    Extract the scheme-plus-host part of every http(s) URL in a description.

    The match starts at ``http://`` or ``https://`` and continues over word
    characters, ``-``, ``.`` and ``%XX`` escapes, so it stops at the first
    ``/``, ``?`` or whitespace (e.g. ``https://anchor.fm/show`` yields
    ``https://anchor.fm``).

    Parameters
    ----------
    desc : str
        Free-text episode description.

    Returns
    -------
    list of str or float
        The matches in order of appearance, or ``np.nan`` when there are none.
    '''
    # Raw string: `\w` and `\d` must reach the regex engine, not be treated
    # as (invalid) Python string escapes.
    found = re.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', desc)
    return found if found else np.nan

# Run the extractors on the description column directly. This avoids building
# a row object for every record (DataFrame.apply with axis=1) just to read one
# field. Each result is a Series aligned with df_notnull_notdup's index that
# holds a list of matches, or NaN where nothing was found.
web_from_desc = df_notnull_notdup['description'].apply(extract_web)
url_from_desc = df_notnull_notdup['description'].apply(extract_url_v2)

In [8]:
# Extracted URL samples: only the descriptions where at least one URL was found.
url_from_desc[url_from_desc.notna()]

0        [https://www.youtube.com/watch, https://www.yo...
1        [https://www.linkedin.com/in/gadi-singer/, htt...
2        [https://arxiv.org/abs/1603.08561, https://arx...
6        [https://youtu.be/J0p_thJJnoo, https://whimsic...
7        [https://whimsical.com/mar-26th-christian-szeg...
                               ...                        
14106    [fire...these, romisesrecoverycenters.com, www...
14114                      [ear.com, crushingyourfear.com]
14117                                  [rushingtheday.com]
14118               [breakfreeacademy.com, omaskeenan.com]
14123                                       [hillmeat.com]
Length: 7524, dtype: object

In [9]:
# Share of descriptions for which no URL was extracted (NaN entries).
notHaveUrl = url_from_desc.isna().mean()

# Flatten the per-description lists into one entry per mention, then rank the sites.
top_websites = web_from_desc.explode().value_counts().head(10)

print(f'Number of descriptions that not have url in description: {np.round(notHaveUrl * 100, 3)} %\n')
print(f'Top 10 websites mentioned most often in Podcast description :\n{top_websites}')

Number of descriptions that not have url in description: 45.14 %

Top 10 websites mentioned most often in Podcast description :
https://anchor.fm            2464
https://twitter.com          1140
https://amzn.to               762
https://www.instagram.com     605
https://www.facebook.com      484
https://art19.com             474
https://www.patreon.com       426
https://www.youtube.com       411
https://redcircle.com         292
https://ronakblog.com         278
dtype: int64


In [10]:
# Boolean mask (aligned with url_from_desc's index): True where at least one URL was extracted.
hasUrl = url_from_desc.notna()

### 1) Test 🤗Tokenizer

In [11]:
from IPython.display import clear_output

! pip install transformers[sentencepiece]
clear_output()

In [12]:
from transformers import BertTokenizer

# Load the tokenizer that belongs to the chosen checkpoint.
checkpoint = "bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(checkpoint)

# Wipe whatever the loading step printed to the cell output.
clear_output()

In [13]:
# Probe sentence containing a literal [UNK] marker and a full URL, to see
# how the tokenizer treats each of them.
sentence = "The field of artificial in 1976 [UNK] has educated in https://www.youtube.com/watch?v=A8m1Oqz2HKc"

tokens = tokenizer.tokenize(sentence)
ids = tokenizer.convert_tokens_to_ids(tokens)

# Show the raw text, its tokens, and the matching vocabulary ids, in that order.
for stage in (sentence, tokens, ids):
    print(stage)

The field of artificial in 1976 [UNK] has educated in https://www.youtube.com/watch?v=A8m1Oqz2HKc
['The', 'field', 'of', 'artificial', 'in', '1976', '[UNK]', 'has', 'educated', 'in', 'https', ':', '/', '/', 'www', '.', 'you', '##tub', '##e', '.', 'com', '/', 'watch', '?', 'v', '=', 'A', '##8', '##m', '##1', '##O', '##q', '##z', '##2', '##H', '##K', '##c']
[1109, 1768, 1104, 8246, 1107, 2402, 100, 1144, 4512, 1107, 18630, 131, 120, 120, 7001, 119, 1128, 25098, 1162, 119, 3254, 120, 2824, 136, 191, 134, 138, 1604, 1306, 1475, 2346, 4426, 1584, 1477, 3048, 2428, 1665]


In [14]:
# Pair every special token of the tokenizer with its vocabulary id.
special_tokens = tokenizer.all_special_tokens
special_ids = tokenizer.all_special_ids
list(zip(special_tokens, special_ids))

[('[UNK]', 100), ('[SEP]', 102), ('[PAD]', 0), ('[CLS]', 101), ('[MASK]', 103)]

#### Result: ✖
The 🤗 tokenizer cannot mask a URL as `[UNK]`, so we need to replace URLs ourselves.

In [None]:
# Replace every regex_url match in the description with the tokenizer's
# unknown token. `.assign` returns a new frame instead of writing a column
# into a frame that was itself derived by dropna/drop_duplicates, which can
# raise SettingWithCopyWarning; re-running this cell gives the same result.
df_notnull_notdup = df_notnull_notdup.assign(
    desc_nonUrl=lambda frame: frame['description'].apply(
        lambda text: re.sub(regex_url, "[UNK]", text)
    )
)

The outcome is not perfect: some URLs are not completely detected.
> 🤔 Is there a regex that can match every URL format? 

In [16]:
# Show the first three URL-containing descriptions before and after masking.
# The preview is bounded by position (.head) rather than by waiting for index
# label 2: if that label were missing from the filtered frame, the loop would
# print every row.
preview = df_notnull_notdup.loc[hasUrl, ['description', 'desc_nonUrl']].head(3)
for _, row in preview.iterrows():
    print(f"original : {row['description']}")
    print(f"after sub: {row['desc_nonUrl']}\n")

original : Since its beginning in the 1950s, the field of artificial intelligence has vacillated between periods of optimistic predictions and massive investment and periods of disappointment, loss of confidence, and reduced funding. Even with today’s seemingly fast pace of AI breakthroughs, the development of long-promised technologies such as self-driving cars, housekeeping robots, and conversational companions has turned out to be much harder than many people expected.  Professor Melanie Mitchell thinks one reason for these repeating cycles is our limited understanding of the nature and complexity of intelligence itself. YT vid- https://www.youtube.com/watch?v=A8m1Oqz2HKc Main show kick off [00:26:51]  Panel: Dr. Tim Scarfe, Dr. Keith Duggar, Letitia Parcalabescu (https://www.youtube.com/c/AICoffeeBreak/)
after sub: Since its beginning in the 1950s, the field of artificial intelligence has vacillated between periods of optimistic predictions and massive investment and periods of dis

Now our data is ready

In [17]:
# Preprocessed data: first five rows, including the masked-description column.
df_notnull_notdup.head(n=5)

Unnamed: 0,ep_id,ep_name,show_name,description,label,desc_nonUrl
0,2QwU0zzueDDoi0fN3Kf0HR,#57 - Prof. Melanie Mitchell - Why AI is harde...,Machine Learning Street Talk,"Since its beginning in the 1950s, the field of...",machine learning,"Since its beginning in the 1950s, the field of..."
1,1ir7MrWUBwUI6AE7Na5diX,"#56 - Dr. Walid Saba, Gadi Singer, Prof. J. Ma...",Machine Learning Street Talk,It has been over three decades since the stati...,machine learning,It has been over three decades since the stati...
2,1ACeGKFk4syDdcRuuuSZP7,#55 Self-Supervised Vision Models (Dr. Ishan M...,Machine Learning Street Talk,Dr. Ishan Misra is a Research Scientist at Fac...,machine learning,Dr. Ishan Misra is a Research Scientist at Fac...
3,1vuc4azX8Mb0sl0lKOmxZi,#54 Gary Marcus and Luis Lamb - Neurosymbolic ...,Machine Learning Street Talk,"Professor Gary Marcus is a scientist, best-sel...",machine learning,"Professor Gary Marcus is a scientist, best-sel..."
4,6DrRmtpKKpH3yYJ6yPoMR2,#53 Quantum Natural Language Processing - Prof...,Machine Learning Street Talk,"Bob Coercke is a celebrated physicist, he's be...",machine learning,"Bob Coercke is a celebrated physicist, he's be..."


In [18]:
# Persist the cleaned table; the index is not written to the file.
df_notnull_notdup.to_csv(path_or_buf='df_notnull_notdup.csv', index=False)

## (optional) Filter meaningless descriptions

In [132]:
from typing import List, Optional, Sequence, Union


def detect_almost_dup(desc: Union[str, Sequence[str]], other: Optional[str] = None) -> float:
    """
    Score how similar two strings are by comparing them position by position.

    The score is the number of positions (within the shorter string's length)
    at which both strings hold the same character, divided by that length.

    Parameters
    ----------
    desc : str or sequence of two str
        Either the first string (when ``other`` is given) or a pair
        ``[first, second]`` holding both strings.
    other : str, optional
        The second string. Omit it when ``desc`` is already a pair.

    Returns
    -------
    float
        Fraction in ``[0, 1]``. When the shorter string is empty the result is
        ``1.0`` if both strings are empty and ``0.0`` otherwise.

    Raises
    ------
    TypeError
        If a single string is passed without ``other``.
    ValueError
        If ``desc`` is a sequence that does not contain exactly two items.
    """
    if other is None:
        if isinstance(desc, str):
            raise TypeError("detect_almost_dup needs two strings or a pair of strings")
        first, second = desc
    else:
        first, second = desc, other

    min_len = min(len(first), len(second))
    if min_len == 0:
        # Nothing to compare position-wise; avoid dividing by zero.
        return 1.0 if first == second else 0.0

    # zip stops at the shorter string, i.e. after exactly min_len positions.
    matches = sum(a == b for a, b in zip(first, second))
    return matches / min_len


detect_almost_dup("This exp", "This ep")

0.8571428571428571

In [113]:
# Tets: groupby -> apply
# For meaningless descriptions of a show detection

x = pd.DataFrame({
    'gender':['m','m','f','f','o','o','o'],
    'grade':[2,6,10,20,90,90,200],
    'age':[10,20,50,20,90,60,100]
})

def test(x):
    print(x)
    print(type(x))
    return x

y = x.groupby('gender').apply(test)

  gender  grade  age
2      f     10   50
3      f     20   20
<class 'pandas.core.frame.DataFrame'>
  gender  grade  age
0      m      2   10
1      m      6   20
<class 'pandas.core.frame.DataFrame'>
  gender  grade  age
4      o     90   90
5      o     90   60
6      o    200  100
<class 'pandas.core.frame.DataFrame'>
