In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


## Loading the Dataset

In [2]:
raw_data = pd.read_csv('./podcasts.csv')

# information about the dataset
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121175 entries, 0 to 121174
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   uuid         121175 non-null  object
 1   title        121173 non-null  object
 2   image        121175 non-null  object
 3   description  119832 non-null  object
 4   language     121175 non-null  object
 5   categories   121175 non-null  object
 6   website      120005 non-null  object
 7   author       118678 non-null  object
 8   itunes_id    121175 non-null  int64 
dtypes: int64(1), object(8)
memory usage: 8.3+ MB


## Selecting only the title, paper text column

In [3]:
df = raw_data[['title','description']].iloc[:10]

In [4]:
# checking for null values
df.isnull().sum()/df.shape[0] * 100

title          0.0
description    0.0
dtype: float64

In [5]:
# dropping the missing values 
df.dropna(axis=0,how='any',inplace=True)

In [6]:
df.isna().sum()

title          0
description    0
dtype: int64

# Removing the STOPWORDS after tokenization

In [7]:
# tokenizer
from nltk.tokenize import word_tokenize

# IMPORTING THE STOPWORDS FOR ENGLISH LANGUAGE
from nltk.corpus import stopwords
stop_words = list(set(stopwords.words('english')))

In [8]:
def filter_stopwords(x):
    words = word_tokenize(x)
    token = []
    for i in words:
        if i not in stop_words:
            token.append(i)
    return token

df['f_tokens']=df.description.apply(filter_stopwords)

# Lemmatization tokens in base form with WORDNET Nltk tool

In [9]:
df.columns

Index(['title', 'description', 'f_tokens'], dtype='object')

In [10]:
from nltk.stem import WordNetLemmatizer
lemmatizer =  WordNetLemmatizer()

def lema(x):
    lemas = []
    for i in x:
        lemas.append(lemmatizer.lemmatize(i))
    return lemas
df['lemma'] = df.f_tokens.apply(lema)

In [11]:
df.head()

Unnamed: 0,title,description,f_tokens,lemma
0,"Ecommerce Conversations, by Practical Ecommerce",Listen in as the Practical Ecommerce editorial...,"[Listen, Practical, Ecommerce, editorial, staf...","[Listen, Practical, Ecommerce, editorial, staf..."
1,Eat Sleep Code Podcast,On the show we’ll be talking to passionate peo...,"[On, show, ’, talking, passionate, people, wid...","[On, show, ’, talking, passionate, people, wid..."
2,SoundtrackAlley,A podcast about soundtracks and movies from my...,"[A, podcast, soundtracks, movies, childhood, b...","[A, podcast, soundtrack, movie, childhood, bey..."
3,The Tech M&A Podcast,The Tech M&A Podcast pulls from the best of th...,"[The, Tech, M, &, A, Podcast, pulls, best, Tec...","[The, Tech, M, &, A, Podcast, pull, best, Tech..."
4,"The Tech Informist - For fans of Apple, Google...",The tech news show with two guys shooting the ...,"[The, tech, news, show, two, guys, shooting, b...","[The, tech, news, show, two, guy, shooting, br..."


In [12]:
print(df.description.iloc[1])
print("\n")
print(df.lemma.iloc[1])

On the show we’ll be talking to passionate people about a wide range of developer related topics.

Part of the Telerik Developer Network , Eat Sleep Code is the Official Telerik Podcast. At Telerik we believe in giving back to the developer community and we do this by writing, speaking and staying in touch with the software development community.





['On', 'show', '’', 'talking', 'passionate', 'people', 'wide', 'range', 'developer', 'related', 'topic', '.', 'Part', 'Telerik', 'Developer', 'Network', ',', 'Eat', 'Sleep', 'Code', 'Official', 'Telerik', 'Podcast', '.', 'At', 'Telerik', 'believe', 'giving', 'back', 'developer', 'community', 'writing', ',', 'speaking', 'staying', 'touch', 'software', 'development', 'community', '.']


# Lemmatizing with SpaCy tool

In [13]:
import sys
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load(disable=['parser','ner'])

In [14]:
def spacy_lema(x):
    lemas = []
    for i in x:
        lemas.append(nlp(i))
    return lemas

df['spacy'] = df.f_tokens.apply(spacy_lema)

In [15]:
print(df.description.iloc[1])
print("\n")
print(df.lemma.iloc[1])
print("\n")
print(df.spacy.iloc[1])

On the show we’ll be talking to passionate people about a wide range of developer related topics.

Part of the Telerik Developer Network , Eat Sleep Code is the Official Telerik Podcast. At Telerik we believe in giving back to the developer community and we do this by writing, speaking and staying in touch with the software development community.





['On', 'show', '’', 'talking', 'passionate', 'people', 'wide', 'range', 'developer', 'related', 'topic', '.', 'Part', 'Telerik', 'Developer', 'Network', ',', 'Eat', 'Sleep', 'Code', 'Official', 'Telerik', 'Podcast', '.', 'At', 'Telerik', 'believe', 'giving', 'back', 'developer', 'community', 'writing', ',', 'speaking', 'staying', 'touch', 'software', 'development', 'community', '.']


[On, show, ’, talking, passionate, people, wide, range, developer, related, topics, ., Part, Telerik, Developer, Network, ,, Eat, Sleep, Code, Official, Telerik, Podcast, ., At, Telerik, believe, giving, back, developer, community, writing, ,, speaking, stay