# Data Preprocessing
    * This notebook processes the raw scraped data into normalized data, one category at a time.
    * It then combines all the individual category data into one big CSV file (YouTube_Scraped_PP_final.csv).

In [1]:
import pandas as pd
import re
import string
import ssl
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

import warnings

# Some Python builds refuse the NLTK download because of certificate checks.
# When the private "unverified context" factory exists, install it as the
# default HTTPS context so the downloads below can go through.
# NOTE(review): this turns off certificate verification for the whole kernel.
_create_unverified_https_context = getattr(ssl, '_create_unverified_context', None)
if _create_unverified_https_context is not None:
    ssl._create_default_https_context = _create_unverified_https_context

# Tokenizer models, the stop-word lists and WordNet are needed by the
# preprocessing function further down.
for _nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_nltk_resource)

# Silence every Python warning for the rest of the notebook.
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sabareeswarans/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sabareeswarans/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sabareeswarans/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [26]:
# individual category preprocessing
# Every raw category CSV lives in the same folder; keep that location in one
# constant so it only has to be edited in a single place.
RAW_DATA_DIR = "/Users/sabareeswarans/Desktop/BD_Lab/BD_final/youtube_dataset_custom_scraped"


def _read_raw_category(stem):
    """Read ``<RAW_DATA_DIR>/<stem>_raw.csv`` and return it as a DataFrame."""
    return pd.read_csv(RAW_DATA_DIR + "/" + stem + "_raw.csv")


df_bd = _read_raw_category("bd")
df_dl = _read_raw_category("dl")
df_film = _read_raw_category("film_and_animi")
df_food = _read_raw_category("food")
df_leedcode = _read_raw_category("leedcode")
df_mkbhd = _read_raw_category("mkbd")  # the file on disk is spelled "mkbd"
df_ml = _read_raw_category("ml")

In [27]:
def raw_preprocessing(dataframe, filename,
                      output_dir='/Users/sabareeswarans/Desktop/BD_Lab/BD_final/youtube_dataset_custom_scraped/preprocessed_individual/',
                      text_columns=('title', 'description')):
    """Normalise the text columns of ``dataframe`` in place and save it as CSV.

    Each cell of every column in ``text_columns`` goes through the same steps,
    in this order: lowercase, remove digit runs, remove ASCII punctuation,
    strip surrounding whitespace, tokenize, keep alphabetic tokens only, drop
    English stop words, lemmatize each token as a verb, and re-join the tokens
    with single spaces.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the columns named in ``text_columns``. It is modified in
        place; callers in this notebook rely on that and ignore the return
        value.
    filename : str
        Output file name without the ``.csv`` extension.
    output_dir : str, optional
        Directory the CSV is written to. Defaults to the location this
        notebook has always used.
    text_columns : iterable of str, optional
        Columns to normalise. Defaults to ``('title', 'description')``.

    Returns
    -------
    None
        The frame's shape is printed and the frame is written to
        ``<output_dir>/<filename>.csv`` without the index.
    """
    import os  # local import: only needed to assemble the output path

    # Build the loop-invariant helpers once instead of once per row.
    punctuation_table = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words('english'))
    lem = WordNetLemmatizer()

    def _normalize(text):
        """Apply the whole normalisation pipeline to a single cell."""
        # A missing or non-string cell would otherwise fail on ``.lower()``;
        # treat missing values as empty text and stringify anything else.
        if not isinstance(text, str):
            text = '' if pd.isna(text) else str(text)

        text = text.lower()
        text = re.sub(r'\d+', '', text)
        text = text.translate(punctuation_table)
        text = text.strip()

        tokens = [
            lem.lemmatize(token, "v")
            for token in word_tokenize(text)
            if token.isalpha() and token not in stop_words
        ]
        return ' '.join(tokens)

    for column in text_columns:
        dataframe[column] = dataframe[column].map(_normalize)

    print(dataframe.shape)
    dataframe.to_csv(os.path.join(output_dir, filename + '.csv'), index=False)



In [28]:
# for scraped Big data category
print('No of Null data in Big data category:\n',df_bd.isna().sum())

# Blank out missing descriptions, then discard rows that have no title and
# renumber the remaining rows from 0.
df_bd['description'] = df_bd['description'].fillna(' ')
df_bd = df_bd.dropna(subset=['title']).reset_index(drop=True)
print('Removed all null Values in Big data category:\n',df_bd.isna().sum())

# Normalise title/description in place and write the per-category CSV.
raw_preprocessing(df_bd, 'big_data_pp')
print("Preprocessing done and save under youtube_dataset_custom_scraped/preprocessed_individual !")

No of Null data in Big data category:
 links           0
title           8
description    78
category        0
dtype: int64
Removed all null Values in Big data category:
 links          0
title          0
description    0
category       0
dtype: int64
(568, 4)
Preprocessing done and save under youtube_dataset_custom_scraped/preprocessed_individual !


In [29]:
# for scraped dl_raw category
print('No of Null data in dl_raw category:\n',df_dl.isna().sum())

# Blank out missing descriptions, then discard rows that have no title and
# renumber the remaining rows from 0.
df_dl['description'] = df_dl['description'].fillna(' ')
df_dl = df_dl.dropna(subset=['title']).reset_index(drop=True)
print('Removed all null Values in deep learning category:\n',df_dl.isna().sum())

# Normalise title/description in place and write the per-category CSV.
raw_preprocessing(df_dl, 'deep_learning_pp')
print("Preprocessing of deep_learning done and save under youtube_dataset_custom_scraped/preprocessed_individual !")

No of Null data in dl_raw category:
 links          0
title          0
description    0
category       0
dtype: int64
Removed all null Values in deep learning category:
 links          0
title          0
description    0
category       0
dtype: int64
(596, 10)
Preprocessing of deep_learning done and save under youtube_dataset_custom_scraped/preprocessed_individual !


In [30]:
# for scraped film and animation_raw category
print('No of Null data in film and animation category:\n',df_film.isnull().sum())

# Filling Null Values
df_film['description']=df_film['description'].fillna(' ')
# drop the rows where title value is NAN/ NULL
df_film = df_film.dropna(subset=['title'])
df_film = df_film.reset_index(drop=True)
# Status messages name the category this cell actually handles.
print('Removed all null Values in film and animation category:\n',df_film.isnull().sum())

## Preprocessing - calling the Preprocessor function
raw_preprocessing(df_film,'film_pp')
print("Preprocessing of film and animation done and save under youtube_dataset_custom_scraped/preprocessed_individual !")

No of Null data in film and animation category:
 links           0
title          21
description    31
category        0
dtype: int64
Removed all null Values in deep learning category:
 links          0
title          0
description    0
category       0
dtype: int64
(547, 4)
Preprocessing of film and animation done and save under youtube_dataset_custom_scraped/preprocessed_individual !


In [31]:
# for scraped food category
# Status messages name the category this cell actually handles.
print('No of Null data in food category:\n',df_food.isnull().sum())

# Filling Null Values
df_food['description']=df_food['description'].fillna(' ')
# drop the rows where title value is NAN/ NULL
df_food = df_food.dropna(subset=['title'])
df_food = df_food.reset_index(drop=True)
# Only 'title' and 'description' are cleaned here; any other column in this
# file keeps whatever nulls it had.
print('Removed all null Values in food category:\n',df_food.isnull().sum())

## Preprocessing - calling the Preprocessor function
raw_preprocessing(df_food,'food_pp')
print("Preprocessing of food done and save under youtube_dataset_custom_scraped/preprocessed_individual !")

No of Null data in film and animation category:
 links            0
title            7
description     10
category         0
Unnamed: 4     517
Unnamed: 5     518
Unnamed: 6     518
dtype: int64
Removed all null Values in deep learning category:
 links            0
title            0
description      0
category         0
Unnamed: 4     510
Unnamed: 5     511
Unnamed: 6     511
dtype: int64
(512, 7)
Preprocessing of food done and save under youtube_dataset_custom_scraped/preprocessed_individual !


In [32]:
# for scraped leedcode raw category
# Status messages name the category this cell actually handles.
print('No of Null data in leedcode category:\n',df_leedcode.isnull().sum())
# Filling Null Values
df_leedcode['description']=df_leedcode['description'].fillna(' ')
# drop the rows where title value is NAN/ NULL
df_leedcode = df_leedcode.dropna(subset=['title'])
df_leedcode = df_leedcode.reset_index(drop=True)
print('Removed all null Values in leedcode category:\n',df_leedcode.isnull().sum())

## Preprocessing - calling the Preprocessor function
raw_preprocessing(df_leedcode,'leedcode_pp')
print("Preprocessing of leedcode done and save under youtube_dataset_custom_scraped/preprocessed_individual !")

No of Null data in film and animation category:
 links           0
title           5
description    21
category        0
dtype: int64
Removed all null Values in deep learning category:
 links          0
title          0
description    0
category       0
dtype: int64
(556, 4)
Preprocessing of food done and save under youtube_dataset_custom_scraped/preprocessed_individual !


In [33]:
# for scraped mkbd raw category
# Status messages name the category this cell actually handles.
print('No of Null data in mkbhd category:\n',df_mkbhd.isnull().sum())
# Filling Null Values
df_mkbhd['description']=df_mkbhd['description'].fillna(' ')
# drop the rows where title value is NAN/ NULL
df_mkbhd = df_mkbhd.dropna(subset=['title'])
df_mkbhd = df_mkbhd.reset_index(drop=True)
print('Removed all null Values in mkbhd category:\n',df_mkbhd.isnull().sum())

## Preprocessing - calling the Preprocessor function
raw_preprocessing(df_mkbhd,'mkbhd_pp')
print("Preprocessing of mkbhd done and save under youtube_dataset_custom_scraped/preprocessed_individual !")

No of Null data in film and animation category:
 links           0
title          38
description    40
category        0
dtype: int64
Removed all null Values in deep learning category:
 links          0
title          0
description    0
category       0
dtype: int64
(576, 4)
Preprocessing of food done and save under youtube_dataset_custom_scraped/preprocessed_individual !


In [34]:
# for scraped machine_learning raw category
# Status messages name the category this cell actually handles.
print('No of Null data in machine learning category:\n',df_ml.isnull().sum())
# Filling Null Values
df_ml['description']=df_ml['description'].fillna(' ')
# drop the rows where title value is NAN/ NULL
df_ml = df_ml.dropna(subset=['title'])
df_ml = df_ml.reset_index(drop=True)
print('Removed all null Values in machine learning category:\n',df_ml.isnull().sum())
## Preprocessing - calling the Preprocessor function
raw_preprocessing(df_ml,'ml_pp')
print("Preprocessing of machine learning done and save under youtube_dataset_custom_scraped/preprocessed_individual !")

No of Null data in film and animation category:
 links           0
title           9
description    29
category        0
dtype: int64
Removed all null Values in deep learning category:
 links          0
title          0
description    0
category       0
dtype: int64
(692, 4)
Preprocessing of food done and save under youtube_dataset_custom_scraped/preprocessed_individual !


## Combining all the individual category data together

In [138]:

# Stack the per-category frames on top of each other with a fresh 0..n-1 index.
combine1 = [df_ml, df_dl, df_mkbhd, df_leedcode, df_food, df_bd, df_film]
data_pp1 = pd.concat(combine1, ignore_index=True)
# Keep the four real columns by name. Dropping the leftovers by position
# depends on the concat order and on how many stray columns each raw file
# happens to carry, so it breaks as soon as either changes.
data_pp1 = data_pp1[['links', 'title', 'description', 'category']]

In [139]:
# Display the combined frame (the notebook renders the cell's last expression).
data_pp1

Unnamed: 0,links,title,description,category
0,GwIo3gDZCVQ,Machine Learning Full Course - Learn Machine L...,🔥 Machine Learning Engineer Masters Program (U...,machine learning
1,7eh4d6sabA0,Python Machine Learning Tutorial (Data Science),Python Machine Learning Tutorial - Learn how ...,machine learning
2,PmlRbfSavbI,Stealing Baseball Signs with a Phone (Machine ...,I always sucked at baseball... until now... ok...,machine learning
3,ircAruvnKk,"But what is a neural network? | Chapter 1, Dee...","What are the neurons, why are there layers, an...",machine learning
4,5q87K1WaoFI,Computer Scientist Explains Machine Learning i...,WIRED has challenged computer scientist and Hi...,machine learning
...,...,...,...,...
4139,8_o67Bm-2PU,Jungle Beat: Munki and Trunk | Fun Compilation...,The first in the Jungle Beat Book Collection i...,film and animation
4140,i8MQl7vCkMQ,Oscar Winner Animated Short Film 2022 | The Wi...,,film and animation
4141,N6dX6tzApG,Compilation | Jungle Beat: Munki and Trunk | K...,We're combining some of your favourite episode...,film and animation
4142,_ukPcjh2Em4,The Flower Day Parade/The Big Bunny Blues,Wubbzy wants to contribute to the Flower Day P...,film and animation


In [90]:
# Missing values per column of the combined frame.
data_pp1.isna().sum()

links            0
title           97
description    246
category         0
dtype: int64

In [140]:
# A missing description becomes a single blank; other columns are untouched.
data_pp1 = data_pp1.fillna({'description': ' '})


In [142]:
# Rows without a title are discarded and the index is renumbered from 0.
data_pp1 = data_pp1.dropna(subset=['title']).reset_index(drop=True)


In [143]:
# Re-count missing values per column after the clean-up above.
data_pp1.isna().sum()

links          0
title          0
description    0
category       0
dtype: int64

In [144]:
# Display the combined frame after the null handling.
data_pp1

Unnamed: 0,links,title,description,category
0,GwIo3gDZCVQ,Machine Learning Full Course - Learn Machine L...,🔥 Machine Learning Engineer Masters Program (U...,machine learning
1,7eh4d6sabA0,Python Machine Learning Tutorial (Data Science),Python Machine Learning Tutorial - Learn how ...,machine learning
2,PmlRbfSavbI,Stealing Baseball Signs with a Phone (Machine ...,I always sucked at baseball... until now... ok...,machine learning
3,ircAruvnKk,"But what is a neural network? | Chapter 1, Dee...","What are the neurons, why are there layers, an...",machine learning
4,5q87K1WaoFI,Computer Scientist Explains Machine Learning i...,WIRED has challenged computer scientist and Hi...,machine learning
...,...,...,...,...
4042,8_o67Bm-2PU,Jungle Beat: Munki and Trunk | Fun Compilation...,The first in the Jungle Beat Book Collection i...,film and animation
4043,i8MQl7vCkMQ,Oscar Winner Animated Short Film 2022 | The Wi...,,film and animation
4044,N6dX6tzApG,Compilation | Jungle Beat: Munki and Trunk | K...,We're combining some of your favourite episode...,film and animation
4045,_ukPcjh2Em4,The Flower Day Parade/The Big Bunny Blues,Wubbzy wants to contribute to the Flower Day P...,film and animation


In [146]:
# Display the combined frame.
# NOTE(review): the output recorded for this cell shows lowercased, lemmatized
# text, but no visible cell between In [144] and In [146] performs that step -
# presumably a cell was removed; confirm before relying on Run All.
data_pp1

Unnamed: 0,links,title,description,category
0,GwIo3gDZCVQ,machine learn full course learn machine learn ...,machine learn engineer master program use code...,machine learning
1,7eh4d6sabA0,python machine learn tutorial data science,python machine learn tutorial learn predict ki...,machine learning
2,PmlRbfSavbI,steal baseball sign phone machine learn,always suck baseball ok still probably suck go...,machine learning
3,ircAruvnKk,neural network chapter deep learn,neurons layer math underlie help fund future p...,machine learning
4,5q87K1WaoFI,computer scientist explain machine learn level...,wire challenge computer scientist hide door co...,machine learning
...,...,...,...,...
4042,8_o67Bm-2PU,jungle beat munki trunk fun compilation kid an...,first jungle beat book collection buy today ht...,film and animation
4043,i8MQl7vCkMQ,oscar winner animate short film windshield wiper,,film and animation
4044,N6dX6tzApG,compilation jungle beat munki trunk kid animation,combine favourite episodes bring even jungle b...,film and animation
4045,_ukPcjh2Em4,flower day paradethe big bunny blue,wubbzy want contribute flower day paradewhen e...,film and animation


In [147]:
# Pull out the rows labelled 'food' and display them. Nothing is merged in
# this cell; it only filters the combined frame.

df_f1 = data_pp1.loc[data_pp1['category'].eq('food')]
df_f1

Unnamed: 0,links,title,description,category
2420,Wxdj970RM7M,iconic food every state state favorites,every state iconic food idaho potatoes world f...,food
2421,LeYCwv0j9Q,us vs japan mcdonalds food war,calorie count portion size want find differenc...,food
2422,64CSA7FMzuE,us vs japan wendy food war,calorie count portion size want find differenc...,food
2423,9i4SKHbhbqk,best fast food recipes part one gordon ramsay,fast food recipes do right fry chicken tostada...,food
2424,T4NOt727wqI,comfort foods around world,time much world face lockdown coronavirus pand...,food
...,...,...,...,...
2927,Ga1DcA6BI,tucker may enough food soon,fox news host react president say food shortag...,food
2928,KJpAbGA9Cj,spongebob squarepants movie,wacky underwater star oceanful adventure spong...,food
2929,YBDnVr,food process plant attack,unite state attack food process plant burn wat...,food
2930,nmH88P32Bj,school dance,high school freshman jason bobbe j thompson fi...,food


In [148]:
# Disabled relabelling of a 'food2' category to 'food'.
# NOTE(review): df_food2 is not defined in any visible cell - confirm whether
# this line is still needed or can be removed.
#df_food2['category'] = df_food2['category'].map({'food2': 'food'})

In [151]:
# Display the combined frame.
# NOTE(review): the output recorded for this cell has extra rows, extra
# categories (e.g. 'science') and an 'Unnamed: 0' column that no visible cell
# adds - presumably more data was appended in a cell that is no longer here;
# confirm.
data_pp1

Unnamed: 0.1,links,title,description,category,Unnamed: 0
0,GwIo3gDZCVQ,machine learn full course learn machine learn ...,machine learn engineer master program use code...,machine learning,
1,7eh4d6sabA0,python machine learn tutorial data science,python machine learn tutorial learn predict ki...,machine learning,
2,PmlRbfSavbI,steal baseball sign phone machine learn,always suck baseball ok still probably suck go...,machine learning,
3,ircAruvnKk,neural network chapter deep learn,neurons layer math underlie help fund future p...,machine learning,
4,5q87K1WaoFI,computer scientist explain machine learn level...,wire challenge computer scientist hide door co...,machine learning,
...,...,...,...,...,...
12417,UNpyF58BY,fourier transform visual introduct,anim introduct fourier transform home page htt...,science,8357.0
12418,NBSv_0yHnB0,nyc center space scienc educ,nyc center space scienc educ experienti space ...,science,8358.0
12419,YJjL82-KORA,hubbl telescop show spiral black hole power je...,visit websit http www junglejoel com hubbl spa...,science,8359.0
12420,NSAgLvKOPLQ,model atom timelin,see chemistri video check http socrat org chem...,science,8368.0


In [152]:
# Shuffle the data frame.
# frac=1 returns every row in random order. The fixed random_state makes that
# order identical on every run, so the saved dataset is reproducible.
data_final = data_pp1.sample(frac=1, random_state=42).reset_index(drop=True)

In [153]:
# Display the shuffled frame.
data_final

Unnamed: 0.1,links,title,description,category,Unnamed: 0
0,Co0iR1FjGY,hours vegetarian food challenge korea vlog cook,hello today go h vegetarian food challenge hop...,food,
1,TelY8wdrA,volvo engin factori made manufactur powertrain...,volvo engin factori made manufactur manufactur...,Car Manufacturing,4133.0
2,fL6uJYFrp4,deep learn tutorial,deep learn tutorial su lab ucsd,deep learning,
3,lUmL6sGvaig,japan travel cost japan rail pass japan travel...,japan travel cost japan rail pass japan travel...,vlogs and travel,
4,lp0ur_qXoM,irish peopl tast indian food,thing dessert never confus make sure subscrib ...,food,
...,...,...,...,...,...
12417,CsEWdTR7Oi8,hampi gloriou histori time ii day ii k video,video follow messag maharaja krishnadevraya pl...,biography,
12418,iBFDGKfY7xU,korean food never seen uniqu korean eleven tou...,korean street food challeng http www youtub co...,food,
12419,kReFGDDGn5Y,lord buddha animation film power life,lord buddha power life buddha reveal power lov...,film and animation,
12420,6HLmhxLmRp4,evolut spider man costum yellow spandex nowthi...,subscrib nowthi nerd http go nowth nerd subscr...,biography,


In [154]:
# Number of rows per category; used by the next cell to find rare categories.
value_count=data_final['category'].value_counts()

In [155]:
# Categories with at most two rows are discarded together with their rows.
to_remove = value_count.loc[value_count.le(2)].index
data_final = data_final.loc[~data_final['category'].isin(to_remove)]


In [156]:
# Rows per category after the rare categories were removed.
data_final['category'].value_counts()

food                  1943
dance                 1445
biography             1415
science               1400
vlogs and travel      1352
Car Manufacturing     1330
machine learning       692
deep learning          594
mkbhd                  576
big data               568
Leedcode               556
film and animation     547
Name: category, dtype: int64

In [157]:
# Missing values per column of the shuffled, filtered frame.
data_final.isna().sum()

links             0
title             1
description      20
category          0
Unnamed: 0     9688
dtype: int64

In [158]:
# The 'Unnamed: 0' column is not needed in the final dataset.
data_final = data_final.drop(columns=['Unnamed: 0'])

In [159]:
# A missing description becomes a single blank; other columns are untouched.
data_final = data_final.fillna({'description': ' '})

In [160]:
# (rows, columns) before rows without a title are dropped.
data_final.shape

(12418, 4)

In [161]:
# Rows without a title are discarded and the index is renumbered from 0.
data_final = data_final.dropna(subset=['title']).reset_index(drop=True)

In [162]:
# (rows, columns) after rows without a title were dropped.
data_final.shape

(12417, 4)

In [163]:
# Final check of missing values per column.
data_final.isna().sum()

links          0
title          0
description    0
category       0
dtype: int64

In [164]:
# Write the final dataset to disk. The row index is written as well (pandas'
# default, spelled out here), so the file starts with an unnamed index column.
# NOTE(review): the per-category files are saved without the index - confirm
# whether readers of this file expect that extra column.
data_final.to_csv(
    '/Users/sabareeswarans/Desktop/BD_Lab/BD_final/Custom_dataset/YouTube_Scraped_PP_final.csv',
    index=True,
)