In [1]:
import pandas as pd
import glob
import json

Concatenate all data into one DataFrame

In [2]:
# Collect every CSV file in the data directory.
path = r'../Files/Data/'
# glob returns matches in arbitrary, OS-dependent order; sorting keeps the
# row order (and therefore the RangeIndex) of the combined frame reproducible.
filenames = sorted(glob.glob(path + "*.csv"))

# Fail early with an explicit message rather than letting pd.concat raise
# its generic "No objects to concatenate" error on an empty list.
if not filenames:
    raise FileNotFoundError(f"No CSV files found in {path!r}")

# Read each file into its own DataFrame.
dfs = [pd.read_csv(filename) for filename in filenames]

# Concatenate all data into one DataFrame with a fresh 0..n-1 index.
big_frame = pd.concat(dfs, ignore_index=True)

In [3]:
# Summarise the combined frame: row count, column names, non-null counts and dtypes.
big_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120746 entries, 0 to 120745
Data columns (total 17 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   video_id                120746 non-null  object
 1   trending_date           120746 non-null  object
 2   title                   120746 non-null  object
 3   channel_title           120746 non-null  object
 4   category_id             120746 non-null  int64 
 5   publish_time            120746 non-null  object
 6   tags                    120746 non-null  object
 7   views                   120746 non-null  int64 
 8   likes                   120746 non-null  int64 
 9   dislikes                120746 non-null  int64 
 10  comment_count           120746 non-null  int64 
 11  thumbnail_link          120746 non-null  object
 12  comments_disabled       120746 non-null  bool  
 13  ratings_disabled        120746 non-null  bool  
 14  video_error_or_removed  120746 non-n

Clean the tags column and convert it to a list of tags

In [4]:
# Strip double and single quote characters from inside every tag string.
# Both calls must go through the `.str` accessor: a bare Series.replace("'", '')
# only replaces cells whose ENTIRE value equals "'", so it would leave the
# single quotes embedded in the tag strings untouched.
# regex=False makes the literal-character match explicit.
big_frame["tags"] = (
    big_frame["tags"]
    .str.replace('"', '', regex=False)
    .str.replace("'", '', regex=False)
)

In [5]:
# Break each pipe-delimited tag string into a Python list of individual tags.
tag_strings = big_frame["tags"]
big_frame["tags"] = tag_strings.str.split("|")

Change category_id into human-readable category string

In [6]:
# Load the category metadata file into a plain Python structure.
with open("../Files/Category/US_category_id.json") as category_file:
    category = json.load(category_file)

# Build a lookup from each item's id to its human-readable title.
# The ids are kept exactly as they appear in the JSON (strings).
category_id = dict(
    (item['id'], item['snippet']['title'])
    for item in category['items']
)

# Display the lookup table as the cell output.
category_id

{'1': 'Film & Animation',
 '2': 'Autos & Vehicles',
 '10': 'Music',
 '15': 'Pets & Animals',
 '17': 'Sports',
 '18': 'Short Movies',
 '19': 'Travel & Events',
 '20': 'Gaming',
 '21': 'Videoblogging',
 '22': 'People & Blogs',
 '23': 'Comedy',
 '24': 'Entertainment',
 '25': 'News & Politics',
 '26': 'Howto & Style',
 '27': 'Education',
 '28': 'Science & Technology',
 '29': 'Nonprofits & Activism',
 '30': 'Movies',
 '31': 'Anime/Animation',
 '32': 'Action/Adventure',
 '33': 'Classics',
 '34': 'Comedy',
 '35': 'Documentary',
 '36': 'Drama',
 '37': 'Family',
 '38': 'Foreign',
 '39': 'Horror',
 '40': 'Sci-Fi/Fantasy',
 '41': 'Thriller',
 '42': 'Shorts',
 '43': 'Shows',
 '44': 'Trailers'}

In [7]:
# Translate the numeric category ids into readable names.
# The lookup dictionary built in the previous cell is keyed by strings,
# so the integer ids are cast to str before the replacement.
# Ids without an entry in the dictionary are left as their numeric string.
category_keys = big_frame['category_id'].astype(str)
big_frame['category'] = category_keys.replace(category_id)

# Preview the first rows with the new column.
big_frame.head()

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,Country,category
0,n1WpP7iowLc,17.14.11,Eminem - Walk On Water (Audio) ft. Beyoncé,EminemVEVO,10,2017-11-10T17:00:03.000Z,"[Eminem, Walk, On, Water, Aftermath/Shady/Inte...",17158579,787425,43420,125882,https://i.ytimg.com/vi/n1WpP7iowLc/default.jpg,False,False,False,Eminem's new track Walk on Water ft. Beyoncé i...,CA,Music
1,0dBIkQ4Mz1M,17.14.11,PLUSH - Bad Unboxing Fan Mail,iDubbbzTV,23,2017-11-13T17:00:00.000Z,"[plush, bad unboxing, unboxing, fan mail, idub...",1014651,127794,1688,13030,https://i.ytimg.com/vi/0dBIkQ4Mz1M/default.jpg,False,False,False,STill got a lot of packages. Probably will las...,CA,Comedy
2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12T19:05:24.000Z,"[racist superman, rudy, mancuso, king, bach, r...",3191434,146035,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► http...,CA,Comedy
3,d380meD0W0M,17.14.11,I Dare You: GOING BALD!?,nigahiga,24,2017-11-12T18:01:41.000Z,"[ryan, higa, higatv, nigahiga, i dare you, idy...",2095828,132239,1989,17518,https://i.ytimg.com/vi/d380meD0W0M/default.jpg,False,False,False,I know it's been a while since we did this sho...,CA,Entertainment
4,2Vv-BfVoq4g,17.14.11,Ed Sheeran - Perfect (Official Music Video),Ed Sheeran,10,2017-11-09T11:04:14.000Z,"[edsheeran, ed sheeran, acoustic, live, cover,...",33523622,1634130,21082,85067,https://i.ytimg.com/vi/2Vv-BfVoq4g/default.jpg,False,False,False,🎧: https://ad.gt/yt-perfect\n💰: https://atlant...,CA,Music


Convert trending_date to datetime

In [8]:
# Parse the trending date strings into datetimes. The raw values use a
# two-digit year followed by day and then month (e.g. 17.14.11).
big_frame['trending_date'] = pd.to_datetime(
    big_frame['trending_date'],
    format='%y.%d.%m',
)

Split trending_date into Year, Month, Day and Weekday columns

In [9]:
# Datetime accessor for the already-parsed trending date column.
date_col = big_frame['trending_date'].dt

# Numeric calendar parts become the capitalised columns Year, Month and Day.
for part in ("year", "month", "day"):
    big_frame[part.capitalize()] = getattr(date_col, part)

# Weekday is stored as the day name rather than a number.
big_frame['Weekday'] = date_col.day_name()

In [10]:
# Parse the publish timestamps; errors='coerce' turns any value that cannot
# be parsed into NaT instead of raising.
publish_raw = big_frame['publish_time']
big_frame['publish_time'] = pd.to_datetime(publish_raw, errors='coerce')

In [11]:
# List the column labels currently present on the frame.
big_frame.columns

Index(['video_id', 'trending_date', 'title', 'channel_title', 'category_id',
       'publish_time', 'tags', 'views', 'likes', 'dislikes', 'comment_count',
       'thumbnail_link', 'comments_disabled', 'ratings_disabled',
       'video_error_or_removed', 'description', 'Country', 'category', 'Year',
       'Month', 'Day', 'Weekday'],
      dtype='object')

In [12]:
# Columns that are removed before the frame is exported.
unused_columns = [
    'video_id',
    'title',
    'channel_title',
    'category_id',
    'thumbnail_link',
    'comments_disabled',
    'ratings_disabled',
    'video_error_or_removed',
    'description',
]

# Rebind the name to the trimmed frame instead of mutating in place.
big_frame = big_frame.drop(columns=unused_columns)

In [13]:
# Write the processed frame to disk without the positional index column.
output_path = '../Files/data.csv'
big_frame.to_csv(output_path, index=False)