### Load the data

In [18]:
import pandas as pd
import json

# Load the trending-videos CSV into a DataFrame.
df = pd.read_csv("../datasets/USvideos.csv")

# Load the category JSON used to translate category_id into a category name.
# The encoding is passed explicitly so the read does not depend on the
# platform's default locale encoding.
with open("../datasets/US_category_id.json", encoding="utf-8") as f:
    category_data = json.load(f)

# Build {id: title}; each id is converted with int() before being used as a key.
category_mapping = {
    int(item["id"]): item["snippet"]["title"]
    for item in category_data["items"]
}

# Add a readable category column. Ids that are not keys of the mapping
# become NaN in the new column.
df["category"] = df["category_id"].map(category_mapping)


In [19]:
# Preview the first five rows (five is the default row count for head()).
df.head()

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,category
0,2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13T17:13:01.000Z,SHANtell martin,748374,57527,2966,15954,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...,People & Blogs
1,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13T07:30:00.000Z,"last week tonight trump presidency|""last week ...",2418783,97185,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John...",Entertainment
2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12T19:05:24.000Z,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",3191434,146033,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► http...,Comedy
3,puqaWrEC7tY,17.14.11,Nickelback Lyrics: Real or Fake?,Good Mythical Morning,24,2017-11-13T11:00:04.000Z,"rhett and link|""gmm""|""good mythical morning""|""...",343168,10172,666,2146,https://i.ytimg.com/vi/puqaWrEC7tY/default.jpg,False,False,False,Today we find out if Link is a Nickelback amat...,Entertainment
4,d380meD0W0M,17.14.11,I Dare You: GOING BALD!?,nigahiga,24,2017-11-12T18:01:41.000Z,"ryan|""higa""|""higatv""|""nigahiga""|""i dare you""|""...",2095731,132235,1989,17518,https://i.ytimg.com/vi/d380meD0W0M/default.jpg,False,False,False,I know it's been a while since we did this sho...,Entertainment


### Clean and format data

In [21]:
# Parse the timestamp columns. trending_date is given an explicit
# year.day.month format string ('%y.%d.%m').
df['publish_time'] = pd.to_datetime(df['publish_time'])
df['trending_date'] = pd.to_datetime(df['trending_date'], format='%y.%d.%m')

# Keep the first row for each (video_id, trending_date) pair, then drop rows
# containing any missing value. The result is reassigned instead of using
# inplace=True so the step is an explicit, chainable transformation.
# NOTE(review): dropna() without `subset` removes a row when ANY column is
# missing -- confirm that losing those rows is intended.
df = (
    df
    .drop_duplicates(subset=['video_id', 'trending_date'])
    .dropna()
)

### Explore DataFrame

In [22]:
# Print the frame's dimensions and column labels on separate lines, then
# show the per-column dtype/non-null report and the summary statistics.
print(df.shape, df.columns, sep="\n")
df.info()
df.describe()


(40330, 17)
Index(['video_id', 'trending_date', 'title', 'channel_title', 'category_id',
       'publish_time', 'tags', 'views', 'likes', 'dislikes', 'comment_count',
       'thumbnail_link', 'comments_disabled', 'ratings_disabled',
       'video_error_or_removed', 'description', 'category'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
Index: 40330 entries, 0 to 40948
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype              
---  ------                  --------------  -----              
 0   video_id                40330 non-null  object             
 1   trending_date           40330 non-null  datetime64[ns]     
 2   title                   40330 non-null  object             
 3   channel_title           40330 non-null  object             
 4   category_id             40330 non-null  int64              
 5   publish_time            40330 non-null  datetime64[ns, UTC]
 6   tags                    40330 non-null  object         

Unnamed: 0,trending_date,category_id,views,likes,dislikes,comment_count
count,40330,40330.0,40330.0,40330.0,40330.0,40330.0
mean,2018-02-27 09:58:12.526654976,19.942276,2370795.0,74621.01,3508.275,8289.024
min,2017-11-14 00:00:00,1.0,549.0,0.0,0.0,0.0
25%,2018-01-04 00:00:00,17.0,246627.0,5616.0,204.0,623.0
50%,2018-02-26 00:00:00,24.0,688290.0,18351.5,635.5,1873.0
75%,2018-04-24 00:00:00,25.0,1831572.0,55625.5,1945.75,5784.75
max,2018-06-14 00:00:00,43.0,225211900.0,5613827.0,1643059.0,1228655.0
std,,7.596013,7432303.0,229735.7,23188.31,34335.67
