# Import Libraries

In [1]:
import pandas as pd
from sqlalchemy import create_engine

import pprint
import psycopg2

In [2]:
# Import Postgress credentials from config.py file 
from database_credentials import username
from database_credentials import password

# Data Preprocessing

In [3]:
# Import CSV files
csvfile = "../DataFiles/USVideos.csv"
us_videos = pd.read_csv(csvfile)


In [4]:
us_videos.dtypes

video_id                  object
trending_date             object
title                     object
channel_title             object
category_id                int64
publish_time              object
tags                      object
views                      int64
likes                      int64
dislikes                   int64
comment_count              int64
thumbnail_link            object
comments_disabled           bool
ratings_disabled            bool
video_error_or_removed      bool
description               object
dtype: object

### CSV File cleanup

In [5]:
#Transforming publish time to datetime format
us_videos = pd.read_csv(csvfile, parse_dates=['publish_time'])

In [6]:
#Transforming trending date to datetime format
us_videos['trending_date'] = pd.to_datetime(us_videos['trending_date'],format = '%y.%d.%m')

In [7]:
#Validating datatype changes
us_videos.dtypes

video_id                               object
trending_date                  datetime64[ns]
title                                  object
channel_title                          object
category_id                             int64
publish_time              datetime64[ns, UTC]
tags                                   object
views                                   int64
likes                                   int64
dislikes                                int64
comment_count                           int64
thumbnail_link                         object
comments_disabled                        bool
ratings_disabled                         bool
video_error_or_removed                   bool
description                            object
dtype: object

In [8]:
#Added additional columns to our dataset
us_videos['country'] ='US'
us_videos['trending_month'] =us_videos['trending_date'].dt.month
us_videos['trending_year'] =us_videos['trending_date'].dt.year
us_videos['publish_month'] = us_videos['publish_time'].dt.month
us_videos['publish_year'] = us_videos['publish_time'].dt.year

In [9]:
us_videos.columns

Index(['video_id', 'trending_date', 'title', 'channel_title', 'category_id',
       'publish_time', 'tags', 'views', 'likes', 'dislikes', 'comment_count',
       'thumbnail_link', 'comments_disabled', 'ratings_disabled',
       'video_error_or_removed', 'description', 'country', 'trending_month',
       'trending_year', 'publish_month', 'publish_year'],
      dtype='object')

In [10]:
# Get rid of unnecessary columns
us_videos_df = us_videos[[ 'title', 'channel_title', 'category_id','views', 'likes', 'dislikes', 'comment_count',
                          'trending_date','country', 'trending_month', 'trending_year',
                          'publish_time', 'publish_month', 'publish_year']].copy()


In [11]:
# Data Transformation: Rename Columns
us_videos_df = us_videos_df.rename(columns={ "publish_time":"publish_date", "views":"view_count", 
                             "likes":"like_count", "dislikes":"dislike_count"})

Verifying and Removing Duplicates in the DataFrame

In [12]:
#Count Duplicates in the DataFrame
us_videos_count = us_videos_df['title'].count()
print(f'Total number of youtube videos in DataFrame before Cleanup: {us_videos_count}')
duplicate_count=us_videos_df.duplicated().sum()
print(f'Total number of duplicates in DataFrame before Cleanup: {duplicate_count}')

Total number of youtube videos in DataFrame before Cleanup: 40949
Total number of duplicates in DataFrame before Cleanup: 48


In [13]:
#Extracting Duplicate Rows
us_videos_df.loc[us_videos_df.duplicated(), :]

Unnamed: 0,title,channel_title,category_id,view_count,like_count,dislike_count,comment_count,trending_date,country,trending_month,trending_year,publish_date,publish_month,publish_year
34899,Why I'm So Scared (being myself and crying too...,grav3yardgirl,26,1469627,188652,3124,33032,2018-05-15,US,5,2018,2018-05-14 19:00:01+00:00,5,2018
34900,YoungBoy Never Broke Again Goes Sneaker Shoppi...,Complex,24,1199587,49709,2380,7261,2018-05-15,US,5,2018,2018-05-14 14:00:03+00:00,5,2018
34901,WE MADE OUR MOM CRY...HER DREAM CAME TRUE!,Lucas and Marcus,24,3906727,77378,12160,15874,2018-05-15,US,5,2018,2018-05-13 18:03:56+00:00,5,2018
34902,"周杰倫 Jay Chou【不愛我就拉倒 If You Don't Love Me, It's...",杰威爾音樂 JVR Music,10,916128,40485,1042,4746,2018-05-15,US,5,2018,2018-05-14 15:59:47+00:00,5,2018
34903,Terry Crews Answers the Web's Most Searched Qu...,WIRED,24,343967,16988,132,1308,2018-05-15,US,5,2018,2018-05-14 16:00:29+00:00,5,2018
34904,Why Stradivarius violins are worth millions,Vox,25,433833,12356,307,1129,2018-05-15,US,5,2018,2018-05-14 12:00:03+00:00,5,2018
34905,"$17 Pet vs. $100,000 Pet",BuzzFeedBlue,22,3081033,60379,6857,7796,2018-05-15,US,5,2018,2018-05-13 15:00:57+00:00,5,2018
34906,Sarah Paulson Gets Scared During '5 Second Rule',TheEllenShow,24,704786,19880,248,669,2018-05-15,US,5,2018,2018-05-14 13:00:00+00:00,5,2018
34907,Gabby Barrett Sings I Have Nothing by Whitney ...,American Idol,24,735031,11734,1468,1870,2018-05-15,US,5,2018,2018-05-14 02:23:01+00:00,5,2018
34908,"The ULTIMATE $30,000 Gaming PC Setup",Unbox Therapy,28,4700460,103430,8028,13293,2018-05-15,US,5,2018,2018-05-13 19:00:25+00:00,5,2018


In [14]:
#Removing Duplicates
us_videos_df.drop_duplicates(inplace=True)
us_videos_df.duplicated().sum()

0

In [15]:
#Final DataFrame after Data Cleanup
us_videos_count = us_videos_df['title'].count()
print(f'Total number of youtube videos in DataFrame after Cleanup: {us_videos_count}')
duplicate_count=us_videos_df.duplicated().sum()
print(f'Total number of duplicates in DataFrame after Cleanup: {duplicate_count}')

Total number of youtube videos in DataFrame after Cleanup: 40901
Total number of duplicates in DataFrame after Cleanup: 0


### JSON File cleanup

In [16]:
categories = pd.read_json("../DataFiles/US_category_id.json")
video_category = pd.json_normalize(categories['items'])
video_category.head()

Unnamed: 0,kind,etag,id,snippet.channelId,snippet.title,snippet.assignable
0,youtube#videoCategory,"""m2yskBQFythfE4irbTIeOgYYfBU/Xy1mB4_yLrHy_BmKm...",1,UCBR8-60-B28hp2BmDPdntcQ,Film & Animation,True
1,youtube#videoCategory,"""m2yskBQFythfE4irbTIeOgYYfBU/UZ1oLIIz2dxIhO45Z...",2,UCBR8-60-B28hp2BmDPdntcQ,Autos & Vehicles,True
2,youtube#videoCategory,"""m2yskBQFythfE4irbTIeOgYYfBU/nqRIq97-xe5XRZTxb...",10,UCBR8-60-B28hp2BmDPdntcQ,Music,True
3,youtube#videoCategory,"""m2yskBQFythfE4irbTIeOgYYfBU/HwXKamM1Q20q9BN-o...",15,UCBR8-60-B28hp2BmDPdntcQ,Pets & Animals,True
4,youtube#videoCategory,"""m2yskBQFythfE4irbTIeOgYYfBU/9GQMSRjrZdHeb1OEM...",17,UCBR8-60-B28hp2BmDPdntcQ,Sports,True


In [17]:
video_category.columns

Index(['kind', 'etag', 'id', 'snippet.channelId', 'snippet.title',
       'snippet.assignable'],
      dtype='object')

In [18]:
#Transforming id column in video_category_df to int-64 so we can match the same with CSV File category_id column
        #video_category['id'] = video_category_df['id'].astype('int64')
video_category['id'] = pd.to_numeric(video_category['id'])

In [19]:
# Get rid of unnecessary columns
video_category_df = video_category[['id','snippet.title']].copy()

In [20]:
#Rename column
video_category_df = video_category_df.rename(columns ={"snippet.title":"category_title"})

In [21]:
video_category_df.head()

Unnamed: 0,id,category_title
0,1,Film & Animation
1,2,Autos & Vehicles
2,10,Music
3,15,Pets & Animals
4,17,Sports


# Loading Data to Postgress Database

In [22]:
#Creating connection to Postgress database
from sqlalchemy import create_engine
engine = create_engine(f'postgresql://{username}:{password}@localhost:5432/etl_project')
#connection = engine.connect()

In [23]:
engine.table_names()

  engine.table_names()


['us_videos', 'video_category']

In [24]:
#Remove after final run
engine.execute("TRUNCATE TABLE us_videos, video_category")

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x1f1cef35340>

### Using pandas to load converted DataFrame to database

Load video_category file first as it has reference in us_video table

In [25]:
#Load data into video_category table
video_category_df.to_sql(name='video_category' , con=engine, if_exists= 'append', index=False)

In [26]:
#Verifying Results in Postgres Table
sql_category_count =pd.read_sql_query('select count(id) from video_category', con=engine)
sql_category_count

Unnamed: 0,count
0,32


In [27]:
#Load data into us_videos table
us_videos_df.to_sql(name ='us_videos', con =engine, if_exists ='append', index=False)

In [28]:
sql_video_count = pd.read_sql_query('Select count(id) from us_videos', con=engine)
sql_video_count

Unnamed: 0,count
0,40901


In [29]:
#Quering the reults from Postgres SQL
us_videos_sql = pd.read_sql("SELECT * FROM us_videos", con=engine)
video_category_sql = pd.read_sql("SELECT * FROM video_category", con=engine)

## Joining Datasets in Jupyter Notebook


In [30]:
combined_df = us_videos_sql.merge(video_category_sql, left_on='category_id', right_on='id')
combined_df.head()

Unnamed: 0,id_x,title,channel_title,category_id,view_count,like_count,dislike_count,comment_count,country,trending_date,trending_month,trending_year,publish_date,publish_month,publish_year,id_y,category_title
0,286308,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,748374,57527,2966,15954,US,2017-11-14,11,2017,2017-11-13 11:13:01,11,2017,22,People & Blogs
1,286343,Me-O Cats Commercial,Nobrand,22,98966,2486,184,532,US,2017-11-14,11,2017,2017-04-21 01:47:32,4,2017,22,People & Blogs
2,286352,"AFFAIRS, EX BOYFRIENDS, $18MILLION NET WORTH -...",Shawn Johnson East,22,321053,4451,1772,895,US,2017-11-14,11,2017,2017-11-11 09:00:03,11,2017,22,People & Blogs
3,286362,BLIND(folded) CAKE DECORATING CONTEST (with Mo...,Grace Helbig,22,197062,7250,217,456,US,2017-11-14,11,2017,2017-11-11 12:08:04,11,2017,22,People & Blogs
4,286374,Wearing Online Dollar Store Makeup For A Week,Safiya Nygaard,22,2744430,115426,1110,6541,US,2017-11-14,11,2017,2017-11-10 19:19:33,11,2017,22,People & Blogs


In [31]:
#Keeping the category_id from us_videos table and removing it from video_category(id_x) table.
#Removing unique id(id_x) generated in SQL Serial from us_videos table
combined_df= combined_df.drop(columns=['id_x','id_y'])

In [32]:
combined_df.head()

Unnamed: 0,title,channel_title,category_id,view_count,like_count,dislike_count,comment_count,country,trending_date,trending_month,trending_year,publish_date,publish_month,publish_year,category_title
0,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,748374,57527,2966,15954,US,2017-11-14,11,2017,2017-11-13 11:13:01,11,2017,People & Blogs
1,Me-O Cats Commercial,Nobrand,22,98966,2486,184,532,US,2017-11-14,11,2017,2017-04-21 01:47:32,4,2017,People & Blogs
2,"AFFAIRS, EX BOYFRIENDS, $18MILLION NET WORTH -...",Shawn Johnson East,22,321053,4451,1772,895,US,2017-11-14,11,2017,2017-11-11 09:00:03,11,2017,People & Blogs
3,BLIND(folded) CAKE DECORATING CONTEST (with Mo...,Grace Helbig,22,197062,7250,217,456,US,2017-11-14,11,2017,2017-11-11 12:08:04,11,2017,People & Blogs
4,Wearing Online Dollar Store Makeup For A Week,Safiya Nygaard,22,2744430,115426,1110,6541,US,2017-11-14,11,2017,2017-11-10 19:19:33,11,2017,People & Blogs


 ## Joining the two table using SQL Query

In [33]:
#Quering the join reults from Postgres SQL
query_join = pd.read_sql("SELECT v.id, title, category_title FROM us_videos v INNER JOIN video_category vc ON (category_id = vc.id)", con=engine)


In [34]:
query_join.head()

Unnamed: 0,id,title,category_title
0,286308,WE WANT TO TALK ABOUT OUR MARRIAGE,People & Blogs
1,286309,The Trump Presidency: Last Week Tonight with J...,Entertainment
2,286310,"Racist Superman | Rudy Mancuso, King Bach & Le...",Comedy
3,286311,Nickelback Lyrics: Real or Fake?,Entertainment
4,286312,I Dare You: GOING BALD!?,Entertainment
