### **CS 3300 - Final Project**

###### Paige Rosynek & Xavier Robbins

#### **Import Libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from nlp_functions import normalize_text
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
import spacy

#### **Data Cleaning**

**Import the Data**

In [None]:
# Load the US trending-video records; the path is relative to the
# directory the notebook kernel is running from.
trending_df = pd.read_csv('../data/US_youtube_trending_data.csv')

In [None]:
# Load the category lookup with the standard json module. The file is read
# explicitly as UTF-8 (the JSON interchange encoding) instead of whatever the
# platform default happens to be, and json.load parses directly from the
# open file handle.
with open('../data/US_category_id.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Flatten each entry under "items" into one row; nested fields become
# dotted column names such as 'snippet.title'.
category_df = pd.json_normalize(data, record_path=['items'])

In [None]:
# Preview the first rows of the trending data
trending_df.head()

In [None]:
# Summary statistics for the trending data
trending_df.describe()

In [None]:
# Column dtypes and non-null counts for the trending data
trending_df.info()

In [None]:
# Preview the first rows of the category lookup table
category_df.head()

In [None]:
# Summary statistics for the category lookup table
category_df.describe()

In [None]:
# Column dtypes and non-null counts for the category lookup table
category_df.info()

In [None]:
# convert type of column to make merge successful
category_df['id'] = category_df['id'].astype('int64')

In [None]:
# Number of trending rows per channelId
trending_df['channelId'].value_counts()

In [None]:
# Distribution of snippet.channelId values in the category table
category_df['snippet.channelId'].value_counts()

**Merge Datasets Into Single DataFrame**

In [None]:
# Attach the category metadata to every trending row. The inner join keeps
# only trending rows whose categoryId has a matching category id.
df = trending_df.merge(
    category_df,
    how='inner',
    left_on='categoryId',
    right_on='id',
)

In [None]:
# (rows, columns) of the merged frame
df.shape

In [None]:
# Preview the merged frame
df.head()

##### **Clean Merged Dataset**

In [None]:
# Keep the category name (snippet.title) under the friendlier name
# 'category', appended as the last column, and discard the join keys and
# category metadata that are no longer needed.
unused_columns = [
    'snippet.channelId',
    'id',
    'snippet.assignable',
    'etag',
    'kind',
    'categoryId',
    'snippet.title',
    'thumbnail_link',
]
category_col = df['snippet.title']
df = df.drop(columns=unused_columns)
df['category'] = category_col

**Convert Merged Dataframe Columns to Correct Type**

In [None]:
# Identifier and text columns are stored as pandas categoricals.
categorical_columns = [
    'video_id',
    'title',
    'channelId',
    'channelTitle',
    'tags',
    'description',
    'category',
]
for column in categorical_columns:
    df[column] = df[column].astype('category')

# Timestamp columns are parsed into datetimes so the .dt accessor can be used.
for column in ['publishedAt', 'trending_date']:
    df[column] = pd.to_datetime(df[column])

In [None]:
# Preview the frame after the dtype conversions
df.head()

In [None]:
# Confirm the converted dtypes
df.info()

<mark>TODO: Decide which additional feature columns to add, and how to handle the text columns (current idea: add everything to the DataFrame).</mark>

`TfidfVectorizer`

- morning, afternoon, night ?
- day of the week
- month ?
- days between trending videos
- days between publish & trending date
- number of days trending

**Upload & Trending Day of Week**

- Encoding: 0 = Monday, 1 = Tuesday, …, 6 = Sunday

In [None]:
# Map each new weekday column to the datetime column it is derived from.
weekday_sources = {
    'dayofweek_upload': 'publishedAt',
    'dayofweek_trending': 'trending_date',
}
for weekday_column, date_column in weekday_sources.items():
    df[weekday_column] = df[date_column].dt.dayofweek

Build two DataFrames: one with the first occurrence of each video id and one with the last occurrence.

In [None]:
# Flag every row that is NOT the first (resp. last) appearance of its video_id.
is_repeat_of_earlier = df.duplicated(subset='video_id', keep='first')
is_followed_by_later = df.duplicated(subset='video_id', keep='last')

# One row per video in each frame. Rows keep the order in which that
# occurrence appears in df, so the two frames are not guaranteed to list
# videos in the same order.
first_video_occ_df = df.loc[~is_repeat_of_earlier].reset_index(drop=True)
last_video_occ_df = df.loc[~is_followed_by_later].reset_index(drop=True)

Create a new DataFrame from the first occurrences; columns from the last-occurrence DataFrame are added afterwards.

In [None]:
# Columns whose values change while a video is trending get a 'start_'
# name, marking them as the values at the first occurrence.
start_column_names = {
    'trending_date': 'start_trend_date',
    'view_count': 'start_view_count',
    'likes': 'start_likes',
    'dislikes': 'start_dislikes',
    'comment_count': 'start_comment_count',
}
new_df = first_video_occ_df.rename(columns=start_column_names)

Add the last-occurrence values to the new DataFrame for the columns that change over time.

In [None]:
# The first- and last-occurrence frames are each ordered by where that
# occurrence appears in df, so row i of one is not necessarily the same video
# as row i of the other. Copying columns across by position would attach one
# video's final counts to a different video, so each video's last snapshot is
# looked up by video_id instead.
end_columns = {
    'trending_date': 'end_trend_date',
    'view_count': 'end_view_count',
    'likes': 'end_likes',
    'dislikes': 'end_dislikes',
    'comment_count': 'end_comment_count',
}

# Plain-string keys avoid categorical-index lookup quirks; video_id is unique
# in last_video_occ_df because it was de-duplicated on that column.
last_by_video = last_video_occ_df.set_index(last_video_occ_df['video_id'].astype(str))
video_keys = new_df['video_id'].astype(str)

# One row per row of new_df, in new_df's order.
last_for_each_row = last_by_video.loc[video_keys, list(end_columns)]
# Re-label with new_df's index so the column assignment below aligns
# row-for-row and every dtype (including datetimes) is preserved.
last_for_each_row.index = new_df.index

for source_column, end_column in end_columns.items():
    new_df[end_column] = last_for_each_row[source_column]

Sort by `video_id`, reset the index, and preview the first rows

In [None]:
# Order the rows by video_id and renumber the index from 0.
new_df = (
    new_df
    .sort_values(by='video_id')
    .reset_index(drop=True)
)
new_df.head()

In [None]:
# Check the columns and dtypes of the per-video frame
new_df.info()

**Number of Days Trending**

In [None]:
# Whole days between the first and last trending date of each video
# (0 when both fall on the same date).
trend_span = new_df['end_trend_date'] - new_df['start_trend_date']
new_df['number_days_trend'] = trend_span.dt.days
new_df.head()

In [None]:
# Confirm the new number_days_trend column and its dtype
new_df.info()

##### **Extract Words from Text Columns**

- columns: title, tags, description

In [None]:
# Load spaCy's "en_core_web_sm" pipeline; the model package must already be
# installed in the kernel's environment.
lang_model = spacy.load("en_core_web_sm")

In [None]:
# NOTE: disabled exploratory loop, kept for reference. It would print the
# normalized form of every title using normalize_text from nlp_functions.
# for title in df['title']:
#     print(normalize_text(title, lang_model, lemmatizing=True, stop_words=True))