### **CS 3300 - Final Project**

###### Paige Rosynek & Xavier Robbins

#### **Import Libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import nlp_functions
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
import spacy

#### **Data Cleaning**

**Import the Data**

In [None]:
# Load the US trending-videos CSV; the path is relative to the notebook's working directory
trending_df = pd.read_csv('../data/US_youtube_trending_data.csv')

In [None]:
# Parse the category-id JSON file, then flatten the records under its
# top-level 'items' key into one DataFrame row per record
with open('../data/US_category_id.json', 'r') as f:
    data = json.load(f)

category_df = pd.json_normalize(data, record_path='items')

In [None]:
trending_df.head()

In [None]:
trending_df.describe()

In [None]:
trending_df.info()

In [None]:
category_df.head()

In [None]:
category_df.describe()

In [None]:
category_df.info()

In [None]:
# The merge joins trending_df['categoryId'] against category_df['id'], so both
# keys need the same dtype. Cast 'id' to int64 here
# (presumably the JSON stores ids as strings and categoryId is numeric — TODO confirm with .info()).
category_df['id'] = category_df['id'].astype('int64')

In [None]:
trending_df['channelId'].value_counts()

In [None]:
category_df['snippet.channelId'].value_counts()

**Merge Datasets Into Single DataFrame**

In [None]:
# Attach the category metadata to each trending row. This is an inner join, so
# trending rows whose categoryId has no match in category_df are dropped.
df = trending_df.merge(category_df, how='inner', left_on='categoryId', right_on='id')

In [None]:
df.shape

In [None]:
df.head()

##### **Clean Merged Dataset**

In [None]:
# Keep the category name (stored as 'snippet.title' after the merge), discard the
# remaining category-metadata / join-key / thumbnail columns, and re-attach the
# name as a trailing 'category' column.
category_col = df['snippet.title']
df = df.drop(
    columns=[
        'snippet.channelId',
        'id',
        'snippet.assignable',
        'etag',
        'kind',
        'categoryId',
        'snippet.title',
        'thumbnail_link',
    ]
).assign(category=category_col)

**Convert Merged Dataframe Columns to Correct Type**

In [None]:
# Identifier and text columns are stored with the pandas 'category' dtype
categorical_cols = ['video_id', 'title', 'channelId', 'channelTitle', 'tags', 'description', 'category']
df = df.astype({col: 'category' for col in categorical_cols})

# Timestamp columns are parsed into pandas datetimes
for date_col in ('publishedAt', 'trending_date'):
    df[date_col] = pd.to_datetime(df[date_col])

In [None]:
df.head()

In [None]:
# Spot check: show every row recorded for one hard-coded video id
df[df['video_id'] == '3C66w5Z0ixs']

In [None]:
df.info()

<mark>**TODO:** Decide which additional feature columns to add and how to handle the text columns (current idea: vectorize them and add everything to the DataFrame).</mark>

Candidate tool for the text columns: `TfidfVectorizer`

- morning, afternoon, night ?
- day of the week
- month ?
- days between trending videos
- days between publish & trending date
- number of days trending

**Upload & Trending Day of Week**

- Encoding: 0 = Monday, 1 = Tuesday, ..., 6 = Sunday

In [None]:
# Weekday index (Monday = 0 ... Sunday = 6) of the upload and trending timestamps
df = df.assign(
    dayofweek_upload=df['publishedAt'].dt.dayofweek,
    dayofweek_trending=df['trending_date'].dt.dayofweek,
)

**Number of Days Trending** (TODO: not implemented yet)

**Clean Tags Column**

In [132]:
# Tags arrive as one '|'-delimited string per row; turn the delimiters into spaces so
# the column can be tokenized like ordinary text.
# Uses the vectorized string accessor instead of a per-row Python lambda:
#   * regex=False makes '|' a literal character rather than regex alternation
#   * missing values pass through as NaN instead of raising AttributeError
# Re-running this cell is a no-op, since no '|' remains after the first run.
df['tags'] = df['tags'].str.replace('|', ' ', regex=False)

0         brawadis prank basketball skits ghost funny vi...
1                                                    [None]
2                  smosh smosh pit smosh games funny comedy
3         the bachelor the bachelorette Tyler c Tyler Ca...
4         farming family farm agriculture agriculture jo...
                                ...                        
177385    denzel washington interview discovering the de...
177386    denzel washington interview discovering the de...
177387    denzel washington interview discovering the de...
177388    denzel washington interview discovering the de...
177389    denzel washington interview discovering the de...
Name: tags, Length: 177390, dtype: object

##### **Extract Words from Text Columns**

- columns: title, tags, description

In [None]:
# spaCy pipeline handed to nlp_functions.normalize_text in the vocabulary cell below
# (requires the 'en_core_web_sm' model package to be installed in the kernel's environment)
lang_model = spacy.load("en_core_web_sm")
# TF-IDF vectorizer with default settings; fitted on the normalized text in the vocabulary cell below
vectorizer = TfidfVectorizer()

In [133]:
# Build the normalized text corpus from the title, tags and description columns
# and fit the TF-IDF vectorizer on it.
#
# The same video appears on many trending dates, so its title/tags/description
# repeat across rows. Normalization is the expensive step, so collect the
# DISTINCT raw strings first and normalize each one exactly once, rather than
# once per row.
text_columns = ['title', 'tags', 'description']

raw_texts = set()
for text_col in text_columns:
    # dropna(): missing values are skipped rather than handed to normalize_text
    # (NOTE(review): assumes normalize_text expects a str — confirm in nlp_functions)
    raw_texts.update(df[text_col].dropna().unique())

# Distinct raw strings can still collapse to the same normalized form, hence a set
vocab_normalized = {
    nlp_functions.normalize_text(text, lang_model, lemmatizing=True, stop_words=True, numeric=True)
    for text in raw_texts
}

# A set has no stable iteration order between kernel runs; sort it so the rows of
# the TF-IDF matrix are reproducible and can be mapped back to their documents.
vocab_docs = sorted(vocab_normalized)
vocab_vectorized = vectorizer.fit_transform(vocab_docs)

KeyboardInterrupt: 