In [None]:
import kagglehub
import pandas as pd
import os

# Source data: the Trending YouTube Video dataset hosted on Kaggle.
# kagglehub downloads the latest version and hands back the local directory
# that holds the extracted files.
DATASET_HANDLE = "datasnaek/youtube-new"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/datasnaek/youtube-new?dataset_version_number=115...


100%|██████████| 201M/201M [00:05<00:00, 37.3MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/datasnaek/youtube-new/versions/115


In [None]:
# List every entry in the downloaded dataset directory; `files` is reused by the
# next cell to confirm the expected CSV is present before loading it.
files = os.listdir(path)
print("Files in dataset:", files)

Files in dataset: ['GB_category_id.json', 'FRvideos.csv', 'GBvideos.csv', 'US_category_id.json', 'JP_category_id.json', 'DE_category_id.json', 'RUvideos.csv', 'JPvideos.csv', 'RU_category_id.json', 'USvideos.csv', 'DEvideos.csv', 'MX_category_id.json', 'INvideos.csv', 'MXvideos.csv', 'FR_category_id.json', 'IN_category_id.json', 'CA_category_id.json', 'KRvideos.csv', 'KR_category_id.json', 'CAvideos.csv']


In [None]:
# Load the US trending-videos table into `data`, the frame every later cell works on.
US_VIDEOS_FILENAME = 'USvideos.csv'
file_to_load = os.path.join(path, US_VIDEOS_FILENAME)
if US_VIDEOS_FILENAME not in files:
    # Fail fast: merely printing here would leave `data` undefined (or stale from
    # an earlier run), and the later cells would then break far from the real cause.
    raise FileNotFoundError("The expected file 'USvideos.csv' is not in the dataset.")

data = pd.read_csv(file_to_load)
print(data.head())

      video_id trending_date  \
0  2kyS6SvSYSE      17.14.11   
1  1ZAPwfrtAFY      17.14.11   
2  5qpjK5DgCt4      17.14.11   
3  puqaWrEC7tY      17.14.11   
4  d380meD0W0M      17.14.11   

                                               title          channel_title  \
0                 WE WANT TO TALK ABOUT OUR MARRIAGE           CaseyNeistat   
1  The Trump Presidency: Last Week Tonight with J...        LastWeekTonight   
2  Racist Superman | Rudy Mancuso, King Bach & Le...           Rudy Mancuso   
3                   Nickelback Lyrics: Real or Fake?  Good Mythical Morning   
4                           I Dare You: GOING BALD!?               nigahiga   

   category_id              publish_time  \
0           22  2017-11-13T17:13:01.000Z   
1           24  2017-11-13T07:30:00.000Z   
2           23  2017-11-12T19:05:24.000Z   
3           24  2017-11-13T11:00:04.000Z   
4           24  2017-11-12T18:01:41.000Z   

                                                tags    views   lik

In [None]:
# Preprocessing

import pandas as pd
import string
from nltk.corpus import stopwords
import nltk

#fetch the NLTK stopword corpus, then keep the English words in a set - Jessica
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)


#load the CSV file (superseded - kept for reference only)
"""
file_path = '/content/USvideos.csv'    #No longer needed: kagglehub already downloads the dataset and
data = pd.read_csv(file_path)          #USvideos.csv is read into `data` in an earlier cell, so we do not load a local copy here - Sloane, Jessica, Persabella
"""

#Preprocessing function, removes punctuations and stopwords from our video titles - Sloane
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)  #built once, deletes every ASCII punctuation char


def preprocess_text_simple(text):
    """Turn a raw title into a list of lowercase tokens.

    The text is lowercased, every character in ``string.punctuation`` is
    deleted, the result is split on whitespace, and any token found in the
    module-level ``stop_words`` set is dropped.

    Parameters
    ----------
    text : str
        Raw title. Any non-string value (e.g. the ``NaN`` float pandas uses
        for a missing cell, or ``None``) is treated as having no tokens.

    Returns
    -------
    list of str
        Remaining tokens in their original order; ``[]`` when nothing is left.
    """
    #missing titles arrive as float NaN / None and have no .lower(); treat them as empty
    if not isinstance(text, str):
        return []
    cleaned = text.lower().translate(_PUNCTUATION_TABLE)
    return [word for word in cleaned.split() if word not in stop_words]

#tokenise every 'title', then keep only the first row seen for each distinct title - Jessica
data['title_preprocessed'] = data['title'].apply(preprocess_text_simple)
repeated_title = data.duplicated(subset='title', keep='first')
data = data[~repeated_title]

preview_columns = ['title', 'title_preprocessed']
print(data[preview_columns].head())   #preview


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


                                               title  \
0                 WE WANT TO TALK ABOUT OUR MARRIAGE   
1  The Trump Presidency: Last Week Tonight with J...   
2  Racist Superman | Rudy Mancuso, King Bach & Le...   
3                   Nickelback Lyrics: Real or Fake?   
4                           I Dare You: GOING BALD!?   

                                  title_preprocessed  
0                             [want, talk, marriage]  
1  [trump, presidency, last, week, tonight, john,...  
2  [racist, superman, rudy, mancuso, king, bach, ...  
3                   [nickelback, lyrics, real, fake]  
4                                [dare, going, bald]  


We ran into problems with our neural-network approach: the similarity scores it produced were either exactly 1 or about 99.9% (so, essentially 1). We had been using cosine similarity to compute those scores, so we tried other methods (besides neural networks and cosine similarity) to see what results we could get.

As seen in the above code, I built a system around a neural network model that scores videos by dot product instead of cosine similarity. Unfortunately, the similarity scores were not accurate, and the code would just recommend the same videos every time.

-- Jessica

In [None]:
#Recommendations by Euclidean Distance - Sloane
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
import time
import random

start_time = time.time() #starts a timer so we can see how long it takes for the code to run

#"Where are we?" and "ME YOU YOU ME" are made up entirely of stopwords, so they
#preprocess to an empty token list and get an all-zero TF-IDF row. TfidfVectorizer
#L2-normalises rows by default, so a zero row is at distance 1 from every other
#title - closer than most genuine matches - which is why these two showed up in
#every result. The explicit list is kept, and the general case is handled below - Sloane
exclude_titles = ["Where are we?", "ME YOU YOU ME"]
data = data[~data['title'].isin(exclude_titles)]

#dedupe first so duplicate titles are not preprocessed for nothing; .copy() gives an
#independent frame so the column assignment below never targets a slice
data = data.drop_duplicates(subset='title', keep='first').copy()
data['title_preprocessed'] = data['title'].apply(preprocess_text_simple) #load the preprocess

vectorizer = TfidfVectorizer(stop_words='english')  #vectorize the titles
title_vectors = vectorizer.fit_transform(data['title_preprocessed'].apply(' '.join)).toarray()

#drop every title whose vector is all zeros (no token survived preprocessing or the
#vectorizer's own filtering); `data` and `title_vectors` stay aligned row for row
has_terms = title_vectors.any(axis=1)
data = data[has_terms].copy()
title_vectors = title_vectors[has_terms]

distances = pairwise_distances(title_vectors, metric='euclidean')

#this converts distance to similarities; the recommender below recomputes the same
#formula per result, so this full matrix is not read by the code in this cell
similarities = 1 / (1 + distances)

#recommendation function
def recommend_videos_by_title_euclidean(video_index, num_recommendations=10):
    """Return the videos whose titles are nearest to the query video.

    Reads the module-level ``distances`` matrix and ``data`` frame; row ``i`` of
    ``distances`` is looked up with ``data.iloc[i]``, so both must share the
    same positional order.

    Parameters
    ----------
    video_index : int
        Position (as used by ``.iloc``, not the index label) of the query
        video. Negative positions count from the end.
    num_recommendations : int, default 10
        Maximum number of videos to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``views``, ``distance`` and ``similarity``
        (``1 / (1 + distance)``), ordered from nearest to farthest. The query
        video itself is never included.
    """
    row = np.asarray(distances[video_index], dtype=float)
    if video_index < 0:
        video_index += len(row)  #normalise so the self-exclusion below matches

    #stable sort: ties keep their original positional order
    order = np.argsort(row, kind='stable')
    #exclude the query by position rather than assuming it sorted first; another
    #title with an identical vector (distance 0) can legitimately precede it
    order = order[order != video_index][:max(num_recommendations, 0)]

    recommendations = data.iloc[order][['title', 'views']].copy()
    recommendations['distance'] = row[order]
    recommendations['similarity'] = 1 / (1 + recommendations['distance'])
    return recommendations[['title', 'views', 'distance', 'similarity']]


#choose a random row position to act as the query video
last_position = len(data) - 1
random_video_index = random.randint(0, last_position)
query_title = data.iloc[random_video_index]['title']
print("Euclidean Recommendations for:", query_title)

num_recommendations = 10
recommendations = recommend_videos_by_title_euclidean(random_video_index, num_recommendations)
#the timer is stopped before anything is printed, so it covers the work since start_time only
end_time = time.time()

print(recommendations)
print()
print(f"Time taken: {end_time - start_time:.2f} seconds")


Euclidean Recommendations for: When the girl you're wrestling has a tough little brother  - 979280
                                                   title    views  distance  \
38355                  You're Not Edgy, You're Just Lazy   795307  1.199852   
16400                               Sumo Wrestling in 4K   655388  1.237355   
5678                       I Have (a little bit) HAD IT!   216921  1.263428   
10282   Dogs See Their Brother After 10 Months  - 980323     7251  1.268107   
30367                 You're Too Good To Date My Friends   705768  1.270018   
17     How does your body know you're full? - Hilary ...    78044  1.299193   
170       Train - Have Yourself a Merry Little Christmas    30021  1.302036   
33539  You're Too Good To Date My Friends | Hardly Wo...  1161472  1.304480   
18006               The Girl Next Door (Valentine's Day)    55778  1.304619   
26971                    How Girl Scout Cookies Are Made   237087  1.304939   

       similarity  
38355    0.

This code was part of a larger project for an Artificial Intelligence course at Belmont University. I am uploading only my Euclidean-distance code, along with the preprocessing code it needs.