<a href="https://colab.research.google.com/github/shilpathota/RecommendationSystem/blob/main/EnhancedRecommendationSystem_Phase2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# ✅ STEP 1: Install Google API Client (only needed once per session)
!pip install --quiet google-api-python-client

In [4]:
# ✅ STEP 2: Import libraries
import pandas as pd
from googleapiclient.discovery import build
import getpass

In [5]:
# ✅ STEP 3: Securely input API key
# Service identifiers passed to build() when the API client is created.
YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION = "youtube", "v3"
# getpass hides the typed key, so it is not echoed into the cell output.
API_KEY = getpass.getpass(prompt='🔑 Enter your YouTube API key: ')


🔑 Enter your YouTube API key: ··········


In [6]:
# ✅ STEP 4: Initialize API client
# Single shared client; the search and metadata helpers reference it as `youtube`.
youtube = build(
    serviceName=YOUTUBE_API_SERVICE_NAME,
    version=YOUTUBE_API_VERSION,
    developerKey=API_KEY,
)


In [7]:
# ✅ STEP 5: Search YouTube videos by topic (age-specific query)
def search_youtube_videos(query="learning ABC", max_results=10):
    """Return the video IDs found by a YouTube search for ``query``.

    Args:
        query: Search phrase sent as the API's ``q`` parameter.
        max_results: Value sent as the API's ``maxResults`` parameter.

    Returns:
        A list holding the ``videoId`` of every item in the search response.
    """
    # Uses the module-level ``youtube`` client; only video results are requested.
    search_request = youtube.search().list(
        part="snippet",
        type="video",
        q=query,
        maxResults=max_results,
    )
    search_response = search_request.execute()

    found_ids = []
    for result in search_response['items']:
        found_ids.append(result['id']['videoId'])
    return found_ids


In [8]:
# ✅ STEP 6: Get metadata for a list of video IDs
def get_video_metadata(video_ids):
    """Fetch title, channel, tag and statistics metadata for YouTube videos.

    Args:
        video_ids: Iterable of YouTube video ID strings.

    Returns:
        A list of dicts (one per item the API returns) with the keys
        ``video_id``, ``title``, ``description``, ``channel_title``, ``tags``,
        ``published_at``, ``view_count``, ``like_count`` and ``comment_count``.
        The three counts are ints; a missing or non-numeric count becomes 0.
        An empty ``video_ids`` yields ``[]`` without calling the API.
    """
    video_ids = list(video_ids)
    if not video_ids:
        # Nothing to look up; an empty id filter would only produce an API error.
        return []

    def _as_int(value):
        # The API reports counts as strings, while the fallback default is the
        # int 0; normalise both so the column has a single dtype.
        try:
            return int(value)
        except (TypeError, ValueError):
            return 0

    # NOTE(review): videos.list is understood to accept at most 50 ids per
    # request, so longer inputs are sent in batches — confirm against the API docs.
    batch_size = 50

    video_data = []
    for start in range(0, len(video_ids), batch_size):
        batch = video_ids[start:start + batch_size]
        response = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=",".join(batch)
        ).execute()

        for item in response.get('items', []):
            snippet = item['snippet']
            # Individual counters (or the whole statistics part) may be absent.
            stats = item.get('statistics', {})
            video_data.append({
                'video_id': item['id'],
                'title': snippet['title'],
                'description': snippet.get('description', ''),
                'channel_title': snippet['channelTitle'],
                'tags': snippet.get('tags', []),
                'published_at': snippet['publishedAt'],
                'view_count': _as_int(stats.get('viewCount', 0)),
                'like_count': _as_int(stats.get('likeCount', 0)),
                'comment_count': _as_int(stats.get('commentCount', 0)),
            })
    return video_data

In [9]:
# ✅ STEP 7: Run the Search + Fetch
query = "nursery rhymes for toddlers"
# First collect the matching video IDs, then look up the metadata for those IDs.
video_ids = search_youtube_videos(query=query, max_results=15)
video_metadata = get_video_metadata(video_ids=video_ids)


In [10]:
# ✅ STEP 8: Save to CSV or show dataframe
df = pd.DataFrame(data=video_metadata)
# index=False keeps the positional row index out of the CSV file.
df.to_csv(path_or_buf="youtube_kids_data.csv", index=False)
df.head(n=5)

Unnamed: 0,video_id,title,description,channel_title,tags,published_at,view_count,like_count,comment_count
0,drkOBuiGPCM,Old MacDonald Song with Safari Animals | Lalaf...,"Old MacDonald meets safari animals like lions,...",Lalafun - Nursery Rhymes,"[lalafun, toddler songs, nursery rhymes, anima...",2024-11-18T07:04:52Z,141513677,185887,0
1,5gZOYKHXwyQ,Nursery Rhymes for Kids | Songs Compilation - ...,Itsy Bitsy Spider and many more children songs...,Twinkle Little Songs - Nursery Rhymes,"[nursery rhymes, songs for kids, children song...",2019-07-26T16:15:17Z,66150643,129531,0
2,e_04ZrNroTo,Wheels on the Bus | @CoComelon Nursery Rhymes ...,Bounce along in the bus all over town with thi...,Cocomelon - Nursery Rhymes,"[preschool, toddler, children songs, abckidtv,...",2018-05-24T07:00:02Z,7505976360,19687613,0
3,hxOApe1P9dM,Humpty Dumpty Grocery Store | CoComelon Nurser...,Look at Humpty Dumpty bounce and roll! Nina go...,Cocomelon - Nursery Rhymes,"[cocomelon, abckidtv, nursery rhymes, children...",2023-08-22T09:30:16Z,939645449,1378563,0
4,buE6l32rCHo,Incy Wincy Spider and More Nursery Rhymes for ...,Watch cute Incy Wincy Spider in this Itsy Bits...,Toddler Fun Learning,"[itsy bitsy spider, itsy bitsy spider nursery ...",2015-05-01T11:00:00Z,54651699,73951,0


In [12]:
import base64
import json
import requests

# 🔐 Securely input GitHub token
github_token = getpass.getpass('🔐 Enter your GitHub token: ')

# 🔗 GitHub repo details
GITHUB_USERNAME = "shilpathota"
REPO_NAME = "RecommendationSystem"
FILE_PATH = "data/youtube_kids_data.csv"  # path inside the repo
COMMIT_MESSAGE = "Add latest YouTube kids metadata"
BRANCH = "main"  # branch that is both inspected and written
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection blocks the cell forever

# 📤 Read the CSV content (the contents API expects it base64-encoded)
with open("youtube_kids_data.csv", "rb") as file:
    content = file.read()
encoded_content = base64.b64encode(content).decode("utf-8")

# 🧠 GitHub API URL
url = f"https://api.github.com/repos/{GITHUB_USERNAME}/{REPO_NAME}/contents/{FILE_PATH}"
headers = {"Authorization": f"token {github_token}"}

# Look the file up on the SAME branch we commit to: an existing file can only be
# updated when its current blob SHA is sent along with the PUT request.
response = requests.get(url, headers=headers, params={"ref": BRANCH}, timeout=REQUEST_TIMEOUT)
if response.status_code == 200:
    sha = response.json()['sha']  # Needed for update
else:
    sha = None

# 📤 Upload the file (PUT both creates and updates)
data = {
    "message": COMMIT_MESSAGE,
    "content": encoded_content,
    "branch": BRANCH
}
if sha:
    data["sha"] = sha

response = requests.put(url, headers=headers, data=json.dumps(data), timeout=REQUEST_TIMEOUT)

if response.status_code in [200, 201]:
    print("✅ File pushed to GitHub!")
else:
    # The error body is not guaranteed to be JSON; fall back to the raw text.
    try:
        detail = response.json()
    except ValueError:
        detail = response.text
    print("❌ Failed to push:", detail)


🔐 Enter your GitHub token: ··········
✅ File pushed to GitHub!


In [13]:
import pandas as pd

# Age-query mapping, written as one (query, age_min, age_max, category) row per search query
_MAPPING_FIELDS = ("query", "age_min", "age_max", "category")
_MAPPING_ROWS = [
    ("nursery rhymes for toddlers", 3, 5, "Music"),
    ("learning ABC for preschoolers", 3, 5, "Education"),
    ("colors and shapes for kids", 3, 5, "Education"),
    ("science for kids age 6 to 8", 6, 8, "Science"),
    ("math games for kids", 6, 8, "Math"),
    ("bedtime stories for kids", 6, 8, "Storytelling"),
    ("coding for kids age 9", 9, 12, "Programming"),
    ("solar system for kids", 9, 12, "Space/Science"),
    ("how to draw for kids", 9, 12, "Arts & Crafts"),
    ("STEM activities for kids", 9, 12, "STEM"),
]
# Expand every row into a dict keyed by field name (same records, same key order).
mapping_data = [dict(zip(_MAPPING_FIELDS, row)) for row in _MAPPING_ROWS]

# Build the DataFrame and persist it without the row index
mapping_df = pd.DataFrame(mapping_data)
mapping_df.to_csv("age_query_mapping.csv", index=False)

print("✅ Mapping CSV created successfully!")
mapping_df


✅ Mapping CSV created successfully!


Unnamed: 0,query,age_min,age_max,category
0,nursery rhymes for toddlers,3,5,Music
1,learning ABC for preschoolers,3,5,Education
2,colors and shapes for kids,3,5,Education
3,science for kids age 6 to 8,6,8,Science
4,math games for kids,6,8,Math
5,bedtime stories for kids,6,8,Storytelling
6,coding for kids age 9,9,12,Programming
7,solar system for kids,9,12,Space/Science
8,how to draw for kids,9,12,Arts & Crafts
9,STEM activities for kids,9,12,STEM


In [14]:
import base64
import json
import requests
import getpass

# 🔐 Enter your GitHub token
github_token = getpass.getpass('🔐 Enter your GitHub token: ')

# GitHub repository info
GITHUB_USERNAME = "shilpathota"  # 👈 change this
REPO_NAME = "RecommendationSystem"            # 👈 change this
FILE_PATH = "data/age_query_mapping.csv"  # 👈 path inside repo
COMMIT_MESSAGE = "Add age-query mapping CSV"
BRANCH = "main"  # branch that is both inspected and written
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection blocks the cell forever

# Read CSV file and encode (the contents API expects base64)
with open("age_query_mapping.csv", "rb") as file:
    content = file.read()
encoded_content = base64.b64encode(content).decode("utf-8")

# Check if file already exists ON THE TARGET BRANCH to get its SHA
url = f"https://api.github.com/repos/{GITHUB_USERNAME}/{REPO_NAME}/contents/{FILE_PATH}"
headers = {"Authorization": f"token {github_token}"}
response = requests.get(url, headers=headers, params={"ref": BRANCH}, timeout=REQUEST_TIMEOUT)

sha = response.json()['sha'] if response.status_code == 200 else None

# Prepare PUT request (PUT both creates and updates; updates need the SHA)
data = {
    "message": COMMIT_MESSAGE,
    "content": encoded_content,
    "branch": BRANCH
}
if sha:
    data["sha"] = sha

response = requests.put(url, headers=headers, data=json.dumps(data), timeout=REQUEST_TIMEOUT)

# Status output
if response.status_code in [200, 201]:
    print("✅ age_query_mapping.csv pushed to GitHub!")
else:
    # The error body is not guaranteed to be JSON; fall back to the raw text.
    try:
        detail = response.json()
    except ValueError:
        detail = response.text
    print("❌ Failed to push:", detail)


🔐 Enter your GitHub token: ··········
✅ age_query_mapping.csv pushed to GitHub!


In [15]:
import pandas as pd

# Load the age-to-query mapping from the CSV file written by the mapping cell above.
mapping_df = pd.read_csv("age_query_mapping.csv")


In [16]:
#Get queries for user age
import random

def get_queries_for_age(age, n_queries=2):
    """Randomly pick search queries whose age range contains ``age``.

    Reads the module-level ``mapping_df`` (columns ``age_min``, ``age_max``,
    ``query``).

    Args:
        age: Age to match against the inclusive ``age_min``..``age_max`` range.
        n_queries: Maximum number of queries to return.

    Returns:
        A list of at most ``n_queries`` distinct query strings; fewer (possibly
        none) when not enough mapping rows cover ``age``.
    """
    covers_age = mapping_df['age_min'].le(age) & mapping_df['age_max'].ge(age)
    candidate_queries = mapping_df.loc[covers_age, 'query'].tolist()
    sample_size = min(n_queries, len(candidate_queries))
    return random.sample(candidate_queries, k=sample_size)


In [17]:
#Fetch YouTube Videos for Each Query
def get_videos_for_age(age):
    """Collect video metadata for the queries selected for ``age``.

    Each query from ``get_queries_for_age(age)`` is announced with a print,
    searched (10 results), and the metadata of the hits is appended to one
    flat list.

    Returns:
        A list of metadata dicts, in query order.
    """
    collected = []
    for q in get_queries_for_age(age):
        print(f"🔎 Searching for: {q}")
        collected += get_video_metadata(search_youtube_videos(q, max_results=10))
    return collected


In [18]:
#Recommend Top N Videos
def recommend_for_age(age, top_n=5):
    """Recommend the most-viewed videos found for a child's age.

    Args:
        age: Child's age, forwarded to ``get_videos_for_age``.
        top_n: Maximum number of rows to return.

    Returns:
        A DataFrame with the columns ``title``, ``channel_title``,
        ``view_count`` and ``published_at``, sorted by ``view_count``
        descending. It is empty (but keeps those columns) when no videos were
        found, e.g. for an age no mapping row covers.
    """
    result_columns = ['title', 'channel_title', 'view_count', 'published_at']

    videos = get_videos_for_age(age)
    if not videos:
        # An empty frame has no 'view_count' column; indexing it would raise KeyError.
        return pd.DataFrame(columns=result_columns)

    ranked = (
        pd.DataFrame(videos)
        # Two different queries can return the same video; recommend it once.
        .drop_duplicates(subset='video_id')
        # Example ranking: sort by view_count (counts may arrive as strings).
        .assign(view_count=lambda frame: frame['view_count'].astype(int))
        .sort_values(by="view_count", ascending=False)
        .head(top_n)
    )
    return ranked[result_columns]


In [19]:
# int() raises ValueError if the entered text is not a whole number.
user_age = int(input("Enter child’s age: "))
recommendations = recommend_for_age(user_age)

print("🎉 Top Recommendations:")
print(recommendations)


Enter child’s age: 6
🔎 Searching for: science for kids age 6 to 8
🔎 Searching for: math games for kids
🎉 Top Recommendations:
                                                title  \
7                                    This is SO cool!   
13  "Math Whiz!" Addition Song  /// Danny Go! Kids...   
17  Counting 1 to 5! 🔵 Cartoon Maths fun for Kids ...   
10  Addition and Subtraction with Dinosaurs - Math...   
1   What Is A Virus ? | Best Learning Videos For K...   

                channel_title  view_count          published_at  
7                     DaveHax  1006556174  2023-02-02T12:43:06Z  
13                  Danny Go!    39454380  2022-10-10T11:00:33Z  
17               Numberblocks    19896912  2024-01-03T11:00:37Z  
10  Smile and Learn - English    12086524  2020-06-25T15:30:03Z  
1               Peekaboo Kidz    10963260  2017-12-29T12:37:52Z  


# Enhancing the recommendation system to provide recommendations based on the parent's input

In [20]:
# Modify Query Filter System

def get_queries_for_age_and_subject(age, subject=None, n_queries=2):
    """Randomly pick search queries for an age, optionally narrowed by subject.

    Reads the module-level ``mapping_df`` (columns ``age_min``, ``age_max``,
    ``category``, ``query``).

    Args:
        age: Age to match against the inclusive ``age_min``..``age_max`` range.
        subject: Optional text matched as a case-insensitive literal substring
            of ``category``. ``None`` or blank text means no subject filter.
        n_queries: Maximum number of queries to return.

    Returns:
        A list of at most ``n_queries`` distinct query strings. If the subject
        filter leaves nothing, a warning is printed and the age-only matches
        are used instead.
    """
    age_matches = mapping_df[
        (mapping_df['age_min'] <= age) &
        (mapping_df['age_max'] >= age)
    ]
    subset = age_matches

    # The subject usually comes from input(); ignore surrounding whitespace.
    subject = subject.strip() if subject else subject
    if subject:
        # regex=False: typed text such as "c++" or "(" must not be parsed as a
        # regular expression. na=False: rows without a category never match.
        subject_mask = age_matches['category'].str.lower().str.contains(
            subject.lower(), regex=False, na=False
        )
        subset = age_matches[subject_mask]

    if subset.empty:
        print("⚠️ No queries found for this subject at this age. Showing general results.")
        subset = age_matches

    return random.sample(subset['query'].tolist(), k=min(n_queries, len(subset)))


In [21]:
# Updating Recommendation Function
def recommend_for_age_and_subject(age, subject=None, top_n=5):
    """Recommend the most-viewed videos for an age and an optional subject.

    Args:
        age: Child's age used to select search queries.
        subject: Optional preferred subject, forwarded to
            ``get_queries_for_age_and_subject``.
        top_n: Maximum number of rows to return.

    Returns:
        A DataFrame of video metadata sorted by ``view_count`` descending. It
        is empty (but keeps the metadata columns) when no videos were found.
    """
    # Columns produced by get_video_metadata; used to shape the empty result so
    # callers can still select columns from it.
    metadata_columns = [
        'video_id', 'title', 'description', 'channel_title', 'tags',
        'published_at', 'view_count', 'like_count', 'comment_count',
    ]

    queries = get_queries_for_age_and_subject(age, subject)
    all_videos = []

    for q in queries:
        print(f"🔎 Searching: {q}")
        video_ids = search_youtube_videos(q, max_results=10)
        all_videos.extend(get_video_metadata(video_ids))

    if not all_videos:
        # An empty frame has no 'view_count' column; indexing it would raise KeyError.
        return pd.DataFrame(columns=metadata_columns)

    return (
        pd.DataFrame(all_videos)
        # Two different queries can return the same video; recommend it once.
        .drop_duplicates(subset='video_id')
        .assign(view_count=lambda frame: frame['view_count'].astype(int))
        .sort_values(by="view_count", ascending=False)
        .head(top_n)
    )


In [23]:
# int() raises ValueError if the entered text is not a whole number.
age = int(input("Enter child’s age: "))
# The subject is matched case-insensitively against the mapping's category column.
subject = input("Enter a preferred subject (e.g., Science, Geography, Math): ")

recommendations = recommend_for_age_and_subject(age, subject)
# Last expression of the cell: shown as the cell's table output.
recommendations[['title', 'channel_title', 'view_count']]

Enter child’s age: 6
Enter a preferred subject (e.g., Science, Geography, Math): Science
🔎 Searching: science for kids age 6 to 8


Unnamed: 0,title,channel_title,view_count
7,This is SO cool!,DaveHax,1006556174
1,What Is A Virus ? | Best Learning Videos For K...,Peekaboo Kidz,10963266
6,Digestive System | The Dr. Binocs Show | Learn...,Peekaboo Kidz,6697201
2,Human Body - Science for Kids - Rock 'N Learn,Rock 'N Learn,6273495
0,How Your Brain Works? - The Dr. Binocs Show | ...,Peekaboo Kidz,3894910


## Phase 2 - Enhance existing popularity-based logic to make it smarter and more tailored to children

In [24]:
import pandas as pd
import random
from datetime import datetime, timedelta

# Sample data
titles = ["ABC Song", "Counting Stars", "Fairy Tale", "Learn Colors", "Dinosaur Facts"]
tags_list = [["music", "abc"], ["math", "numbers"], ["story", "fairy"], ["colors", "learning"], ["science", "dinosaurs"]]
content_types = ["music", "educational", "storytelling"]
age_groups = ["3-5", "6-8"]

# A dedicated, seeded generator makes the synthetic dataset (and every score
# derived from it) reproducible across kernel restarts, without reseeding the
# global `random` module that other cells sample from.
SYNTHETIC_SEED = 42
rng = random.Random(SYNTHETIC_SEED)

# Build 100 synthetic video records; title, tags, age group and content type
# are each drawn independently of one another.
data = []
for i in range(100):
    title = rng.choice(titles)
    tags = rng.choice(tags_list)
    view_count = rng.randint(1000, 100000)
    like_count = rng.randint(100, 10000)
    watch_time = rng.randint(10, 300)
    # Upload dates are relative to the day the cell runs (0-365 days back).
    upload_date = datetime.now() - timedelta(days=rng.randint(0, 365))
    age_group = rng.choice(age_groups)
    content_type = rng.choice(content_types)
    data.append({
        "video_id": f"vid_{i}",
        "title": title,
        "tags": ", ".join(tags),  # stored as one comma-separated string
        "view_count": view_count,
        "like_count": like_count,
        "watch_time_minutes": watch_time,
        "upload_date": upload_date.date(),
        "age_group": age_group,
        "content_type": content_type
    })

df = pd.DataFrame(data)
df.to_csv("synthetic_kids_videos.csv", index=False)



### Compute an engagement score that combines views, likes, and watch time

In [25]:
def compute_engagement_score(df, view_weight=0.4, like_weight=0.3, watch_weight=0.3):
    """Add max-normalised engagement columns and a weighted score to ``df``.

    The frame is modified in place and also returned. Columns written:
    ``views_norm``, ``likes_norm``, ``watch_time_norm`` and
    ``engagement_score``.

    Args:
        df: DataFrame with numeric ``view_count``, ``like_count`` and
            ``watch_time_minutes`` columns.
        view_weight: Weight of the normalised view count.
        like_weight: Weight of the normalised like count.
        watch_weight: Weight of the normalised watch time.

    Returns:
        The same ``df`` object with the four columns added.
    """
    def _scaled_by_max(column):
        # Divide by the column maximum. A maximum of zero would turn every
        # value into 0/0 = NaN and poison the score, so report 0.0 instead.
        peak = column.max()
        if peak == 0:
            return column * 0.0
        return column / peak

    # Normalize individual columns
    df["views_norm"] = _scaled_by_max(df["view_count"])
    df["likes_norm"] = _scaled_by_max(df["like_count"])
    df["watch_time_norm"] = _scaled_by_max(df["watch_time_minutes"])

    # Weighted score
    df["engagement_score"] = (
        view_weight * df["views_norm"] +
        like_weight * df["likes_norm"] +
        watch_weight * df["watch_time_norm"]
    )

    return df

# Adds views_norm, likes_norm, watch_time_norm and engagement_score to the synthetic df.
df = compute_engagement_score(df)

Boost recent videos — newer videos will get a bonus score.


In [26]:
from datetime import datetime

def add_recentness_boost(df, weight=0.2, today=None):
    """Add a recency bonus on top of ``engagement_score``.

    The frame is modified in place and also returned. ``upload_date`` is
    converted to datetime, and the columns ``days_old``, ``recency_score``
    (``1 / (1 + days_old)``) and ``final_score``
    (``engagement_score + weight * recency_score``) are written.

    Args:
        df: DataFrame with ``upload_date`` and ``engagement_score`` columns.
        weight: Multiplier applied to ``recency_score``.
        today: Optional reference date (anything ``pd.to_datetime`` accepts).
            Defaults to the current local date.

    Returns:
        The same ``df`` object with the columns added.
    """
    reference_day = (pd.Timestamp.today() if today is None else pd.to_datetime(today)).normalize()
    df["upload_date"] = pd.to_datetime(df["upload_date"])  # Ensure datetime format
    # Dates after the reference day would give a negative age: -1 makes the
    # score 1/0 and anything lower makes it negative. Treat them as 0 days old.
    df["days_old"] = (reference_day - df["upload_date"]).dt.days.clip(lower=0)
    df["recency_score"] = 1 / (1 + df["days_old"])
    df["final_score"] = df["engagement_score"] + weight * df["recency_score"]
    return df

# Adds days_old, recency_score and final_score to df (upload_date becomes datetime).
df = add_recentness_boost(df)


In [28]:
## Filter by kids profile
# NOTE(review): the synthetic dataset only generates the content types "music",
# "educational" and "storytelling", so "math" never matches a row — confirm
# which content type was intended here.
kid_profile = {
    "age_group": "3-5",
    "preferred_types": ["math", "music"]
}

def filter_for_kid(df, profile):
    """Return the rows of ``df`` that fit a kid profile.

    Args:
        df: DataFrame with ``age_group`` and ``content_type`` columns.
        profile: Dict with ``age_group`` (value to match exactly) and
            ``preferred_types`` (accepted ``content_type`` values).

    Returns:
        The rows whose age group equals the profile's and whose content type
        is one of the preferred types; original index labels are kept.
    """
    right_age = df["age_group"].eq(profile["age_group"])
    liked_type = df["content_type"].isin(profile["preferred_types"])
    return df.loc[right_age & liked_type]

# Keep only the rows matching the kid's age group and preferred content types.
df_kid_filtered = filter_for_kid(df, kid_profile)
df_kid_filtered.shape


(14, 16)

In [29]:
## Recommend Top-N videos
def recommend_top_n(df_filtered, n=5):
    """Return the ``n`` rows of ``df_filtered`` with the highest ``final_score``.

    Rows come back ordered by ``final_score`` descending and keep their
    original index labels.
    """
    ranked = df_filtered.sort_values(by="final_score", ascending=False)
    return ranked.iloc[:n]

# Rank the profile-filtered rows and show their titles with the final scores.
top_recommendations = recommend_top_n(df_kid_filtered)
print(top_recommendations[["title", "final_score"]])


             title  final_score
37    Learn Colors     0.643885
93  Counting Stars     0.642373
25    Learn Colors     0.608692
14      Fairy Tale     0.584329
27  Dinosaur Facts     0.577550


## Summary
Built a composite score = views + likes + watch time + recency

Filtered content based on age and type

Delivered final Top-N recommendations with improved ranking

# Phase 2: Content-Based Similarity (TF-IDF)

In [30]:
## Goal: Recommend videos similar in title/tags to videos the child watched recently.

### Step 1: Choose a Reference Video (or Session History)
#### For now, manually pick 1–3 videos the child watched;
#### later this can come from session-history tracking.

# Simulating that the child last watched these videos:
watched_titles = ["ABC Song", "Dinosaur Facts"]

In [31]:

### Step 2: Prepare the Data for TF-IDF
#### We'll combine title + tags for each video to get a better representation.
# In the synthetic dataset, tags is already one comma-separated string per row,
# so plain string concatenation is enough.

df["text"] = df["title"] + " " + df["tags"]

In [32]:
### Step 3: Apply TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fit TF-IDF on the combined title+tags text; tfidf_matrix gets one row per
# entry of df["text"], in the same order.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df["text"])

In [37]:



### Step 4: Find Similar Videos to Watched Ones
import numpy as np

def get_similar_videos(df, watched_titles, tfidf_matrix, top_n=5):
    """Recommend unwatched videos whose text is closest to the watched ones.

    The TF-IDF rows of all videos whose title is in ``watched_titles`` are
    averaged into one profile vector, and every video is scored by cosine
    similarity to it. The scores are written to ``df["similarity_score"]``
    (``df`` is modified in place).

    Args:
        df: DataFrame with ``video_id``, ``title``, ``age_group`` and
            ``content_type`` columns; row *i* must correspond to row *i* of
            ``tfidf_matrix``.
        watched_titles: Titles the child has already watched.
        tfidf_matrix: TF-IDF matrix with one row per row of ``df``.
        top_n: Maximum number of recommendations.

    Returns:
        A DataFrame with ``video_id``, ``title``, ``similarity_score``,
        ``age_group`` and ``content_type`` for the best-scoring unwatched
        videos, or an empty DataFrame if no watched title is in ``df``.
    """
    watched_mask = df["title"].isin(watched_titles)
    # tfidf_matrix is addressed by row POSITION. Index labels only coincide with
    # positions for a default RangeIndex, so derive positions from the mask.
    watched_positions = np.flatnonzero(watched_mask.to_numpy())

    if watched_positions.size == 0:
        print("⚠️ No watched titles found in dataset.")
        return pd.DataFrame()

    # Compute the mean vector for watched videos. Averaging a sparse matrix
    # yields np.matrix, so convert to a plain 2-D ndarray for cosine_similarity.
    watched_vector = np.atleast_2d(np.asarray(tfidf_matrix[watched_positions].mean(axis=0)))

    similarity_scores = cosine_similarity(watched_vector, tfidf_matrix).flatten()

    df["similarity_score"] = similarity_scores
    recommended = df[~watched_mask].sort_values("similarity_score", ascending=False).head(top_n)
    return recommended[["video_id", "title", "similarity_score", "age_group", "content_type"]]


In [38]:
### Step 5: Call the Recommender
# The call also writes a similarity_score column into df, which the
# hybrid-scoring cells read later.
recommendations = get_similar_videos(df, watched_titles, tfidf_matrix, top_n=5)
print(recommendations)

### Personalized recommendations based on video content

### Can work even if view count is low

### Easy to combine with previous scoring later (hybrid)

   video_id         title  similarity_score age_group  content_type
8     vid_8  Learn Colors          0.324707       3-5  storytelling
18   vid_18  Learn Colors          0.324707       3-5  storytelling
54   vid_54  Learn Colors          0.324707       6-8         music
85   vid_85  Learn Colors          0.324707       6-8         music
88   vid_88  Learn Colors          0.324707       6-8         music


You benefit from both:

🔍 Content similarity: personalization (based on what the child watched)

🔥 Engagement score: popularity/quality (based on views, likes, etc.)

In [39]:
## Hybrid score design (stored in the hybrid_score column):
##   hybrid_score = α * similarity_score + β * engagement_score
##   α = 0.6 (weight for similarity)
##   β = 0.4 (weight for engagement)

## Step 1: Ensure both scores exist in your DataFrame
# Each lookup raises KeyError if its column is missing; only the last
# expression (similarity_score) is shown as the cell output.

df["engagement_score"]  # added by compute_engagement_score
df["similarity_score"]  # added by get_similar_videos



Unnamed: 0,similarity_score
0,0.598939
1,0.171698
2,0.673689
3,0.652914
4,0.117499
...,...
95,0.147030
96,0.203857
97,0.183929
98,0.673391


In [41]:
## Step 2: Compute the hybrid score
def compute_hybrid_score(df, alpha=0.6, beta=0.4):
    """Blend similarity and engagement into ``df["hybrid_score"]``.

    ``hybrid_score = alpha * similarity_score + beta * engagement_score``,
    with missing scores counted as 0. The frame is modified in place and
    also returned.

    Args:
        df: DataFrame with ``similarity_score`` and ``engagement_score``.
        alpha: Weight of the similarity score.
        beta: Weight of the engagement score.

    Returns:
        The same ``df`` object with the ``hybrid_score`` column added.
    """
    similarity_part = df["similarity_score"].fillna(0).mul(alpha)
    engagement_part = df["engagement_score"].fillna(0).mul(beta)
    df["hybrid_score"] = similarity_part.add(engagement_part)
    return df

# Adds the hybrid_score column to df using the default weights (alpha=0.6, beta=0.4).
df = compute_hybrid_score(df)


In [42]:
## Step 3: Recommend Top-N Based on Hybrid Score

def recommend_hybrid(df, watched_titles, top_n=5):
    """Return the ``top_n`` unwatched videos ranked by ``hybrid_score``.

    Rows whose ``title`` is in ``watched_titles`` are dropped; the rest are
    sorted by ``hybrid_score`` descending and keep their index labels.
    """
    already_watched = df["title"].isin(watched_titles)
    unseen = df.loc[~already_watched]
    return unseen.sort_values(by="hybrid_score", ascending=False).iloc[:top_n]

# Top hybrid-ranked videos, excluding the titles the child has already watched.
hybrid_recommendations = recommend_hybrid(df, watched_titles)
print(hybrid_recommendations[["title", "hybrid_score", "similarity_score", "engagement_score"]])


           title  hybrid_score  similarity_score  engagement_score
54  Learn Colors      0.507043          0.324707          0.780546
88  Learn Colors      0.502732          0.324707          0.769769
18  Learn Colors      0.469199          0.324707          0.685936
85  Learn Colors      0.444650          0.324707          0.624564
92    Fairy Tale      0.421925          0.265253          0.656933
