## Downsampling

In [None]:
import pandas as pd

# Settings shared by every downsampled table
SAMPLE_SIZE = 2000
RANDOM_STATE = 42

# Load the raw tables
queries_df = pd.read_csv('queries.csv')
all_results_df = pd.read_csv('all_results.csv')
unique_results_df = pd.read_csv('uniqueResults.csv')
all_Top10_SERP_MM_df = pd.read_csv('all_Top10_SERP-MM.csv')

# Draw a reproducible random sample of SAMPLE_SIZE rows from each table
all_results_sample = all_results_df.sample(
    n=SAMPLE_SIZE,
    random_state=RANDOM_STATE,
)
unique_results_sample = unique_results_df.sample(
    n=SAMPLE_SIZE,
    random_state=RANDOM_STATE,
)
# NOTE(review): this table is sampled WITH replacement, so rows can repeat --
# presumably because it may hold fewer than SAMPLE_SIZE rows; confirm.
all_Top10_SERP_MM_sample = all_Top10_SERP_MM_df.sample(
    n=SAMPLE_SIZE,
    random_state=RANDOM_STATE,
    replace=True,
)

# Persist the samples (without the index column) for the later sections
all_results_sample.to_csv('all_results_sample.csv', index=False)
unique_results_sample.to_csv('uniqueResults_sample.csv', index=False)
all_Top10_SERP_MM_sample.to_csv('all_Top10_SERP_MM_sample.csv', index=False)

In [None]:
# Report how many columns (fields) each downsampled DataFrame has
field_reports = [
    ("Number of fields in all_results_sample:", all_results_sample),
    ("Number of fields in unique_results_sample:", unique_results_sample),
    ("Number of fields in all_Top10_SERP_MM_sample:", all_Top10_SERP_MM_sample),
]
for label, frame in field_reports:
    print(label, len(frame.columns))


Number of fields in all_results_sample: 17
Number of fields in unique_results_sample: 15
Number of fields in all_Top10_SERP_MM_sample: 12


In [None]:
# Report how many rows (samples) each downsampled DataFrame has
sample_reports = [
    ("Number of samples in all_results_sample:", all_results_sample),
    ("Number of samples in unique_results_sample:", unique_results_sample),
    ("Number of samples in all_Top10_SERP_MM_sample:", all_Top10_SERP_MM_sample),
]
for label, frame in sample_reports:
    print(label, len(frame))


Number of samples in all_results_sample: 2000
Number of samples in unique_results_sample: 2000
Number of samples in all_Top10_SERP_MM_sample: 2000


## Preprocessing

For each video in the downsampled results, look up its thumbnail URL with the YouTube Data API and download the image.



In [4]:
!pip install requests



In [5]:
import os
from urllib.parse import parse_qs, urlparse

import pandas as pd
import requests

In [14]:
def extract_video_id(url):
    """Return the value of the ``v`` query parameter of a YouTube watch URL.

    Only the ``v`` parameter itself is returned, so additional parameters
    (``&t=30s``, ``&list=...``) or a ``#fragment`` never leak into the ID.

    Args:
        url: The video URL. Non-string values (for example a NaN read from a
            CSV) are treated as unrecognized.

    Returns:
        The video ID string, or None (after printing a notice) when the URL
        carries no non-empty ``v`` parameter.
    """
    if isinstance(url, str):
        # parse_qs maps every query key to a list of values; blank values are dropped
        values = parse_qs(urlparse(url).query).get('v')
        if values:
            return values[0]
    print(f"URL format not recognized: {url}")
    return None

def download_thumbnail(url, output_dir, video_id, timeout=10):
    """Download an image and store it as ``<output_dir>/<video_id>.jpg``.

    Args:
        url: Address of the thumbnail image to fetch.
        output_dir: Directory the image file is written into.
        video_id: Identifier used as the file name (without extension).
        timeout: Seconds to wait for the server before giving up; without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        The path of the written file, or None when the request failed or the
        server did not answer with HTTP 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Network problems are reported instead of aborting the caller
        print(f"Failed to download thumbnail from {url}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to download thumbnail from {url}")
        return None

    thumbnail_filename = os.path.join(output_dir, f"{video_id}.jpg")
    with open(thumbnail_filename, 'wb') as f:
        f.write(response.content)
    return thumbnail_filename


In [7]:
# Load the downsampled results table from 'all_results_sample.csv'
all_results_sample_df = pd.read_csv('all_results_sample.csv')

In [8]:
# Folder that receives the downloaded thumbnail images
output_dir = 'thumbnails'
# exist_ok=True keeps this cell re-runnable when the folder already exists
os.makedirs(output_dir, exist_ok=True)

In [15]:
# Look up every sampled video through the YouTube Data API and save its thumbnail.
# The API key is taken from the environment so no credential lives in the notebook.
api_key = os.environ.get("YOUTUBE_API_KEY")
if not api_key:
    raise RuntimeError("Set the YOUTUBE_API_KEY environment variable before running this cell.")

api_endpoint = "https://www.googleapis.com/youtube/v3/videos"
request_timeout = 10  # seconds to wait for the API before giving up
processed_ids = set()  # video IDs already handled, to avoid repeated API calls

for youtube_url in all_results_sample_df['vid_url']:
    # A missing URL may arrive as a non-string value (e.g. NaN), so only strings are parsed
    video_id = extract_video_id(youtube_url) if isinstance(youtube_url, str) else None
    if not video_id:
        print(f"No video ID found in URL: {youtube_url}")
        continue
    if video_id in processed_ids:
        # The same video can occur in several rows; one lookup is enough
        continue
    processed_ids.add(video_id)

    try:
        # API request to get thumbnails; `params` takes care of URL encoding
        response = requests.get(
            api_endpoint,
            params={"id": video_id, "key": api_key, "part": "snippet"},
            timeout=request_timeout,
        )
        if response.status_code != 200:
            print(f"Failed to fetch video information from YouTube API for video ID {video_id}")
            continue

        items = response.json().get('items')
        if not items:
            print(f"No video information found for video ID {video_id}")
            continue

        # Prefer the medium thumbnail, fall back to the default one if it is absent
        thumbnails = items[0].get('snippet', {}).get('thumbnails', {})
        thumbnail = thumbnails.get('medium') or thumbnails.get('default')
        if not thumbnail or 'url' not in thumbnail:
            print(f"No thumbnail available for video ID {video_id}")
            continue

        # Downloading image
        thumbnail_filename = download_thumbnail(thumbnail['url'], output_dir, video_id)
        if thumbnail_filename:
            print(f"Thumbnail image downloaded for video ID {video_id}: {thumbnail_filename}")
    except (requests.RequestException, ValueError) as exc:
        # A network failure or an unparsable response affects only this video
        print(f"Request failed for video ID {video_id}: {exc}")

print("Thumbnail images downloaded for all videos.")


Thumbnail image downloaded for video ID VNqNnUJVcVs: thumbnails/VNqNnUJVcVs.jpg
No video information found for video ID LXKUHFV70hE
No video information found for video ID E96EPhqT-ds
Thumbnail image downloaded for video ID HioX-6Hm2J8: thumbnails/HioX-6Hm2J8.jpg
Thumbnail image downloaded for video ID GQpUFiXCRv4: thumbnails/GQpUFiXCRv4.jpg
Thumbnail image downloaded for video ID UT4cKK3nJmc: thumbnails/UT4cKK3nJmc.jpg
Thumbnail image downloaded for video ID ZJeR--9lSMU: thumbnails/ZJeR--9lSMU.jpg
Thumbnail image downloaded for video ID JDy95_eNPzM: thumbnails/JDy95_eNPzM.jpg
Thumbnail image downloaded for video ID 6_LPuUEkSQM: thumbnails/6_LPuUEkSQM.jpg
Thumbnail image downloaded for video ID Wm69ik_Qdb8: thumbnails/Wm69ik_Qdb8.jpg
Thumbnail image downloaded for video ID x3vcSab13Sk: thumbnails/x3vcSab13Sk.jpg
Thumbnail image downloaded for video ID VXvblzAs6vM: thumbnails/VXvblzAs6vM.jpg
Thumbnail image downloaded for video ID fktWJ0M80QE: thumbnails/fktWJ0M80QE.jpg
Thumbnail image 

In [16]:
# Shell command: recursively pack the thumbnails directory into thumbnails.zip
!zip -r thumbnails.zip thumbnails

  adding: thumbnails/ (stored 0%)
  adding: thumbnails/U2dcjFCvnmU.jpg (deflated 1%)
  adding: thumbnails/GS32pRTURdI.jpg (deflated 0%)
  adding: thumbnails/I4V8dLm4hEc.jpg (deflated 1%)
  adding: thumbnails/nN3qUXJp7l0.jpg (deflated 0%)
  adding: thumbnails/sNhhvQGsMEc.jpg (deflated 0%)
  adding: thumbnails/5pdPrQFjo2o.jpg (deflated 1%)
  adding: thumbnails/5VvXUXW7_GA.jpg (deflated 1%)
  adding: thumbnails/G0SpzIIHEaE.jpg (deflated 0%)
  adding: thumbnails/5bQzgrkI2j0.jpg (deflated 1%)
  adding: thumbnails/pAuhnBtBqXI.jpg (deflated 1%)
  adding: thumbnails/o1nYpfyfHOs.jpg (deflated 0%)
  adding: thumbnails/dWBYAxhH3u4.jpg (deflated 1%)
  adding: thumbnails/bqSvJHQRQZ8.jpg (deflated 0%)
  adding: thumbnails/svwlefBW7rk.jpg (deflated 1%)
  adding: thumbnails/MeHqsfgB2v8.jpg (deflated 5%)
  adding: thumbnails/M-rBSdQ_sMU.jpg (deflated 0%)
  adding: thumbnails/YI3tsmFsrOg.jpg (deflated 0%)
  adding: thumbnails/loL_iIWltI0.jpg (deflated 4%)
  adding: thumbnails/YSAz-Ot3Mco.jpg (deflated 0