In [None]:
## Trying to use Langchain to pull transcripts from a YouTube Channel, use a document loader
# To load them into a vectorizer, and then use the vectorizer to get a similarity score between
# the documents and a query.

# Importing the necessary libraries
import streamlit as st
import os
from dotenv import load_dotenv
load_dotenv()
import openai
from google.oauth2 import service_account
from langchain.document_loaders import GoogleApiClient, GoogleApiYoutubeLoader, YoutubeLoader
from pathlib import Path
from pyyoutube import Api
import pinecone

# Set your API key
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.organization = os.getenv("OPENAI_ORG")
api = Api(api_key=os.getenv("GOOGLE_KEY"))
pinecone.init(api_key = os.getenv("PINECONE_KEY"), environment=os.getenv("PINECONE_ENV"))

credentials = service_account.Credentials.from_service_account_info(st.secrets["gcp_service_account"])

#loader = YoutubeLoader.from_youtube_url(youtube_url="https://youtu.be/JYQyovXIqGU")
# Get the transcripts from the YouTube "no-till" channel
#youtube_loader_channel = GoogleApiYoutubeLoader(google_api_client=google_api_client, channel_name="notillgrowers",captions_language="en")

#videos = api.get_channel_info(channel_id='UCLhu5JoRWPgEGDoUFfQHTPQ', return_json=True)

  from tqdm.autonotebook import tqdm


In [None]:
videos_list = api.search_by_keywords(q='no-till farming')

In [None]:
# Get the channel IDs of the videos in the playlist
channel_ids = [video.snippet.channelId for video in videos_list.items]
print(channel_ids)

videos_dict = {}
for video in videos_list.items:
    videos_dict[video.snippet.title] = video.snippet
    

['UCLhu5JoRWPgEGDoUFfQHTPQ', 'UCLhu5JoRWPgEGDoUFfQHTPQ', 'UCNMZF39LBB-E1DIkxnD6cbA', 'UCjLmlBb3hAPB927mQmijXxw', 'UC7fbFHvVzaXui0_twnMkfdQ', 'UCwHc-v7mSl7O08pHcgCFjmg', 'UCdSzqQPK8_Qs616MRdlIruw', 'UCLhu5JoRWPgEGDoUFfQHTPQ', 'UCHynVrKVZtTXf3hndd2ZR4A', 'UCLhu5JoRWPgEGDoUFfQHTPQ', 'UCqmlN9TQnjEIfWqGb5lFDyA', 'UCRKrBNIf4pk-pT3WpEhIHcQ', 'UCLhu5JoRWPgEGDoUFfQHTPQ', 'UChaqnaouyz7CCOV_NYTEgJg', 'UCQVGf64a6nqKhgR9eKCUY4A', 'UCLhu5JoRWPgEGDoUFfQHTPQ', 'UCQLmgJIK_oah_05hM6nPvRQ', 'UC3111rvadtBPUY9JJBqdmzg', 'UChVVAZfswcfA2CDbTZawuXQ', 'UCHs2EsV7uH8hWX3JZy3trAQ', 'UCLhu5JoRWPgEGDoUFfQHTPQ', 'UCwHc-v7mSl7O08pHcgCFjmg', 'UCHynVrKVZtTXf3hndd2ZR4A', 'UCm1LeeTs-xy3z-sQVs3b7EA', 'UC3WNMbluZ33z-mTgOwG2JSg']


In [None]:
channel_ids

['UCLhu5JoRWPgEGDoUFfQHTPQ',
 'UCLhu5JoRWPgEGDoUFfQHTPQ',
 'UCLzfA1TEGFy-1anN20nznqQ',
 'UCwHc-v7mSl7O08pHcgCFjmg',
 'UCkVCHw3i7x9YVLPPrbNsYMA',
 'UCjLmlBb3hAPB927mQmijXxw',
 'UCHynVrKVZtTXf3hndd2ZR4A',
 'UCdSzqQPK8_Qs616MRdlIruw',
 'UCNMZF39LBB-E1DIkxnD6cbA',
 'UCLzfA1TEGFy-1anN20nznqQ',
 'UCWsI0LmiDyezbnN2JCL4P9w',
 'UCjIaKojPkN70UhR2F1QoLyA',
 'UCLhu5JoRWPgEGDoUFfQHTPQ',
 'UCqmlN9TQnjEIfWqGb5lFDyA',
 'UCTZ9TogThNuyzvlZEnLNq0g',
 'UCLhu5JoRWPgEGDoUFfQHTPQ',
 'UCTRDu6ScmwL-3eitegMT_qQ',
 'UCLhu5JoRWPgEGDoUFfQHTPQ',
 'UCQLmgJIK_oah_05hM6nPvRQ',
 'UCLhu5JoRWPgEGDoUFfQHTPQ',
 'UC3111rvadtBPUY9JJBqdmzg',
 'UCm1LeeTs-xy3z-sQVs3b7EA',
 'UCw2il95DXcigSOEosqL8izA',
 'UC8NYD_o_fbtX9J4zMG0cgZg',
 'UChaqnaouyz7CCOV_NYTEgJg']

In [None]:
# Define a function to get transcripts from a YouTube channel
def get_channel_info(channel_id):
    # Get the transcripts from the videos in the channel
    channel_info = api.get_channel_info(channel_id=channel_id, return_json=True)

    return channel_info
    

        

channel_videos = {}


for channel in channel_ids:
    # Get the transcripts from the videos in the channel
    channel_videos[channel] = get_channel_info(channel_id=channel)
    print(channel_videos[channel])


{'kind': 'youtube#channelListResponse', 'etag': 'UDO_AGd9H1YZ4kSAeuMWkGR1JOI', 'pageInfo': {'totalResults': 1, 'resultsPerPage': 5}, 'items': [{'kind': 'youtube#channel', 'etag': '0Y1KuXsYG8GQUt5nEjeAP-KKCD0', 'id': 'UCLhu5JoRWPgEGDoUFfQHTPQ', 'snippet': {'title': 'No-Till Growers', 'description': 'Notillgrowers.com', 'customUrl': '@notillgrowers', 'publishedAt': '2016-09-15T13:08:20Z', 'thumbnails': {'default': {'url': 'https://yt3.ggpht.com/ytc/AGIKgqP8YsPZZcTsXn3wl9hfG_Ou9TXyTT992AVHUaZkUA=s88-c-k-c0x00ffffff-no-rj', 'width': 88, 'height': 88}, 'medium': {'url': 'https://yt3.ggpht.com/ytc/AGIKgqP8YsPZZcTsXn3wl9hfG_Ou9TXyTT992AVHUaZkUA=s240-c-k-c0x00ffffff-no-rj', 'width': 240, 'height': 240}, 'high': {'url': 'https://yt3.ggpht.com/ytc/AGIKgqP8YsPZZcTsXn3wl9hfG_Ou9TXyTT992AVHUaZkUA=s800-c-k-c0x00ffffff-no-rj', 'width': 800, 'height': 800}}, 'localized': {'title': 'No-Till Growers', 'description': 'Notillgrowers.com'}, 'country': 'US'}, 'contentDetails': {'relatedPlaylists': {'likes':

In [None]:
playlist_list = []
for key in channel_videos.keys():
    for video in channel_videos[key]['items']:
        playlist_list.append(video['contentDetails']['relatedPlaylists']['uploads'])


In [None]:
playlist_list

['UULhu5JoRWPgEGDoUFfQHTPQ',
 'UUNMZF39LBB-E1DIkxnD6cbA',
 'UUjLmlBb3hAPB927mQmijXxw',
 'UU7fbFHvVzaXui0_twnMkfdQ',
 'UUwHc-v7mSl7O08pHcgCFjmg',
 'UUdSzqQPK8_Qs616MRdlIruw',
 'UUHynVrKVZtTXf3hndd2ZR4A',
 'UUqmlN9TQnjEIfWqGb5lFDyA',
 'UURKrBNIf4pk-pT3WpEhIHcQ',
 'UUhaqnaouyz7CCOV_NYTEgJg',
 'UUQVGf64a6nqKhgR9eKCUY4A',
 'UUQLmgJIK_oah_05hM6nPvRQ',
 'UU3111rvadtBPUY9JJBqdmzg',
 'UUhVVAZfswcfA2CDbTZawuXQ',
 'UUHs2EsV7uH8hWX3JZy3trAQ',
 'UUm1LeeTs-xy3z-sQVs3b7EA',
 'UU3WNMbluZ33z-mTgOwG2JSg']

In [None]:
video_list = [()]

for play_list in playlist_list:
    playlist_items = api.get_playlist_items(playlist_id=play_list, return_json=True)
    for item in playlist_items['items'][0:5]:
        channelTitle = item['snippet']['channelTitle']
        video_id = item['contentDetails']['videoId']
        # Append the tuple to the list
        video_list.append((channelTitle, video_id))

        

In [None]:
video_list

[(),
 ('No-Till Growers', 'JYQyovXIqGU'),
 ('No-Till Growers', 'heTxEsrPVdQ'),
 ('No-Till Growers', 'XQMJK9UYOF4'),
 ('No-Till Growers', 'Ct3CL22RpTg'),
 ('No-Till Growers', 'FvbsB1U-in4'),
 ('AgPhD', 'UTB72G_cE_g'),
 ('AgPhD', 'S-wIdl_3EC0'),
 ('AgPhD', 'wNUUuaPy2iI'),
 ('AgPhD', 'F24gHjcKPgI'),
 ('AgPhD', 'TI4GnIFl6HU'),
 ('University of Kentucky Martin-Gatton', 'ug2L35cM-s8'),
 ('University of Kentucky Martin-Gatton', 'lr4Isc3pv68'),
 ('University of Kentucky Martin-Gatton', 'FhE1_7OnfdE'),
 ('University of Kentucky Martin-Gatton', 'MTXRwf3-s1Y'),
 ('University of Kentucky Martin-Gatton', 'esBrr6xOw-I'),
 ('No-Till Farmer', '-OuiU9EtdFQ'),
 ('No-Till Farmer', 'WzQHE8s81u8'),
 ('No-Till Farmer', 'fjQHOS6e7qY'),
 ('No-Till Farmer', 'NlWRH3WTM6s'),
 ('No-Till Farmer', 'hOXOMkEO9_A'),
 ('Univ of Wisconsin Integrated Pest and Crop Management', '_gYEAPLYs8o'),
 ('Univ of Wisconsin Integrated Pest and Crop Management', '2O3ucHj0HJ8'),
 ('Univ of Wisconsin Integrated Pest and Crop Managemen

In [None]:
# Use the YouTubeLoader to get the transcripts from the videos in the playlist
# Add the channelTitle to the metadata
docs_dict = {}
for video in video_list:
    try:
        loader = YoutubeLoader.from_youtube_url(youtube_url=f"https://youtu.be/{video[1]}")
        # Check to see if the docs_dict has a key for the channelTitle, and if so, append the new docs
        if video[0] in docs_dict.keys():
            docs_dict[video[0]].extend(loader.load_and_split())
        else:
            docs_dict[video[0]] = loader.load_and_split()
    except:
        pass

In [None]:
len(docs)

262

In [None]:
for doc in docs:
    print(doc)

page_content="hey nerds farmer Jesse here so it's one thing to grow vegetables in an entirely other thing to get them ready for market so in today's video we're going to go over the keys to a good wash pack station sometimes called a pack house sometimes called a wash head sometimes called wash Vegas nearly 8 billion people in the world somebody calls it wash Vegas no matter the name there are some key elements each wash station should possess and some that may not be as necessary and at the end of this video I will take two crops from field to Market so you can see what that process roughly looks like per usual I'll share some of our mistakes uh our deficiencies as they pertain to washing and packing so let's do it foreign there are very few crops on our farm that come out of the field just ready for Market most produce need some amount of washing wiping packing cooling uh maybe just one of the above but maybe all of them the goal however is not to spend more time on this task than yo

In [None]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone



embed = OpenAIEmbeddings(openai_api_key = os.getenv("OPENAI_API_KEY"), openai_organization = os.getenv("OPENAI_ORG"))
res = embed.embed_documents(docs)
len(res), len(res[0])

TypeError: expected string or buffer

In [None]:
from langchain.indexes import VectorstoreIndexCreator
index = VectorstoreIndexCreator.from_documents(documents=docs)

Using embedded DuckDB without persistence: data will be transient


In [None]:
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(model_name = 'gpt-3.5-turbo', verbose=True)



In [None]:
from langchain.vectorstores import Pinecone
# Create a pinecone index and add the documents to it
embeddings = OpenAIEmbeddings()
docsearch = Pinecone.from_documents(documents=docs, index_name="no-till-farming", embedding=embeddings)


Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised APIConnectionError: Error communicating with OpenAI: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None)).


In [None]:
index = pinecone.Index("no-till-farming")
vectorstore = Pinecone(index, embed.embed_query, '')

In [None]:
from langchain.chains import RetrievalQA

qa = RetrievalQA.from_chain_type(llm = llm, chain_type = 'stuff', retriever = vectorstore.as_retriever())

In [None]:
query = "What does it mean when my tomato plant has yellow leaves?"
qa.run(query)

Found document with no `` key. Skipping.
Found document with no `` key. Skipping.
Found document with no `` key. Skipping.
Found document with no `` key. Skipping.


"Yellow leaves on tomato plants can be caused by several factors including overwatering, underwatering, nutrient deficiencies, pests or diseases. It's important to properly diagnose the cause in order to treat the issue effectively."

In [None]:
qa.run(query)