In [44]:
import ast
import json

import pandas as pd
from dotenv import load_dotenv
from fastapi import APIRouter
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage
from pydantic import BaseModel
from pytube import YouTube
from pytube import extract
from youtube_transcript_api import YouTubeTranscriptApi
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI API key used by ChatOpenAI
# comes from -- confirm against the deployment setup.
load_dotenv()


def yt_transcribe(video_url):
    """
    Look up a YouTube video and return its uploader, upload date and caption text.

    :param video_url: URL of the YouTube video
    :return: tuple ``(author, publish_date, transcript)`` where ``transcript`` is
             the text of every caption segment, each one preceded by a single
             space (so a non-empty transcript starts with a space)
    """
    video = YouTube(video_url)
    channel_name = video.author
    published_on = video.publish_date

    # The transcript API is addressed by video id rather than by full URL.
    captions = YouTubeTranscriptApi.get_transcript(extract.video_id(video_url))

    # Prefix (not separate) each segment with a space: this keeps the leading
    # space that callers of this function have always received.
    full_text = "".join(" " + segment["text"] for segment in captions)
    return channel_name, published_on, full_text


def _parse_model_output(raw_text):
    """Turn the model's textual reply into plain Python data without executing it."""
    text = raw_text.strip()
    if text.startswith("```"):
        # Chat models often wrap structured output in a Markdown code fence
        # (optionally tagged, e.g. ```json). Drop the opening fence line and
        # the closing fence before parsing.
        text = text.split("\n", 1)[1] if "\n" in text else ""
        text = text.rsplit("```", 1)[0].strip()

    # The prompt asks for JSON, so try that first (handles null/true/false).
    try:
        return json.loads(text)
    except ValueError:
        pass

    # The prompt's example uses single-quoted Python-style literals, which the
    # model tends to imitate; literal_eval accepts those but, unlike eval,
    # refuses anything that is not a pure literal.
    try:
        return ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError) as exc:
        raise ValueError(
            "Model response is neither valid JSON nor a Python literal: %r" % raw_text
        ) from exc


def graph_extractor(transcript):
    """
    Ask GPT-4 to pull stock picks out of a YouTube transcript.

    The model is prompted to answer with the keys ``ticker``, ``recommendation``
    and ``target_achieve_duration``, each holding a list; the reply is parsed and
    loaded into a DataFrame as-is (one column per key the model returned).

    :param transcript: <str> transcript text sent to the model as the user message
    :return: df built from the parsed model reply
    :raises ValueError: if the reply cannot be parsed as JSON or as a Python literal
    """
    chat = ChatOpenAI(temperature=0, model_name='gpt-4', request_timeout=120)
    # NOTE: the indentation of the continuation lines below is part of the prompt
    # text sent to the model; keep it unchanged unless a prompt change is intended.
    messages = [
        SystemMessage(content='''You are a Financial Analyst. Your task is to identify the company stock (ticker) a Financial Influencer is talking about \
                        and his recommendation to buy or sell stock if specifically mentioned. If he is talking about duration in which stock will get mentioned target get that as well.\
                        Report the response in JSON format with keys ticker, recommendation, target_achieve_duration. Just save ticker like sbin not state bank of india. \
                        example:\
                                'ticker': ['TCS', 'ITC', 'HCL'],
                                'recommendation': ['not mentioned', 'buy', 'sell'],
                                'target_achieve_duration': ['not mentioned', '12 month', '1 year']'''
                            ),
        HumanMessage(content=transcript)
    ]
    response = chat(messages)

    # SECURITY: this used to be eval(response.content). The reply is shaped by
    # an arbitrary third-party transcript, so evaluating it as code would let a
    # crafted video run commands on this machine. Parse it as data only.
    formatted_output = _parse_model_output(response.content)

    df = pd.DataFrame(formatted_output)
    return df

In [59]:
video_urls = [
    'https://www.youtube.com/watch?v=5hhyZtBRwNQ',
    'https://www.youtube.com/watch?v=UjcaOJ6rzPU',
    'https://www.youtube.com/watch?v=9K3MXPc7L3E',
    'https://www.youtube.com/watch?v=ObrTHa7OAC8',
    'https://www.youtube.com/watch?v=SSzGJCupXKU',
    'https://www.youtube.com/watch?v=alJ1iSyqzPU',
    'https://www.youtube.com/watch?v=UUJIgcb0QMU',
    'https://www.youtube.com/watch?v=jbNi7aXEcJk',
    'https://www.youtube.com/watch?v=h76vSBw3p_8',
]

# Build one frame of extracted picks per video, stamping every row with the
# uploader and the upload date returned by yt_transcribe.
out_df = []
for vid in video_urls:
    author, publish_date, transcript = yt_transcribe(vid)
    df = graph_extractor(transcript)
    df['Author'] = author
    df['Publish Date'] = publish_date
    out_df.append(df)

save_out = pd.concat(out_df)

# A pick with no explicit recommendation is recorded as a buy.
save_out['recommendation'] = save_out['recommendation'].apply(
    lambda rec: rec if rec != 'not mentioned' else 'buy'
)

# Buy picks without a stated horizon get a default of '3 years'. This runs after
# the step above, so picks that were just defaulted to 'buy' are included.
save_out['target_achieve_duration'] = save_out.apply(
    lambda row: '3 years'
    if row['target_achieve_duration'] == 'not mentioned' and row['recommendation'] == 'buy'
    else row['target_achieve_duration'],
    axis=1,
)

save_out.to_csv('graph_extractor.csv', index=False)