**Keyword extraction from videos, sourced either from YouTube or from a local device. Lists the extracted keywords.**

In [None]:
!pip install SpeechRecognition
import speech_recognition as sr
from moviepy.editor import VideoFileClip

def extract_audio(video_file_path, output_audio_path):
    """Extract the audio track of a video and save it as a 16-bit PCM WAV file.

    Args:
        video_file_path: Path of the video file to read.
        output_audio_path: Path the WAV audio is written to.

    Returns:
        ``output_audio_path`` unchanged, so the call can be passed straight
        into a function that consumes the audio file.

    Raises:
        ValueError: If the video has no audio track.
    """
    video = VideoFileClip(video_file_path)
    try:
        audio = video.audio
        # A clip without an audio track exposes ``audio`` as None; fail with
        # a clear message instead of an AttributeError on the next line.
        if audio is None:
            raise ValueError(f"No audio track found in {video_file_path!r}")

        # 'pcm_s16le' writes uncompressed 16-bit PCM samples into the WAV file.
        audio.write_audiofile(output_audio_path, codec='pcm_s16le')
    finally:
        # Release the clip's resources even when extraction fails.
        video.close()

    return output_audio_path


# Define the function to transcribe audio
def transcribe_audio(audio_file_path):
    """Transcribe an audio file to text.

    The whole file is read through ``sr.AudioFile`` and passed to
    ``Recognizer.recognize_google``.

    Args:
        audio_file_path: Path of the audio file to transcribe.

    Returns:
        The value returned by ``recognize_google`` for the recorded audio.
    """
    speech_recognizer = sr.Recognizer()
    with sr.AudioFile(audio_file_path) as source:
        # record() with no duration argument captures the source to its end.
        recorded = speech_recognizer.record(source)
        return speech_recognizer.recognize_google(recorded)


# Filename suffixes of the clips to process:
#   /content/a.mp4, /content/a1.mp4, ..., /content/a6.mp4
# Each clip's audio is written to the matching /content/h<suffix>.wav.
VIDEO_SUFFIXES = ["", "1", "2", "3", "4", "5", "6"]

# Rebuilt from scratch on every run so re-executing the cell does not
# append duplicate transcripts.
transcript = []

for suffix in VIDEO_SUFFIXES:
    # Extract the audio track to WAV, then transcribe that WAV file.
    wav_path = extract_audio(f'/content/a{suffix}.mp4', f'/content/h{suffix}.wav')
    transcript.append(transcribe_audio(wav_path))


Collecting SpeechRecognition
  Downloading SpeechRecognition-3.10.3-py2.py3-none-any.whl (32.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.8/32.8 MB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.10.3
MoviePy - Writing audio in /content/h.wav




MoviePy - Done.
MoviePy - Writing audio in /content/h1.wav




MoviePy - Done.
MoviePy - Writing audio in /content/h2.wav




MoviePy - Done.
MoviePy - Writing audio in /content/h3.wav




MoviePy - Done.
MoviePy - Writing audio in /content/h4.wav




MoviePy - Done.
MoviePy - Writing audio in /content/h5.wav




MoviePy - Done.
MoviePy - Writing audio in /content/h6.wav




MoviePy - Done.


In [None]:
# Inspect the collected transcripts (one entry per processed video).
transcript

['the human voice warm up your voice actually do these first arms up deep breath in now very good finally let me just put this in contacts',
 "fail big today's the beginning of the rest of your life so do what you feel passionate about take chances don't be afraid to fail there's an old IQ test was nine dots and you had to draw five lines with a pencil Within These nine dots without lifting the pencil the only way to do it was to go outside the box don't be afraid to think outside the box don't be afraid to fail big to dream big but remember dreams without goals I just dreams and they ultimately fuel disappointment so have dreams about and understand that to achieve these goals you must apply discipline and consistency",
 "the biggest poison in US is Regret but I think there are a lot of people that have dreams and aspirations of things that they always wanted to do but it wasn't the right time or they didn't have enough money or they didn't have enough experience it's never the right 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
# Fetch the NLTK resources used below: WordNet data for the lemmatizer,
# the stop-word lists, and the tokenizer models.
for nltk_resource in ('wordnet', 'stopwords', 'punkt'):
    nltk.download(nltk_resource)

# --- Step 1: clean each transcript ---
# Lowercase, tokenize, then keep only alphanumeric tokens that are not
# English stop words; the surviving tokens are re-joined with single spaces.
stop_words = set(stopwords.words('english'))
preprocessed_transcripts = [
    " ".join(
        token
        for token in word_tokenize(raw_text.lower())
        if token.isalnum() and token not in stop_words
    )
    for raw_text in transcript
]

# --- Step 2: lemmatize every token of each cleaned transcript ---
wordnet_lemmatizer = WordNetLemmatizer()
lemmatized_transcripts = []
for cleaned_doc in preprocessed_transcripts:
    lemmas = [wordnet_lemmatizer.lemmatize(token) for token in word_tokenize(cleaned_doc)]
    lemmatized_transcripts.append(' '.join(lemmas))

# --- Step 3: TF-IDF over the lemmatized transcripts ---
tfidf_vectorizer = TfidfVectorizer()
tfidf_sparse = tfidf_vectorizer.fit_transform(lemmatized_transcripts)
tfidf_matrix = tfidf_sparse.toarray()  # dense array for easy inspection

# The vectorizer's vocabulary terms are the candidate keywords.
feature_names = tfidf_vectorizer.get_feature_names_out()

# Report the dimensions of the dense TF-IDF matrix.
print("TF-IDF Matrix Shape:", tfidf_matrix.shape)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


TF-IDF Matrix Shape: (7, 223)


In [None]:
# Keywords: the vocabulary terms reported by the fitted TF-IDF vectorizer
# (the array returned by get_feature_names_out()).
print(feature_names)

['18' '2009' '24' '24th' '44' '50' '70' '777' 'able' 'achieve' 'achieved'
 'achieving' 'actually' 'afraid' 'along' 'always' 'another' 'apply' 'arm'
 'around' 'art' 'ask' 'aspiration' 'attention' 'audience' 'back' 'balloon'
 'beginning' 'believed' 'believing' 'big' 'biggest' 'born' 'box' 'breath'
 'broke' 'buffett' 'ca' 'called' 'came' 'care' 'caring' 'chance' 'click'
 'come' 'comparing' 'consistency' 'contact' 'could' 'dance' 'day' 'deep'
 'dependent' 'difference' 'disappointment' 'discipline' 'done' 'dot'
 'draw' 'dream' 'else' 'enough' 'everyone' 'excited' 'expensive'
 'experience' 'facebook' 'fail' 'feel' 'final' 'finally' 'first' 'five'
 'fuel' 'future' 'get' 'go' 'goal' 'god' 'gone' 'good' 'got' 'great'
 'hard' 'hear' 'honest' 'hour' 'human' 'idea' 'instead' 'internet'
 'invention' 'iq' 'jobless' 'joke' 'key' 'kid' 'know' 'labored' 'last'
 'later' 'let' 'life' 'lifetime' 'lifting' 'like' 'line' 'listen' 'little'
 'living' 'look' 'lot' 'make' 'may' 'maybe' 'mean' 'minute' 'money'
 

In [None]:
# Dense TF-IDF matrix produced by fit_transform(...).toarray():
# one row per transcript, one column per vocabulary term.
tfidf_matrix

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.21951545,
        0.        ],
       [0.        , 0.        , 0.40728439, ..., 0.        , 0.        ,
        0.        ],
       [0.12392316, 0.12392316, 0.        , ..., 0.12392316, 0.        ,
        0.        ]])

In [None]:
# Inverse-document-frequency weights learned by the vectorizer, one per vocabulary term.
tfidf_vectorizer.idf_

array([2.38629436, 2.38629436, 2.38629436, 2.38629436, 2.38629436,
       2.38629436, 2.38629436, 2.38629436, 1.98082925, 2.38629436,
       2.38629436, 2.38629436, 1.98082925, 2.38629436, 2.38629436,
       1.98082925, 2.38629436, 2.38629436, 2.38629436, 2.38629436,
       2.38629436, 2.38629436, 2.38629436, 2.38629436, 2.38629436,
       1.98082925, 2.38629436, 2.38629436, 2.38629436, 2.38629436,
       2.38629436, 2.38629436, 2.38629436, 2.38629436, 2.38629436,
       2.38629436, 2.38629436, 2.38629436, 2.38629436, 2.38629436,
       1.98082925, 2.38629436, 2.38629436, 2.38629436, 2.38629436,
       2.38629436, 2.38629436, 2.38629436, 1.69314718, 2.38629436,
       1.98082925, 2.38629436, 2.38629436, 2.38629436, 2.38629436,
       2.38629436, 2.38629436, 2.38629436, 2.38629436, 1.98082925,
       2.38629436, 2.38629436, 2.38629436, 2.38629436, 2.38629436,
       2.38629436, 2.38629436, 2.38629436, 2.38629436, 2.38629436,
       1.98082925, 2.38629436, 2.38629436, 2.38629436, 2.38629