In [None]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline


# Pick the accelerator and a matching precision: half precision on GPU, full precision on CPU
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

# Speech-to-text model: load the Whisper checkpoint and move it to the chosen device
model_id = "openai/whisper-large-v3-turbo"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

# The processor supplies both the tokenizer and the feature extractor for the pipeline
processor = AutoProcessor.from_pretrained(model_id)

# `pipe` is the speech-recognition pipeline used later to transcribe audio files
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

# Use a pipeline as a high-level helper
from transformers import pipeline

# `pipe1` is the text sentiment classifier, placed on the same device as the ASR model
pipe1 = pipeline(
    "text-classification",
    model="cardiffnlp/twitter-roberta-base-sentiment",
    device=device,
)

In [None]:
!pip install lennon

In [None]:
import torch
from PIL import Image
from lennon import LeNNon
from torchvision import transforms
from huggingface_hub import hf_hub_download

# Download the PyTorch checkpoint; hf_hub_download returns the path of the local copy
weights_path = hf_hub_download(
    repo_id="AiresPucrs/LeNNon-Smile-Detector",
    filename="LeNNon-Smile-Detector.pt",
    local_dir="./",
    repo_type="model",
)

# Download the source implementation of the model's architecture
hf_hub_download(
    repo_id="AiresPucrs/LeNNon-Smile-Detector",
    filename="lennon.py",
    local_dir="./",
    repo_type="model",
)

# Check if a GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model and pass it to the proper device.
# - map_location lets a checkpoint that was saved on a GPU be loaded on a CPU-only machine.
# - weights_only=False is required because the file holds a whole pickled model object
#   (it is used directly as a module below), which newer PyTorch refuses to load by default.
# SECURITY: unpickling executes arbitrary code; only load checkpoints from a source you trust.
model = torch.load(weights_path, map_location=device, weights_only=False)
model = model.to(device)
model.eval()

# This `transform` object will transform our test images into proper tensors
transform = transforms.Compose([
    transforms.Resize((100, 100)),  # Resize the image to 100x100
    transforms.ToTensor(),
])

image_path = "/kaggle/input/smiledetection/datasets/train_folder/0/file2163.jpg"

# Open and preprocess the image
# NOTE(review): the tensor is passed without an explicit batch dimension and without forcing
# RGB mode — presumably the model handles that input shape; confirm against lennon.py.
image = Image.open(image_path)
tensor = transform(image)
tensor = tensor.to(device)

# Forward pass through the model (no gradients needed for inference)
with torch.no_grad():
    outputs = model(tensor)

# Get the class prediction: index of the largest output along dimension 1
_, predicted = torch.max(outputs, 1)

print("Smiling" if predicted.item() > 0 else "Not Smiling")


In [None]:
from IPython.display import Audio
import librosa
import matplotlib.pyplot as plt
import numpy as np

# Audio clip analysed in this cell and in the feature-extraction cell
audio_path = '/kaggle/input/indian-speech/IEAD/IEAD/angry-340.wav'

# Read the waveform and its sampling rate
y, sr = librosa.load(audio_path)

# Short-Time Fourier Transform of the waveform
S = librosa.stft(y)

# Magnitudes expressed in decibels relative to the loudest bin
D = librosa.amplitude_to_db(np.abs(S), ref=np.max)

# Average the decibel values over axis 0, giving one value per STFT frame
mean_db = D.mean(axis=0)

# Time stamp (in seconds) of every frame
times = librosa.times_like(mean_db, sr=sr)

# Draw the per-frame average level against time
plt.figure(figsize=(14, 5))
plt.plot(times, mean_db, color='b')
plt.xlabel('Time (s)')
plt.ylabel('Decibels (dB)')
plt.title('Average Decibel Levels Over Time')
plt.grid()
plt.xlim(0, times.max())  # x-axis ends at the last frame time
plt.ylim(mean_db.min() - 5, mean_db.max() + 5)  # 5 dB of padding below and above the curve
plt.show()



In [None]:
import librosa
import numpy as np
import nltk
from collections import Counter

# Ensure the tokenizer and part-of-speech tagger data are downloaded.
# Recent NLTK releases look up 'punkt_tab' and 'averaged_perceptron_tagger_eng', while older
# releases look up 'punkt' and 'averaged_perceptron_tagger'. Fetching both generations keeps
# nltk.word_tokenize / nltk.pos_tag working regardless of the installed NLTK version.
for _nltk_resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(_nltk_resource)

# Filler expressions counted by extract_audio_features. Matching is case-insensitive and
# multi-word entries are matched against runs of consecutive tokens.
_FILLER_WORDS = (
    'uh', 'um', 'like', 'you know', 'ah', 'er', 'hmm', 'well', 'so',
    'I mean', 'okay', 'right', 'actually', 'basically', 'you see',
    'sort of', 'kind of', 'yeah', 'literally', 'just', 'I guess',
    'totally', 'honestly', 'seriously', 'alright',
)


def _count_filler_words(tokens, fillers=_FILLER_WORDS):
    """Count how often any filler (one or more words long) occurs in a token list."""
    lowered = [token.lower() for token in tokens]
    total = 0
    for filler in fillers:
        parts = filler.lower().split()
        width = len(parts)
        # Slide a window of the filler's length over the tokens and count exact matches.
        total += sum(
            1
            for start in range(len(lowered) - width + 1)
            if lowered[start:start + width] == parts
        )
    return total


def _pause_seconds(y, sr, top_db):
    """Return the number of seconds of the signal that lie outside the non-silent intervals."""
    # librosa.effects.split returns the NON-silent intervals as (start, end) sample indices.
    non_silent = librosa.effects.split(y, top_db=top_db)
    non_silent_samples = sum(int(end - start) for start, end in non_silent)
    return max(len(y) - non_silent_samples, 0) / sr


def extract_audio_features(audio_path, transcript, silence_top_db=40):
    """Compute acoustic and linguistic features for one recording and its transcript.

    Relies on the module-level names ``librosa``, ``np``, ``nltk`` and on the
    sentiment pipeline ``pipe1`` created in an earlier cell.

    Args:
        audio_path: Path of the audio file, loaded with ``librosa.load``.
        transcript: Text of what is spoken in the recording.
        silence_top_db: How many decibels below the reference level a stretch of
            audio must be to count as silence (passed to ``librosa.effects.split``).

    Returns:
        A dict with the keys ``transcript``, ``sentiment`` (raw output of ``pipe1``),
        ``sound_intensity``, ``fundamental_frequency``, ``spectral_energy``,
        ``spectral_centroid``, ``zero_crossing_rate``, ``avg_words_per_minute``,
        ``avg_unique_words_per_minute``, ``unique_word_count``,
        ``filler_words_per_minute``, ``noun_count``, ``adjective_count``,
        ``verb_count`` and ``pause_rate`` (seconds of silence per minute of audio).

    Raises:
        ValueError: If the audio file contains no samples, since every per-minute
            rate would otherwise divide by zero.
    """
    # Load the audio file
    y, sr = librosa.load(audio_path)
    if len(y) == 0:
        raise ValueError(f"No audio samples could be read from {audio_path!r}")

    # Sound intensity: mean RMS value over all frames
    sound_intensity = np.mean(librosa.feature.rms(y=y))

    # Fundamental frequency (F0): mean over the frames where pyin found a pitch.
    # pyin marks unvoiced frames with NaN; if every frame is NaN the result is NaN.
    f0, _, _ = librosa.pyin(y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
    fundamental_frequency = np.nanmean(f0) if np.any(~np.isnan(f0)) else float("nan")

    # Spectral energy: squared STFT magnitudes summed per frame, then averaged over frames
    S = np.abs(librosa.stft(y))
    spectral_energy = np.mean(np.sum(S ** 2, axis=0))

    # Spectral centroid, averaged over frames
    avg_spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))

    # Zero-crossing rate, averaged over frames
    zero_crossing_rate = np.mean(librosa.feature.zero_crossing_rate(y))

    # Pause detection: silence is whatever remains after removing the non-silent intervals
    total_duration = librosa.get_duration(y=y, sr=sr)
    pause_duration = _pause_seconds(y, sr, silence_top_db)

    # Pause rate: seconds of silence per minute of audio
    pause_rate = (pause_duration / total_duration) * 60

    # Process the transcript
    # NOTE(review): the tokens presumably include punctuation marks, which would inflate the
    # word counts below — confirm and filter if pure word counts are wanted.
    words = nltk.word_tokenize(transcript)
    num_words = len(words)
    unique_word_count = len(set(words))

    # Per-minute speaking rates
    duration_minutes = total_duration / 60
    avg_words_per_minute = num_words / duration_minutes
    avg_unique_words_per_minute = unique_word_count / duration_minutes

    # Number of filler words (e.g., "um", "you know") per minute
    filler_words_per_minute = _count_filler_words(words) / duration_minutes

    # Nouns, adjectives and verbs, selected by the prefix of their POS tag
    pos_tags = nltk.pos_tag(words)
    nouns = [word for word, pos in pos_tags if pos.startswith('NN')]
    adjectives = [word for word, pos in pos_tags if pos.startswith('JJ')]
    verbs = [word for word, pos in pos_tags if pos.startswith('VB')]

    # Sentiment of the transcript from the text-classification pipeline
    sentiment = pipe1(transcript)

    print("Nouns: ", nouns)
    print("Adjectives: ", adjectives)
    print("Verbs: ", verbs)

    return {
        "transcript": transcript,
        "sentiment": sentiment,
        "sound_intensity": sound_intensity,
        "fundamental_frequency": fundamental_frequency,
        "spectral_energy": spectral_energy,
        "spectral_centroid": avg_spectral_centroid,
        "zero_crossing_rate": zero_crossing_rate,
        "avg_words_per_minute": avg_words_per_minute,
        "avg_unique_words_per_minute": avg_unique_words_per_minute,
        "unique_word_count": unique_word_count,
        "filler_words_per_minute": filler_words_per_minute,
        "noun_count": len(nouns),
        "adjective_count": len(adjectives),
        "verb_count": len(verbs),
        "pause_rate": pause_rate
    }
# Transcribe the clip with the speech-recognition pipeline, then compute its features
asr_output = pipe(audio_path)
transcript = asr_output['text']
features = extract_audio_features(audio_path=audio_path, transcript=transcript)


In [None]:
# Sentiment label legend (label id - meaning):
#   0 - negative
#   1 - neutral
#   2 - positive
# NOTE(review): presumably these are the class ids reported by the sentiment classifier
# bound to `pipe1` — confirm against that model's documentation.
# Display the feature dictionary produced by extract_audio_features
features

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt


# Configure the word cloud, then fill it from the transcript text
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    max_font_size=100,  # upper bound on the font size
    min_font_size=2,    # lower bound on the font size, so small words can still be drawn
    scale=10,           # scale factor passed to WordCloud (originally chosen for sharper rendering)
    max_words=50,
)
wordcloud.generate(transcript)

# Render the cloud as an image with matplotlib, without axes
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
!pip install transformers accelerate

In [None]:
import os

# Never store credentials in the notebook: read the Gemini API key from the environment.
# NOTE: the key that used to be hard-coded on this line has been exposed and should be revoked.
api_key = os.environ.get("GOOGLE_API_KEY")

In [5]:
import google.generativeai as genai
import os
from PIL import Image
import base64

# Read the API key from the environment instead of embedding it in the notebook.
# NOTE: the key that used to be hard-coded here has been exposed and should be revoked.
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable before running this cell.")
genai.configure(api_key=api_key)

# Choose a Gemini model
model = genai.GenerativeModel(model_name="gemini-1.5-flash")

# Helper that turns an image file into base64 text
def encode_image(image_path):
    """Read the file at ``image_path`` in binary mode and return its bytes as a base64 string."""
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return str(encoded_bytes, "utf-8")

# Image path (provide your own image file path)
image_path = "/kaggle/input/smiledetection/datasets/test_folder/0/file3401.jpg"
# Base64 text of the image file's bytes.
# NOTE(review): the generate_content call later in this cell passes only `prompt`, so this
# value is not sent to the model in the code shown here.
encoded_image = encode_image(image_path)

# Define the text prompt sent to Gemini.
# The string has no replacement fields, so the f-prefix has no effect: every feature value
# below is literal text rather than being read from the `features` dictionary.
# The prompt is text only; `encoded_image` is not referenced in it.
prompt = f"""
Get a score for this out of 100 for this linguistic score 
 Audio Analysis
 Transcript: Hello, I am Benesh. How are you? It's nice to meet you. It's very hot here and I am
 feeling very sleepy now.
 Sentiment: Positive (Score: 0.7489377856254578)
 Video Duration:23.49
 Sound Intensity: 0.011787964962422848
 Fundamental Frequency: 207.35014327738318
 Spectral Energy: 485.7623291015625
 Spectral Centroid: 2120.992258329127
 Zero Crossing Rate: 0.1996791294642857
 Average Words per Minute: -1
 Average Unique Words per Minute: -1
 Unique Word Count: 19
 Filler Words per Minute: 0.0
 Noun Count: 2
 Adjective Count: 3
 Verb Count: 8
 Pause Rate: 0.0
Question:
'Can you tell me about a time when you had to work as part of a team to achieve a goal? What was your role, and what was the outcome?'

Transcript:
Sure! During my final year at university, I worked on a group project to design an eco-friendly water purification system. The team consisted of five members, each with different expertise. I took on the role of project coordinator, ensuring that everyone understood their tasks and deadlines. Communication was key—we had regular meetings to discuss progress and challenges.

One challenge we faced was that our initial prototype didn’t meet the efficiency standards we aimed for. I suggested we divide into smaller groups to tackle specific issues, such as material selection and filtration techniques. This approach allowed us to solve problems more efficiently.

In the end, we delivered a functional system that exceeded expectations and even won an award for innovation. I learned the importance of listening to team members, adapting to setbacks, and staying organized.
"""


# Send the prompt to the Gemini model
response = model.generate_content(prompt)

# Show the text of the reply
reply_text = response.text
print("Gemini's Response:", reply_text)


Gemini's Response: To score this linguistic performance out of 100, we need to establish criteria.  Since we have both an initial short audio sample and a response to a complex question, we'll weight them differently.

**Scoring Breakdown:**

* **Audio Sample (30%):** This assesses basic linguistic competence – pronunciation, grammar, and fluency.
    * **Fluency & Pronunciation (15%):**  The audio is short, making a precise assessment difficult.  Assuming natural, understandable speech, we'll give a score of 12/15.
    * **Grammar (10%):**  The short sample is grammatically correct. 10/10.
    * **Vocabulary (5%):** Simple vocabulary used appropriately.  5/5.

* **Response to Question (70%):** This assesses more advanced skills –  narrative structure, vocabulary, clarity, relevance, and professionalism.
    * **Structure & Coherence (20%):** The response is well-structured, with a clear beginning, middle, and end.  18/20.
    * **Vocabulary & Grammar (20%):** Uses more advanced vocabu

In [4]:
!pip install moviepy

Collecting moviepy
  Downloading moviepy-1.0.3.tar.gz (388 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.3/388.3 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting decorator<5.0,>=4.0.2 (from moviepy)
  Downloading decorator-4.4.2-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting proglog<=1.0.0 (from moviepy)
  Downloading proglog-0.1.10-py3-none-any.whl.metadata (639 bytes)
Collecting imageio_ffmpeg>=0.2.0 (from moviepy)
  Downloading imageio_ffmpeg-0.5.1-py3-none-manylinux2010_x86_64.whl.metadata (1.6 kB)
Downloading decorator-4.4.2-py2.py3-none-any.whl (9.2 kB)
Downloading imageio_ffmpeg-0.5.1-py3-none-manylinux2010_x86_64.whl (26.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.9/26.9 MB[0m [31m63.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading proglog-0.1.10-py3-none-any.whl (6.1 kB)
Building wheels for collected packages: moviepy

In [52]:
from moviepy.editor import VideoFileClip
from transformers import pipeline
import numpy as np
import cv2
from PIL import Image

from time import time

import torch  # imported here so the device check works even if this cell is run on its own

# Initialize the model pipeline on the first GPU when one is available, otherwise on the CPU (-1)
# NOTE(review): this rebinds `pipe`, which the first cell defines as the speech-recognition
# pipeline and the feature-extraction cell calls with an audio path; re-running that cell
# after this one would send audio to the image classifier. Consider a distinct name.
pipe = pipeline(
    "image-classification",
    model="dima806/facial_emotions_image_detection",
    device=0 if torch.cuda.is_available() else -1,
)

# Video path
video_path = "/kaggle/input/sample-vids/ganesh_sample.mp4"

# Load the video file using moviepy
# NOTE(review): the clip is left open; call video_clip.close() once no later cell needs it.
video_clip = VideoFileClip(video_path)

# Collect the frames as PIL images. moviepy yields RGB uint8 arrays, which PIL accepts
# directly, so no colour-space conversion is needed.
# You can change `fps` to control how often frames are extracted.
frames_pil = [
    Image.fromarray(frame)
    for frame in video_clip.iter_frames(fps=30, dtype="uint8")
]

t = time()
results = pipe(frames_pil)

# For every frame keep the label of the emotion with the highest score
top_emotions = [
    max(frame_results, key=lambda x: x['score'])['label']
    for frame_results in results
]
print(time()-t)  # seconds spent on classification

NameError: name 'gemini_model' is not defined

12.618387699127197


In [51]:
# Tally how many frames received each top emotion label
# (`Counter` is imported from `collections` in the feature-extraction cell)
dict(Counter(top_emotions))

{'neutral': 569, 'angry': 92, 'fear': 39, 'sad': 5}

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer

import os

model_name = "google/codegemma-7b"

# This is a gated repository: the recorded run failed with a 401 because the request was not
# authenticated. Access must be granted on the model page, and a Hugging Face access token
# supplied. It is read from the environment so it is never stored in the notebook; when the
# variable is unset, None is passed and the library's default authentication is used.
hf_token = os.environ.get("HF_TOKEN")

tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(model_name, token=hf_token)

# Complete a function signature with up to 50 newly generated tokens
prompt = "def add_numbers(a, b):"
inputs = tokenizer(prompt, return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=50)
generated_code = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_code)


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/google/codegemma-7b.
401 Client Error. (Request ID: Root=1-673b3f70-52b55b3129e9b12d747b6f86;bf3793d8-1e03-49ee-b85b-b5a1e975d502)

Cannot access gated repo for url https://huggingface.co/google/codegemma-7b/resolve/main/config.json.
Access to model google/codegemma-7b is restricted. You must have access to it and be authenticated to access it. Please log in.