## STT(Speach to text)

In [10]:
!pip3 install whisper-timestamped
!pip install torch
!pip install moviepy

Collecting whisper-timestamped
  Using cached whisper_timestamped-1.15.4-py3-none-any.whl.metadata (1.2 kB)
Collecting Cython (from whisper-timestamped)
  Using cached Cython-3.0.11-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting dtw-python (from whisper-timestamped)
  Downloading dtw_python-1.5.3-cp311-cp311-macosx_11_0_arm64.whl.metadata (48 kB)
Collecting openai-whisper (from whisper-timestamped)
  Downloading openai-whisper-20240927.tar.gz (800 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.0/800.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m[31m1.4 MB/s[0m eta [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting numba (from openai-whisper->whisper-timestamped)
  Downloading numba-0.60.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.7 kB)
Collecting more-itertools (from openai-whisper->whisper-time

In [11]:
import whisper_timestamped as whisper     # For convert to text, STT (Speach to text)
import torch                              # For whisper
from moviepy.editor import VideoFileClip  # For convert MP4 to MP3
import uuid                               # For generate id to user video data
import pickle                             # Save user video data on file
import os                                 # Work with file

Importing the dtw module. When using in academic works please cite:
  T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.
  J. Stat. Soft., doi:10.18637/jss.v031.i07.



In [12]:
class STT:
    def __init__(self, modelType="base"):
        self.modelType = modelType
        self.batch_size = 16 # reduce if low on GPU mem
        self.divice = "cuda" if torch.cuda.is_available() else "cpu"
        self.compute_type = "float16" if torch.cuda.is_available() else "int8"
        self.model = whisper.load_model(self.modelType, device=self.divice)

    def convertMP3ToText(self, pathToMP3: str) -> (str, str):
        audio = whisper.load_audio(pathToMP3)
        result = whisper.transcribe(self.model, audio, language="ru")

        text: str = result["text"]
        word_data: list[dict] = list()
        for segment in result["segments"]:
            for word in segment["words"]:
                word_data.append({
                    "text": word["text"],
                    "startTime": word["start"] * 1000,
                    "endTime": word["end"] * 1000
                })
        return text, word_data

    def saveData(self, text: str, word_data: str) -> str:
        absPath = os.path.abspath("")
        
        # Saving data
        saveData = {
            "text": text,
            "word_data": word_data
        }
        
        # Setting save directory
        pathDict = f"{absPath}/user"
        if not os.path.exists(pathDict):
            os.makedirs(pathDict)

        # Create id user video
        _id = uuid.uuid4().hex
        while os.path.isfile(f"{pathDict}/{_id}.pickle"):
            _id = uuid.uuid4().hex
    
        # Save data in pickle
        with open(f"{pathDict}/{_id}.pickle", 'wb') as file:
            pickle.dump(saveData, file, protocol=pickle.HIGHEST_PROTOCOL)
        return _id

    def loadData(self, _id: str) -> (str, list[dict]):
        absPath = os.path.abspath("")
        pathDict = f"{absPath}/user"
        
        # Check created file 
        if not os.path.isfile(f"{pathDict}/{_id}.pickle"):
            print("File does not exist")
            return "", list()

        # Open file
        with open(f"{pathDict}/{_id}.pickle", 'rb') as file:
            data = pickle.load(file)

        return data["text"], data["word_data"]

    def convertMP3(self, pathToMP3: str) -> str:
        text, word_data = self.convertMP3ToText(pathToMP3)
        _id = self.saveData(text, word_data)
        return _id

    def convertMP4(self, pahtToMP4: str) -> str:
        absPath = os.path.abspath("")
        pathTempMP3 = f"{absPath}/temp/mp3"
        fileName = os.path.splitext(os.path.basename(pahtToMP4))[0]
        tempFilePath = f"{pathTempMP3}/{fileName}.mp3"

        # Setting temp directory
        if not os.path.exists(pathTempMP3):
            os.makedirs(pathTempMP3)
        
        # Convert MP4 to MP3
        video = VideoFileClip(pahtToMP4)
        video.audio.write_audiofile(tempFilePath, logger=None)
        
        # Convert MP3 to data text
        _id = self.convertMP3(tempFilePath)

        # Delete temp file MP3
        if os.path.exists(tempFilePath):
            os.remove(tempFilePath)
        
        return _id

In [196]:
converter = STT("base")

In [95]:
test_audio = "/Users/odner/GitHub/hack20_09_27/viral_video/test_audio/test.mp3"

In [96]:
test_video = "/Users/odner/GitHub/hack20_09_27/viral_video/test/1c6bc481dd52a9938e78e755f1e5c90e.mp4"

In [198]:
converter.convertMP3(test_audio)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 874/874 [00:00<00:00, 1515.17frames/s]


'17edd14628964246ba22d61f7457b05a'

In [200]:
converter.loadData("17edd14628964246ba22d61f7457b05a")

(' Привет! Как дела? Hello! Бондиур!',
 [{'text': 'Привет!', 'startTime': 1600.0, 'endTime': 1960.0},
  {'text': 'Как', 'startTime': 2760.0, 'endTime': 3140.0},
  {'text': 'дела?', 'startTime': 3140.0, 'endTime': 3460.0},
  {'text': 'Hello!', 'startTime': 4700.0, 'endTime': 5000.0},
  {'text': 'Бондиур!', 'startTime': 7040.0, 'endTime': 7700.0}])

In [201]:
converter.convertMP4(test_video)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

'f47a8bc0994447f18fb6f47267275b3a'

In [202]:
text, wordData = converter.loadData("f47a8bc0994447f18fb6f47267275b3a")

In [33]:
del converter

## Класификация тональности

In [100]:
import numpy as np
from transformers import pipeline

In [203]:
class SentimentAnalysis:
    def __init__(self):
        self.classifier = pipeline('sentiment-analysis', model="cointegrated/rubert-tiny-sentiment-balanced")

    def _sentimentText(self, text: str) -> float:
        result = self.classifier(text)[0]
        # Преобразование результатов в числовую шкалу
        if result['label'] in ('positive', 'negative'):
            return result['score']  # Позитивные эмоции — от 0 до 1
        else:
            return 0.0  # Нейтральный тон = 0

    def sentimentWordData(self, wordData: list[dict], lenWords: int = 10) -> list[dict]:
        # Data scope with sentiment for ever word
        dataScope = np.zeros(len(wordData) + lenWords * 2)
        # Create zero value form start and end array
        zeroWordData = [{"text": ""} for idx in range(lenWords)]
        # New word data 
        wordData = zeroWordData + wordData + zeroWordData
    
        for idx in range(len(wordData) - lenWords):
            # Create text to sentiment score
            text = "".join([word["text"] for word in wordData[idx:idx+lenWords]])
            sentimentScore = self._sentimentText(text)
    
            # Add score
            dataScope[idx:idx+lenWords] += sentimentScore
        
        dataScope = dataScope / lenWords

        for idx in range(len(dataScope)):
            wordData[idx]["sentiment_analysis"] = dataScope[idx]
        
        return wordData[lenWords:(len(wordData) - lenWords)]

    def toFormant(slef, wordData: list[dict], shift:int = 3) -> list[list[list[int, int], float]]:
        data_format = list()
        flagIdx = -1
        for idx in range(len(wordData)):
            if (flagIdx == -1 and wordData[idx]["sentiment_analysis"] != 0):
                flagIdx = idx
            if (flagIdx != -1 and wordData[idx]["sentiment_analysis"] == 0):
                summ = 0
                for j in range(idx - 1 - flagIdx):
                    summ += wordData[flagIdx + j]["sentiment_analysis"]
                data_format.append([[int(wordData[flagIdx]["startTime"]), int(wordData[idx - 1]["endTime"]), summ / (idx - 1 - flagIdx)]])
                flagIdx = -1
        return data_format

In [204]:
sentimet_analysis = SentimentAnalysis()

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [205]:
wordData1 = sentimet_analysis.sentimentWordData(wordData)

In [206]:
wordData1

[{'text': 'Наташа,',
  'startTime': 140.0,
  'endTime': 420.0,
  'sentiment_analysis': 0.0},
 {'text': 'ты',
  'startTime': 480.0,
  'endTime': 560.0,
  'sentiment_analysis': 0.0},
 {'text': 'эту',
  'startTime': 560.0,
  'endTime': 640.0,
  'sentiment_analysis': 0.05435175895690918},
 {'text': 'кухню',
  'startTime': 640.0,
  'endTime': 900.0,
  'sentiment_analysis': 0.05435175895690918},
 {'text': 'видела?',
  'startTime': 900.0,
  'endTime': 1220.0,
  'sentiment_analysis': 0.05435175895690918},
 {'text': 'Это',
  'startTime': 1420.0,
  'endTime': 1620.0,
  'sentiment_analysis': 0.10411757826805115},
 {'text': '90.',
  'startTime': 1620.0,
  'endTime': 1960.0,
  'sentiment_analysis': 0.10411757826805115},
 {'text': 'Я',
  'startTime': 2460.0,
  'endTime': 2520.0,
  'sentiment_analysis': 0.10411757826805115},
 {'text': 'удивляюсь,',
  'startTime': 2520.0,
  'endTime': 2860.0,
  'sentiment_analysis': 0.1576152265071869},
 {'text': 'как',
  'startTime': 2960.0,
  'endTime': 2980.0,
  's

In [207]:
wordData2 = sentimet_analysis.toFormant(wordData1)

In [208]:
wordData2

[[[560, 6660, 0.14517034637300594]],
 [[7480, 15380, 0.19473947777467615]],
 [[17860, 25660, 0.32214486941695214]],
 [[34400, 53100, 0.22673235698179764]],
 [[53740, 56120, 0.050946450233459464]],
 [[56940, 72700, 0.3322503747124421]],
 [[76660, 82680, 0.10400329913411825]],
 [[88240, 91860, 0.0661762535572052]],
 [[96900, 106340, 0.33135736098995916]],
 [[106920, 116160, 0.31053196152051304]],
 [[119860, 135360, 0.37069333417471056]],
 [[141380, 144140, 0.059987014532089225]],
 [[147140, 215800, 0.44796047601764566]],
 [[292540, 302520, 0.3993522551129846]],
 [[302700, 309700, 0.16062722392380238]],
 [[313300, 356860, 0.325197084442429]],
 [[372320, 391220, 0.25050043882550416]],
 [[392080, 467860, 0.3095327623188496]],
 [[486500, 517840, 0.07698361426591874]],
 [[523580, 536580, 0.4747538900002837]]]

In [5]:
import torch
import numpy as np
from transformers import BertForSequenceClassification, AutoTokenizer

In [2]:
LABELS = ['neutral', 'happiness', 'sadness', 'enthusiasm', 'fear', 'anger', 'disgust']
tokenizer = AutoTokenizer.from_pretrained('Aniemore/rubert-tiny2-russian-emotion-detection')
model = BertForSequenceClassification.from_pretrained('Aniemore/rubert-tiny2-russian-emotion-detection')

tokenizer_config.json:   0%|          | 0.00/379 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.41M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/117M [00:00<?, ?B/s]

In [None]:
class EmotionAnalysis:
    def __init__(self):
        self._labels = ['neutral', 'happiness', 'sadness', 'enthusiasm', 'fear', 'anger', 'disgust']
        self.tokenizer = AutoTokenizer.from_pretrained('Aniemore/rubert-tiny2-russian-emotion-detection')
        self.model = BertForSequenceClassification.from_pretrained('Aniemore/rubert-tiny2-russian-emotion-detection')

    @torch.no_grad()
    def _analysis(self, text: str):
        inputs = self.tokenizer(text, max_length=512, padding=True, truncation=True, return_tensors='pt')
        outputs = self.model(**inputs)
        predicted = torch.nn.functional.softmax(outputs.logits, dim=1)
        
        predicted = torch.argmax(predicted, dim=1).numpy()
        return predicted

    def analysisWordData(self, wordDataFragments: list[dict]) -> list[dict]:
        text = ""
        for word in wordDataFragments:
            text = "".join(text, word["text"])

        textSentiments: list[str] = list()
        split_regex = re.compile(r'[.|!|?|…]')
        sentences = filter(lambda t: t, [t.strip() for t in split_regex.split(text)])
        for s in sentences:
            textSentiments.append(s)

        answer: list[dict] = list()
        for text in textSentiments:
            answer.appned({
                "text": text,
                "emootion": self._analysis(text)
            })
        return answer

In [6]:
def predict_emotions(text: str) -> list:
    """
        It takes a string of text, tokenizes it, feeds it to the model, and returns a dictionary of emotions and their
        probabilities
        :param text: The text you want to classify
        :type text: str
        :return: A dictionary of emotions and their probabilities.
    """
    inputs = tokenizer(text, max_length=512, padding=True, truncation=True, return_tensors='pt')
    outputs = model(**inputs)
    predicted = torch.nn.functional.softmax(outputs.logits, dim=1)
    emotions_list = {}
    for i in range(len(predicted.numpy()[0].tolist())):
        emotions_list[LABELS[i]] = predicted.numpy()[0].tolist()[i]
    return emotions_list

In [7]:
not_simple_prediction = predict_emotions("Какой же сегодня прекрасный день, братья")

In [8]:
not_simple_prediction

{'neutral': 0.0004941819934174418,
 'happiness': 0.9979524612426758,
 'sadness': 0.0002536596148274839,
 'enthusiasm': 0.000549814198166132,
 'fear': 0.00025326196919195354,
 'anger': 0.0003583927755244076,
 'disgust': 0.0001380780158797279}

In [12]:
import re

In [13]:
test = "testd. adsf . asd a. Some "

In [17]:
split_regex = re.compile(r'[.|!|?|…]')
sentences = filter(lambda t: t, [t.strip() for t in split_regex.split(test)])
for s in sentences:
    print(s)

testd
adsf
asd a
Some


In [18]:
lsentences)

[]