In [3]:
!pip install python-youtube mlflow pymystem3 nltk -q

In [1]:
import os
import logging
import yaml
from pyyoutube import Api
import json
import requests
import numpy as np
from nltk.corpus import stopwords
import pandas as pd
import re
import numpy as np
from pymystem3 import Mystem
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import SpectralClustering
from sklearn.metrics import f1_score, silhouette_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import mlflow
from mlflow.tracking import MlflowClient

In [2]:
def get_data(YOUTUBE_API_KEY, videoId, maxResults, nextPageToken):
    """
    –ü–æ–ª—É—á–µ–Ω–∏–µ –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏–∏ —Å–æ —Å—Ç—Ä–∞–Ω–∏—Ü—ã —Å –≤–∏–¥–µ–æ
    """
    YOUTUBE_URI = 'https://www.googleapis.com/youtube/v3/commentThreads?key={KEY}&textFormat=plainText&' + \
        'part=snippet&videoId={videoId}&maxResults={maxResults}&pageToken={nextPageToken}'
    format_youtube_uri = YOUTUBE_URI.format(KEY=YOUTUBE_API_KEY,
                                            videoId=videoId,
                                            maxResults=maxResults,
                                            nextPageToken=nextPageToken)
    content = requests.get(format_youtube_uri).text
    data = json.loads(content)
    return data


def get_text_of_comment(data):
    """
    –ü–æ–ª—É—á–µ–Ω–∏–µ –∫–æ–º–º–µ–Ω—Ç–∞—Ä–∏–µ–≤ –∏–∑ –ø–æ–ª—É—á–µ–Ω–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö –ø–æ–¥ –æ–¥–Ω–∏–º –≤–∏–¥–µ–æ
    """
    comms = set()
    for item in data['items']:
        comm = item['snippet']['topLevelComment']['snippet']['textDisplay']
        comms.add(comm)
    return comms


def get_all_comments(YOUTUBE_API_KEY, query, count_video=10, limit=30, maxResults=10, nextPageToken=''):
    """
    –í—ã–≥—Ä—É–∑–∫–∞ maxResults –∫–æ–º–º–µ–Ω—Ç–∞—Ä–∏–µ–≤
    """
    api = Api(api_key=YOUTUBE_API_KEY)
    video_by_keywords = api.search_by_keywords(q=query,
                                               search_type=["video"],
                                               count=count_video,
                                               limit=limit)
    videoId = [x.id.videoId for x in video_by_keywords.items]

    comments_all = []
    for id_video in videoId:
        try:
            data = get_data(YOUTUBE_API_KEY,
                            id_video,
                            maxResults=maxResults,
                            nextPageToken=nextPageToken)
            comment = list(get_text_of_comment(data))
            comments_all.append(comment)
        except:
            continue
    comments = sum(comments_all, [])
    return comments

In [3]:
config_path = os.path.join('../config/params_all.yaml')
config = yaml.safe_load(open(config_path))['train']

In [4]:
SEED = config['SEED']

In [5]:
config

{'SEED': 10,
 'clustering': {'affinity': 'cosine',
  'count_max_clusters': 15,
  'silhouette_metric': 'euclidean'},
 'comments': {'YOUTUBE_API_KEY': 'AIzaSyCfZBiaYvhsHf7NDHANcp6zFO57z0cX-L8',
  'count_video': 50,
  'limit': 30,
  'maxResults': 5,
  'nextPageToken': '',
  'query': 'Data Science'},
 'cross_val': {'test_size': 0.3},
 'dir_folder': '/Users/miracl6/airflow-mlflow-tutorial',
 'model': {'class_weight': 'balanced'},
 'model_lr': 'LogisticRegression',
 'model_vec': 'vector_tfidf',
 'name_experiment': 'my_third',
 'stopwords': 'russian',
 'tf_model': {'max_features': 300}}

In [6]:
comments = get_all_comments(**config['comments'])

In [7]:
comments[:15]

['@ApnaCollegeOfficial , Respected shraddha di please tell me about eligibility criteria to admission to your esteemed curriculum??',
 'Mam kya diploma mechanical wale student data science mei job le sakte hai kya',
 'So Difficult',
 'how we can create project?',
 'Is she software engineer???',
 'this explanation is really very easy to understand',
 'üëçüèª',
 "I came a long way i won't stop studying Medical school made me happy curing myself on my own is terrible satisfying inject yourself with the drug you made",
 'I have a test on this only',
 'Inject yourself...inject yourself',
 'The video is so helpful and supportive. Thank you so much',
 '"üî•Data Scientist Masters Program (Discount Code - YTBE15) - https://www.simplilearn.com/big-data-and-analytics/senior-data-scientist-masters-program-training?utm_campaign=X3paOmcrTjQ&utm_medium=Comments&utm_source=Youtube\nüî•IITK - Professional Certificate Course in Data Science (India Only) - https://www.simplilearn.com/iitk-professiona

In [8]:
def remove_emoji(string):
    """
    –£–¥–∞–ª–µ–Ω–∏–µ —ç–º–æ–¥–∂–∏ –∏–∑ —Ç–µ–∫—Å—Ç–∞
    """
    emoji_pattern = re.compile("["u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u'\U00010000-\U0010ffff'
                               u"\u200d"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\u3030"
                               u"\ufe0f"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)


def remove_links(string):
    """
    –£–¥–∞–ª–µ–Ω–∏–µ —Å—Å—ã–ª–æ–∫
    """
    string = re.sub(r'http\S+', '', string)  # remove http links
    string = re.sub(r'bit.ly/\S+', '', string)  # rempve bitly links
    string = re.sub(r'www\S+', '', string)  # rempve bitly links
    string = string.strip('[link]')  # remove [links]
    return string


def preprocessing(string, stopwords, stem):
    """
    –ü—Ä–æ—Å—Ç–æ–π –ø—Ä–µ–ø—Ä–æ—Ü–µ—Å—Å–∏–Ω–≥ —Ç–µ–∫—Å—Ç–∞, –æ—á–∏—Å—Ç–∫–∞, –ª–µ–º–∞—Ç–∏–∑–∞—Ü–∏—è, —É–¥–∞–ª–µ–Ω–∏–µ –∫–æ—Ä–æ—Ç–∫–∏—Ö —Å–ª–æ–≤
    """
    string = remove_emoji(string)
    string = remove_links(string)

    # —É–¥–∞–ª–µ–Ω–∏–µ —Å–∏–º–≤–æ–ª–æ–≤ "\r\n"
    str_pattern = re.compile("\r\n")
    string = str_pattern.sub(r'', string)

    # –æ—á–∏—Å—Ç–∫–∞ —Ç–µ–∫—Å—Ç–∞ –æ—Ç —Å–∏–º–≤–æ–ª–æ–≤
    string = re.sub('(((?![a-zA-Z ]).)+)', ' ', string)
    # –ª–µ–º–∞—Ç–∏–∑–∞—Ü–∏—è
    string = ' '.join([
        re.sub('\\n', '', ' '.join(stem.lemmatize(s))).strip()
        for s in string.split()
    ])
    # —É–¥–∞–ª—è–µ–º —Å–ª–æ–≤–∞ –∫–æ—Ä–æ—á–µ 3 —Å–∏–º–≤–æ–ª–æ–≤
    string = ' '.join([s for s in string.split() if len(s) > 3])
    # —É–¥–∞–ª—è–µ–º —Å—Ç–æ–ø-—Å–ª–æ–≤–∞
    string = ' '.join([s for s in string.split() if s not in stopwords])
    return string


def get_clean_text(data, stopwords):
    """
    –ü–æ–ª—É—á–µ–Ω–∏–µ —Ç–µ–∫—Å—Ç–∞ –≤ –ø—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–Ω–æ–π –ø–æ—Å–ª–µ –æ—á–∏—Å—Ç–∫–∏
    –º–∞—Ç—Ä–∏—á–Ω–æ–º –≤–∏–¥–µ, –∞ —Ç–∞–∫–∂–µ –º–æ–¥–µ–ª—å –≤–µ–∫—Ç–æ—Ä–∏–∑–∞—Ü–∏–∏
    """
    # –ü—Ä–æ—Å—Ç–æ–π –ø—Ä–µ–ø—Ä–æ—Ü–µ—Å—Å–∏–Ω–≥ —Ç–µ–∫—Å—Ç–∞
    stem = Mystem()
    comments = [preprocessing(x, stopwords, stem) for x in data]
    # –£–¥–∞–ª–µ–Ω–∏–µ –∫–æ–º–º–µ–Ω—Ç–æ–≤, –∫–æ—Ç–æ—Ä—ã–µ –∏–º–µ—é—Ç –º–µ–Ω—å—à–µ, —á–µ–º 5 —Å–ª–æ–≤
    comments = [y for y in comments if len(y.split()) > 5]
    #common_texts = [i.split(' ') for i in comments]
    return comments


def vectorize_text(data, tfidf):
    """
    –ü–æ–ª—É—á–µ–Ω–∏–µ –º–∞—Ç—Ä–∏—Ü—ã –∫–æ–ª-–≤–∞ —Å–ª–æ–≤ –≤ –∫–æ–º–º–µ–Ω–∞—Ä–∏—è—Ö
    –û—á–∏—Å—Ç–∫–∞ –æ—Ç –ø—É—Å—Ç—ã—Ö —Å—Ç—Ä–æ–∫
    """
    # –í–µ–∫—Ç–æ—Ä–∏–∑–∞—Ü–∏—è
    X_matrix = tfidf.transform(data).toarray()
    # –£–¥–∞–ª—è–µ–º —Å—Ç—Ä–æ–∫–∏ –≤ –º–∞—Ç—Ä–∏—Ü–µ —Å –ø—É—Å—Ç—ã–º–∏ –∑–Ω–∞—á–µ–Ω–∏—è–º–∏
    mask = (np.nan_to_num(X_matrix) != 0).any(axis=1)
    return X_matrix[mask]

In [9]:
comments_clean = get_clean_text(comments, stopwords.words(config['stopwords']))
tfidf = TfidfVectorizer(**config['tf_model']).fit(comments_clean)

In [10]:
comments_clean[:10]

['ApnaCollegeOfficial Respected shraddha please tell about eligibility criteria admission your esteemed curriculum',
 'diploma mechanical wale student data science sakte',
 'this explanation really very easy understand',
 'came long stop studying Medical school made happy curing myself terrible satisfying inject yourself with drug made',
 'Data Scientist Masters Program Discount Code YTBE IITK Professional Certificate Course Data Science India Only Caltech Post Graduate Program Data Science Brown University Applied Data Science',
 'Thank soooo much have helped greatly understanding this feild great amount stress what should chose older sister also data scientist told would able didn know single thing regarding this feild thanks your video apps that recomended Inshallah will successfull this',
 'have more query feature engineering being taken under exploratory data analysis phase This different phase altogether',
 'Data analytics science rage computer science software engineering',
 'Wh

In [12]:
X_matrix = vectorize_text(comments_clean, tfidf)

In [13]:
X_matrix.shape

(109, 300)

In [11]:
tfidf.get_feature_names_out()[:12]

array(['about', 'accounts', 'advice', 'after', 'also', 'analysis',
       'analyst', 'analysts', 'analytics', 'answer', 'answering',
       'application'], dtype=object)

In [42]:
def get_clusters(data, count_max_clusters, random_state, affinity,
                 silhouette_metric):
    """
    –ü–æ–¥–±–æ—Ä –Ω–∞–∏–ª—É—á—à–µ–≥–æ —á–∏—Å–ª–∞ –∫–ª–∞—Å—Ç–µ—Ä–æ–≤, –≤–æ–∑–≤—Ä–∞—â–∞–µ—Ç –ø–æ–ª—É—á–µ–Ω–Ω—ã–µ –∫–ª–∞—Å—Ç–µ—Ä–∞ —Ç–µ–º–∞—Ç–∏–∫
    """
    cluster_labels = {}
    silhouette_mean = []

    for i in range(2, count_max_clusters, 1):
        clf = SpectralClustering(n_clusters=i,
                                 affinity=affinity,
                                 random_state=random_state)
        #clf = KMeans(n_clusters=n, max_iter=1000, n_init=1)
        clf.fit(data)
        labels = clf.labels_
        cluster_labels[i] = labels
        silhouette_mean.append(
            silhouette_score(data, labels, metric=silhouette_metric))
    n_clusters = silhouette_mean.index(max(silhouette_mean)) + 2
    return cluster_labels[n_clusters]


def get_f1_score(y_test, y_pred, unique_cluster_labels):
    """
    –í–æ–∑—Ä–∞—â–∞–µ—Ç —Ä–µ–∑—É–ª—å—Ç–∞—Ç –æ–±—É—á–µ–Ω–∏—è –∫–ª–∞—Å—Å–∏—Ñ–∏–∫–∞—Ç–æ—Ä–∞ –ø–æ —Ç–µ–º–∞—Ç–∏–∫–∞–º
    """
    return f1_score(
        y_test, y_pred,
        average='macro') \
        if len(unique_cluster_labels) > 2 \
        else f1_score(y_test, y_pred)

In [43]:
cluster_labels = get_clusters(X_matrix,
                                 random_state=SEED,
                                 **config['clustering'])

ValueError: Found array with 1 sample(s) (shape=(1, 6)) while a minimum of 2 is required by SpectralClustering.

In [17]:
config

{'SEED': 10,
 'clustering': {'affinity': 'cosine',
  'count_max_clusters': 15,
  'silhouette_metric': 'euclidean'},
 'comments': {'YOUTUBE_API_KEY': 'AIzaSyCPYNxHdsk6_-UX60p9Hm65cPXWXifut9A',
  'count_video': 5,
  'limit': 30,
  'maxResults': 20,
  'nextPageToken': '',
  'query': '–¥–∞—Ç–∞ —Å–∞–π–µ–Ω—Å'},
 'cross_val': {'test_size': 0.3},
 'dir_folder': '/Users/miracl6/airflow-mlflow-tutorial',
 'model': {'class_weight': 'balanced'},
 'model_lr': 'LogisticRegression',
 'model_vec': 'vector_tfidf',
 'name_experiment': 'my_first',
 'stopwords': 'russian',
 'tf_model': {'max_features': 300}}

In [18]:
cluster_labels[:10]

array([3, 0, 3, 0, 3, 6, 0, 6, 0, 5], dtype=int32)

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X_matrix,
                                                    cluster_labels,
                                                    **config['cross_val'],
                                                    random_state=SEED)

In [20]:
clf_lr = LogisticRegression(**config['model'])

In [22]:
%%bash
export MLFLOW_REGISTRY_URI=../mlflow

In [27]:
mlflow.set_tracking_uri("http://localhost:5001")
mlflow.set_experiment('comments')
with mlflow.start_run():
    clf_lr.fit(X_train, y_train)
    print(clf_lr.predict_proba(X_test))

    # –õ–æ–≥–∏—Ä–æ–≤–∞–Ω–∏–µ –º–æ–¥–µ–ª–∏ –∏ –ø–∞—Ä–∞–º–µ—Ç—Ä–æ–≤
    mlflow.log_param(
        'f1', get_f1_score(y_test, clf_lr.predict(X_test),
                           set(cluster_labels)))
    mlflow.sklearn.log_model(
        tfidf,
        artifact_path="vector",
        registered_model_name=f"{config['model_vec']}")
    mlflow.sklearn.log_model(
        clf_lr,
        artifact_path='model_lr',
        registered_model_name=f"{config['model_lr']}")
    mlflow.end_run()

Registered model 'vector_tfidf' already exists. Creating a new version of this model...
2021/05/02 19:53:43 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: vector_tfidf, version 29


[[0.12980699 0.13219332 0.141417   0.12500189 0.12521816 0.11938949
  0.11138085 0.1155923 ]
 [0.13225054 0.09461199 0.14431436 0.14921074 0.12746151 0.12149697
  0.11319859 0.1174553 ]
 [0.1488154  0.09091486 0.15653628 0.14699363 0.12095594 0.11566808
  0.10800232 0.11211349]
 [0.14166712 0.09029179 0.1767976  0.12378369 0.12853311 0.11464353
  0.11344233 0.11084083]
 [0.14735099 0.09412684 0.14295515 0.12872404 0.12654938 0.12060076
  0.12294893 0.11674391]
 [0.14852655 0.08947448 0.13987674 0.12342287 0.12322714 0.11324573
  0.15235379 0.10987269]
 [0.1433901  0.09291234 0.14875431 0.12435637 0.12450994 0.14043332
  0.11060968 0.11503393]
 [0.17761867 0.09219381 0.13923505 0.12629707 0.12340639 0.11763627
  0.10961023 0.11400252]
 [0.13568458 0.0949781  0.15717993 0.13047939 0.12802549 0.12203414
  0.11358408 0.11803429]
 [0.13182455 0.09333186 0.15002128 0.12495374 0.13229934 0.13286687
  0.1191945  0.11550786]
 [0.14957397 0.09164231 0.15114628 0.12472615 0.12895741 0.12000372
  

Created version '29' of model 'vector_tfidf'.
Registered model 'LogisticRegression' already exists. Creating a new version of this model...
2021/05/02 19:53:43 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: LogisticRegression, version 29
Created version '29' of model 'LogisticRegression'.


In [28]:
mlflow.get_artifact_uri()

'mlflow/2/da17c6f5dbce43aeaa6727a3674d2376/artifacts'

In [29]:
def get_version_model(config_name, client):
    """
    –ü–æ–ª—É—á–µ–Ω–∏–µ –ø–æ—Å–ª–µ–¥–Ω–µ–π –≤–µ—Ä—Å–∏–∏ –º–æ–¥–µ–ª–∏ –∏–∑ MLFlow
    """
    dict_push = {}
    for count, value in enumerate(
        client.search_model_versions(f"name='{config_name}'")):
        # client.list_registered_models()):
        # –í—Å–µ –≤–µ—Ä—Å–∏–∏ –º–æ–¥–µ–ª–∏
        dict_push[count] = value
    return dict(list(dict_push.items())[-1][1])['version']

In [31]:
client = MlflowClient()
last_version_lr = get_version_model(config['model_lr'], client)
last_version_vec = get_version_model(config['model_vec'], client)

In [32]:
last_version_lr

'29'

In [33]:
last_version_vec

'29'