In [1]:
import pandas as pd
import numpy as np
import re
import time

import bs4
import json

import glob
import tqdm

# Show up to 131 columns when displaying wide DataFrames. The option is
# addressed by its full key instead of a partial pattern, so it cannot
# become ambiguous if pandas adds another option with a similar name.
pd.set_option("display.max_columns", 131)

%matplotlib inline
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the scraped dataset (first CSV column is the index) and keep only the
# rows that already carry a label in `y`.
df = (
    pd.read_csv("raw_data_with_labels.csv", index_col=0)
    .dropna(subset=['y'])
)
df.shape

(501, 16)

In [3]:
# Keep rows whose publish-date text is present and does not contain "horas"
# (relative timestamps); the date-cleaning cell below only understands
# absolute "<day> de <month>. de <year>" strings.
# `~` replaces the `== False` comparison, and `na=False` makes the handling of
# missing text explicit, so both conditions are evaluated in a single pass.
time_text = df['watch-time-text']
df = df[time_text.notnull() & ~time_text.str.contains('horas', regex=False, na=False)]
df.shape

(473, 16)

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [5]:
# Start the cleaned frame from the raw titles; it inherits df's index so the
# columns added later line up row by row.
df_limpo = pd.DataFrame({'title': df['watch-title']})

## 1. Date cleaner

In [6]:
# Portuguese month abbreviations -> the English ones expected by "%b".
# NOTE(review): "%b" parsing presumably relies on an English locale - confirm.
mapa_meses = {
    "jan": "Jan", "fev": "Feb", "mar": "Mar", "abr": "Apr",
    "mai": "May", "jun": "Jun", "jul": "Jul", "ago": "Aug",
    "set": "Sep", "out": "Oct", "nov": "Nov", "dez": "Dec",
}

# Split text such as "11 de abr. de 2019" into (day, month abbreviation, year).
clean_date = df['watch-time-text'].str.extract(r"(\d+) de ([a-z]+)\. de (\d+)")
clean_date[0] = clean_date[0].str.zfill(2)        # zero-pad single-digit days
clean_date[1] = clean_date[1].map(mapa_meses)     # translate the month

# Rebuild "DD Mon YYYY" strings and parse them into timestamps.
clean_date = clean_date.apply(" ".join, axis=1)
df_limpo['date'] = pd.to_datetime(clean_date, format="%d %b %Y")

## 2. Views cleaner

In [7]:
# Parse counts such as "686.902 visualizações" into integers.
# The pattern accepts any number of "."-separated thousands groups, so a value
# like "1.234.567" is captured whole instead of being cut off after the first
# separator (which would yield 1234). `regex=False` makes "." a literal
# separator regardless of the pandas version's default for `regex`.
# Rows where no digits are found fall back to 0 views.
views = (
    df['watch-view-count']
    .str.extract(r"(\d+(?:\.\d+)*)", expand=False)
    .str.replace(".", "", regex=False)
    .fillna(0)
    .astype(int)
)
df_limpo['views'] = views

## 3. Features

In [8]:
# Empty feature table sharing the index of the cleaned frame, plus an
# independent copy of the label column.
features = pd.DataFrame(index=df_limpo.index)
y = df['y'].copy()

In [9]:
# Age of each video in days, measured from the fixed reference date
# 2020-06-01; an age of 0 is bumped to 1 so the division below never
# divides by zero.
video_age_days = (
    (pd.to_datetime('2020-06-01') - df_limpo['date']) / np.timedelta64(1, 'D')
).replace(0, 1)

features['views'] = df_limpo['views']
# Average number of views per day since publication.
features['day_views'] = features['views'] / video_age_days

In [10]:
# Time-based split: videos published before the cut-off date train the model,
# videos published on or after it validate the model.
split_date = "2019-09-01"
mask_train = df_limpo['date'] < split_date
mask_val = df_limpo['date'] >= split_date

Xtrain, ytrain = features.loc[mask_train], y.loc[mask_train]
Xval, yval = features.loc[mask_val], y.loc[mask_val]
Xtrain.shape, Xval.shape, ytrain.shape, yval.shape

((237, 2), (236, 2), (237,), (236,))

In [11]:
# Check the dtypes and non-null counts of the cleaned columns.
df_limpo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 473 entries, 0 to 500
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   title   473 non-null    object        
 1   date    473 non-null    datetime64[ns]
 2   views   473 non-null    int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 14.8+ KB


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Titles on each side of the time split.
title_train = df_limpo.loc[mask_train, 'title']
title_val = df_limpo.loc[mask_val, 'title']

# The vocabulary (terms present in at least 2 training titles) and the IDF
# weights are learned from the training titles only; the validation titles
# are merely transformed with the fitted vectorizer.
title_vec = TfidfVectorizer(min_df=2)
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)


In [13]:
# (number of training titles, number of vocabulary terms kept by the vectorizer)
title_bow_train.shape

(237, 194)

In [14]:
# Display the sparse-matrix summary of the training TF-IDF features.
title_bow_train

<237x194 sparse matrix of type '<class 'numpy.float64'>'
	with 1418 stored elements in Compressed Sparse Row format>

In [15]:
from scipy.sparse import hstack, vstack

# Put the numeric feature columns side by side with the TF-IDF title columns
# (numeric columns first) for both the training and the validation rows.
Xtrain_wtitle = hstack((Xtrain, title_bow_train))
Xval_wtitle = hstack((Xval, title_bow_val))

In [16]:
# Both matrices must end up with the same number of columns.
Xtrain_wtitle.shape, Xval_wtitle.shape

((237, 196), (236, 196))

In [17]:
# Random forest of 1000 trees; class_weight="balanced" reweights the classes
# by their frequency, and the fixed random_state keeps runs repeatable.
mdl = RandomForestClassifier(
    n_estimators=1000,
    class_weight="balanced",
    random_state=0,
    n_jobs=5,
)
mdl.fit(Xtrain_wtitle, ytrain)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=5, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [18]:
# Validation scores: probability of the class stored at column index 1 of
# predict_proba's output.
# NOTE(review): presumably the positive label - confirm `y` is encoded as 0/1.
p = mdl.predict_proba(Xval_wtitle)[:, 1]

In [19]:
from sklearn.metrics import roc_auc_score, average_precision_score

In [20]:
# Average precision of the validation scores against the validation labels.
average_precision_score(yval, p)

0.19554074062718557

In [21]:
# ROC AUC of the validation scores against the validation labels.
roc_auc_score(yval, p)

0.5232142857142857

## 4. Active Learning

70 exemplos em que o modelo tenha dificuldade

30 exemplos escolhidos aleatoriamente

In [22]:
# Re-read the same CSV and keep the rows that have no label yet; rows in
# which every column is empty are discarded as well.
df_unlabeled = (
    pd.read_csv("raw_data_with_labels.csv", index_col=0)
    .loc[lambda frame: frame['y'].isnull()]
    .dropna(how='all')
)
df_unlabeled.shape

(857, 16)

In [23]:
# Keep rows whose publish-date text is present and does not contain "horas"
# (relative timestamps); the date-cleaning cell below only understands
# absolute "<day> de <month>. de <year>" strings.
# `~` replaces the `== False` comparison, and `na=False` makes the handling of
# missing text explicit, so both conditions are evaluated in a single pass.
time_text_u = df_unlabeled['watch-time-text']
df_unlabeled = df_unlabeled[
    time_text_u.notnull() & ~time_text_u.str.contains('horas', regex=False, na=False)
]
df_unlabeled.shape

(801, 16)

In [24]:
# Peek at the first unlabeled row.
df_unlabeled.head(1)

Unnamed: 0,watch-title,y,watch-view-count,watch-time-text,content_watch-info-tag-list,watch7-headline,watch7-user-header,watch8-sentiment-actions,og:image,og:image:width,og:image:height,og:description,og:video:width,og:video:height,og:video:tag,channel_link_0
501,Why I left my Data Science Job at FANG (Facebo...,,686.902 visualizações,Publicado em 11 de abr. de 2019,Pessoas e blogs,Why I left my Data Science Job at FANG (Facebo...,Joma Tech\nCarregando...\nCancelar inscrição d...,686.902 visualizações\n13.710\nGostou deste ví...,https://i.ytimg.com/vi/M5v1nXiUaOI/maxresdefau...,1280,720,► Check out CoderPro for 100+ Video Explanatio...,1280.0,720.0,data scientist,/channel/UCV0qA-eDDICsRR9rPcnG7tw


In [25]:
# Cleaned frame for the unlabeled rows, seeded with the raw titles and
# sharing df_unlabeled's index.
df_limpo_u = pd.DataFrame({'title': df_unlabeled['watch-title']})

In [26]:
# Portuguese month abbreviations -> the English ones expected by "%b".
# NOTE(review): "%b" parsing presumably relies on an English locale - confirm.
mapa_meses = {
    "jan": "Jan", "fev": "Feb", "mar": "Mar", "abr": "Apr",
    "mai": "May", "jun": "Jun", "jul": "Jul", "ago": "Aug",
    "set": "Sep", "out": "Oct", "nov": "Nov", "dez": "Dec",
}

# Split text such as "11 de abr. de 2019" into (day, month abbreviation, year).
clean_date = df_unlabeled['watch-time-text'].str.extract(r"(\d+) de ([a-z]+)\. de (\d+)")
clean_date[0] = clean_date[0].str.zfill(2)        # zero-pad single-digit days
clean_date[1] = clean_date[1].map(mapa_meses)     # translate the month

# Rebuild "DD Mon YYYY" strings and parse them into timestamps.
clean_date = clean_date.apply(" ".join, axis=1)
df_limpo_u['date'] = pd.to_datetime(clean_date, format="%d %b %Y")

In [27]:
# Inspect the parsed titles and dates of the unlabeled rows.
df_limpo_u.head()

Unnamed: 0,title,date
501,Why I left my Data Science Job at FANG (Facebo...,2019-04-11
502,Real-World Python Machine Learning Tutorial w/...,2019-09-30
504,Self-Studying Machine Learning? Remind yoursel...,2019-03-02
505,Best Online Data Science Courses,2018-12-13
506,Papo de Negócios: É real o valor de Data Scien...,2020-05-27


In [28]:
# Parse counts such as "686.902 visualizações" into integers.
# The pattern accepts any number of "."-separated thousands groups, so a value
# like "1.234.567" is captured whole instead of being cut off after the first
# separator (which would yield 1234). `regex=False` makes "." a literal
# separator regardless of the pandas version's default for `regex`.
# Rows where no digits are found fall back to 0 views.
views = (
    df_unlabeled['watch-view-count']
    .str.extract(r"(\d+(?:\.\d+)*)", expand=False)
    .str.replace(".", "", regex=False)
    .fillna(0)
    .astype(int)
)
df_limpo_u['views'] = views

In [29]:
# Empty feature table sharing the index of the cleaned unlabeled frame.
features_u = pd.DataFrame(index=df_limpo_u.index)

In [30]:
# Age of each video in days, measured from the fixed reference date
# 2020-06-01; an age of 0 is bumped to 1 so the division below never
# divides by zero.
video_age_days_u = (
    (pd.to_datetime('2020-06-01') - df_limpo_u['date']) / np.timedelta64(1, 'D')
).replace(0, 1)

features_u['views'] = df_limpo_u['views']
# Average number of views per day since publication.
features_u['day_views'] = features_u['views'] / video_age_days_u

In [31]:
# Inspect the numeric features computed for the unlabeled rows.
features_u.head()

Unnamed: 0,views,day_views
501,686902,1647.247002
502,35785,146.061224
504,30973,67.774617
505,196112,365.880597
506,62,12.4


In [32]:
# Encode the unlabeled titles with the vectorizer that was already fitted on
# the training titles (transform only, so the columns match the training
# matrix). No import is needed here: this cell only uses the fitted
# `title_vec` object, and TfidfVectorizer is imported where it is created.
title_u = df_limpo_u['title']
title_bow_u = title_vec.transform(title_u)


In [33]:
# Display the sparse-matrix summary of the unlabeled TF-IDF features.
title_bow_u

<801x194 sparse matrix of type '<class 'numpy.float64'>'
	with 3882 stored elements in Compressed Sparse Row format>

In [34]:
# Numeric columns first, then the TF-IDF title columns - the same layout used
# for the training and validation matrices.
Xu_wtitle = hstack([features_u, title_bow_u])

In [35]:
# Display the summary of the combined unlabeled feature matrix.
Xu_wtitle

<801x196 sparse matrix of type '<class 'numpy.float64'>'
	with 5472 stored elements in COOrdinate format>

In [36]:
# Score every unlabeled row with the fitted model (probability column index 1).
pu = mdl.predict_proba(Xu_wtitle)[:, 1]

In [37]:
# Attach the model scores as a new column `p`. `assign` returns a new frame,
# so nothing is written into a frame that was produced by boolean filtering,
# which is the situation that raises pandas' SettingWithCopyWarning.
df_unlabeled = df_unlabeled.assign(p=pu)

In [38]:
# Confirm the score column `p` was added.
df_unlabeled.head(1)

Unnamed: 0,watch-title,y,watch-view-count,watch-time-text,content_watch-info-tag-list,watch7-headline,watch7-user-header,watch8-sentiment-actions,og:image,og:image:width,og:image:height,og:description,og:video:width,og:video:height,og:video:tag,channel_link_0,p
501,Why I left my Data Science Job at FANG (Facebo...,,686.902 visualizações,Publicado em 11 de abr. de 2019,Pessoas e blogs,Why I left my Data Science Job at FANG (Facebo...,Joma Tech\nCarregando...\nCancelar inscrição d...,686.902 visualizações\n13.710\nGostou deste ví...,https://i.ytimg.com/vi/M5v1nXiUaOI/maxresdefau...,1280,720,► Check out CoderPro for 100+ Video Explanatio...,1280.0,720.0,data scientist,/channel/UCV0qA-eDDICsRR9rPcnG7tw,0.134


In [39]:
# Flag the rows whose score lies in [0.43, 1.0] (both ends included) and
# count how many rows that selects.
# NOTE(review): 0.43 is a hand-picked threshold - confirm it is still the
# intended cut-off whenever the model is retrained.
mask_u = df_unlabeled['p'].between(0.43, 1.)
mask_u.sum()

66

In [40]:
# Show the rows selected by the score mask.
df_unlabeled[mask_u]

Unnamed: 0,watch-title,y,watch-view-count,watch-time-text,content_watch-info-tag-list,watch7-headline,watch7-user-header,watch8-sentiment-actions,og:image,og:image:width,og:image:height,og:description,og:video:width,og:video:height,og:video:tag,channel_link_0,p
510,Lecture 11 - Introduction to Neural Networks |...,,8.831 visualizações,Publicado em 17 de abr. de 2020,Educação,Lecture 11 - Introduction to Neural Networks |...,stanfordonline\nCarregando...\nCancelar inscri...,8.831 visualizações\n127\nGostou deste vídeo?\...,https://i.ytimg.com/vi/MfIjxPh6Pys/maxresdefau...,1280,720,Take an adapted version of this course as part...,1280.0,720.0,,/channel/UCBa5G_ESCn8Yd4vw5U-gIcg,0.529
530,Kaggle Competition- Dengue or Malaria Predicti...,,4.679 visualizações,Publicado em 17 de set. de 2019,Educação,#MalariaDetection\nKaggle Competition- Dengue ...,Krish Naik\nCarregando...\nCancelar inscrição ...,4.679 visualizações\n92\nGostou deste vídeo?\n...,https://i.ytimg.com/vi/NjvX4BhOjOw/hqdefault.jpg,480,360,In this video we will implement transfer learn...,1280.0,720.0,VGG19,/channel/UCNU_lfiiWBdtULKOw6X0Dig,0.542
538,Career AMA with Jeff Dean | CareerCon 2019| Ka...,,1.293 visualizações,Publicado em 15 de ago. de 2019,Ciência e tecnologia,Career AMA with Jeff Dean | CareerCon 2019| Ka...,Kaggle\nCarregando...\nCancelar inscrição de K...,1.293 visualizações\n45\nGostou deste vídeo?\n...,https://i.ytimg.com/vi/OCij9sjR1xU/maxresdefau...,1280,720,"""In this AMA, Jeff will share his path into da...",1280.0,720.0,CS,/channel/UCSNeZleDn9c74yQc-EKnVTA,0.718
547,Lecture 13 - Debugging ML Models and Error Ana...,,3.022 visualizações,Publicado em 17 de abr. de 2020,Educação,Lecture 13 - Debugging ML Models and Error Ana...,stanfordonline\nCarregando...\nCancelar inscri...,3.022 visualizações\n55\nGostou deste vídeo?\n...,https://i.ytimg.com/vi/ORrStCArmP4/maxresdefau...,1280,720,Take an adapted version of this course as part...,1280.0,720.0,,/channel/UCBa5G_ESCn8Yd4vw5U-gIcg,0.533
559,Porque estudar Data Science com a Udacity e a ...,,9.431 visualizações,Publicado em 23 de nov. de 2018,Ciência e tecnologia,Porque estudar Data Science com a Udacity e a ...,TecMundo\nCarregando...\nCancelar inscrição de...,9.431 visualizações\n686\nGostou deste vídeo?\...,https://i.ytimg.com/vi/Opc9D-CTPuQ/maxresdefau...,1280,720,#[Tipo: Publieditorial] CRÉDITOS Texto: Leonar...,1280.0,720.0,notícias,/channel/UCdmGjywrxeOPfC7vDllmSgQ,0.587
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1313,How to Spec a Deep learning PC | 2019 PC Buyin...,,28.530 visualizações,Publicado em 10 de mai. de 2019,Educação,How to Spec a Deep learning PC | 2019 PC Buyin...,Machine Learning with Phil\nCarregando...\nCan...,28.530 visualizações\n518\nGostou deste vídeo?...,https://i.ytimg.com/vi/xsnVlMWQj8o/maxresdefau...,1280,720,Today I'll give my recommendations on what com...,1280.0,720.0,deep reinforcement learning,/channel/UC58v9cLitc8VaCjrcKyAbrw,0.434
1317,Conda Environments : Data Science Code,,16 visualizações,Publicado em 1 de jun. de 2020,Educação,Conda Environments : Data Science Code,ritvikmath\nCarregando...\nCancelar inscrição ...,16 visualizações\n2\nGostou deste vídeo?\nFaça...,https://i.ytimg.com/vi/xyQn8cNOP78/hqdefault.jpg,480,360,All about Conda Environments!,1280.0,720.0,code,/channel/UCUcpVoi5KkJmnE3bvEhHR0Q,0.498
1331,Kaggle Meetup: Landmark Recognition,,900 visualizações,Publicado em 13 de jul. de 2018,Ciência e tecnologia,Kaggle Meetup: Landmark Recognition,Learn Data Science\nCarregando...\nCancelar in...,900 visualizações\n6\nGostou deste vídeo?\nFaç...,https://i.ytimg.com/vi/ymMsDG5kA94/maxresdefau...,1280,720,Meetup page: https://www.meetup.com/LearnDataS...,1280.0,720.0,yvr,/channel/UCJhW_16uxALr0X4olEW2p5A,0.483
1354,How to find Data Science Jobs During Recession...,,177 visualizações,Publicado em 31 de mai. de 2020,Educação,#vision2020 #aiforeveryone #pythonnumpy\nHow t...,Artificial Intelligence Tutorials\nCarregando....,177 visualizações\nGostou deste vídeo?\nFaça l...,https://i.ytimg.com/vi/zoJCD7-vaxQ/maxresdefau...,1280,720,"#vision2020 #aiforeveryone #pythonnumpy : ""My ...",1280.0,720.0,how to search job,/channel/UCNJuXA_sPosZGolgPzdWnXw,0.446


In [41]:
# "Difficult" examples: the rows selected by the score mask.
dificeis = df_unlabeled[mask_u]

In [42]:
# Random examples: 28 rows drawn from everything outside the mask, with a
# fixed seed so the same rows are drawn on every run.
aleatorios = df_unlabeled[~mask_u].sample(28, random_state=0)

In [43]:
# Write both groups (masked rows first, then the random sample) to a CSV file;
# the rows keep their original index and include the score column `p`.
pd.concat([dificeis, aleatorios]).to_csv("active_label1.csv")

In [44]:
# Preview the first rows selected by the score mask.
dificeis.head()

Unnamed: 0,watch-title,y,watch-view-count,watch-time-text,content_watch-info-tag-list,watch7-headline,watch7-user-header,watch8-sentiment-actions,og:image,og:image:width,og:image:height,og:description,og:video:width,og:video:height,og:video:tag,channel_link_0,p
510,Lecture 11 - Introduction to Neural Networks |...,,8.831 visualizações,Publicado em 17 de abr. de 2020,Educação,Lecture 11 - Introduction to Neural Networks |...,stanfordonline\nCarregando...\nCancelar inscri...,8.831 visualizações\n127\nGostou deste vídeo?\...,https://i.ytimg.com/vi/MfIjxPh6Pys/maxresdefau...,1280,720,Take an adapted version of this course as part...,1280.0,720.0,,/channel/UCBa5G_ESCn8Yd4vw5U-gIcg,0.529
530,Kaggle Competition- Dengue or Malaria Predicti...,,4.679 visualizações,Publicado em 17 de set. de 2019,Educação,#MalariaDetection\nKaggle Competition- Dengue ...,Krish Naik\nCarregando...\nCancelar inscrição ...,4.679 visualizações\n92\nGostou deste vídeo?\n...,https://i.ytimg.com/vi/NjvX4BhOjOw/hqdefault.jpg,480,360,In this video we will implement transfer learn...,1280.0,720.0,VGG19,/channel/UCNU_lfiiWBdtULKOw6X0Dig,0.542
538,Career AMA with Jeff Dean | CareerCon 2019| Ka...,,1.293 visualizações,Publicado em 15 de ago. de 2019,Ciência e tecnologia,Career AMA with Jeff Dean | CareerCon 2019| Ka...,Kaggle\nCarregando...\nCancelar inscrição de K...,1.293 visualizações\n45\nGostou deste vídeo?\n...,https://i.ytimg.com/vi/OCij9sjR1xU/maxresdefau...,1280,720,"""In this AMA, Jeff will share his path into da...",1280.0,720.0,CS,/channel/UCSNeZleDn9c74yQc-EKnVTA,0.718
547,Lecture 13 - Debugging ML Models and Error Ana...,,3.022 visualizações,Publicado em 17 de abr. de 2020,Educação,Lecture 13 - Debugging ML Models and Error Ana...,stanfordonline\nCarregando...\nCancelar inscri...,3.022 visualizações\n55\nGostou deste vídeo?\n...,https://i.ytimg.com/vi/ORrStCArmP4/maxresdefau...,1280,720,Take an adapted version of this course as part...,1280.0,720.0,,/channel/UCBa5G_ESCn8Yd4vw5U-gIcg,0.533
559,Porque estudar Data Science com a Udacity e a ...,,9.431 visualizações,Publicado em 23 de nov. de 2018,Ciência e tecnologia,Porque estudar Data Science com a Udacity e a ...,TecMundo\nCarregando...\nCancelar inscrição de...,9.431 visualizações\n686\nGostou deste vídeo?\...,https://i.ytimg.com/vi/Opc9D-CTPuQ/maxresdefau...,1280,720,#[Tipo: Publieditorial] CRÉDITOS Texto: Leonar...,1280.0,720.0,notícias,/channel/UCdmGjywrxeOPfC7vDllmSgQ,0.587
