In [1]:
import pandas as pd
import numpy as np
import re
import time

import bs4 as bs4
import json

import glob
import tqdm

pd.set_option("max.columns", 131)

#https://strftime.org/
%matplotlib inline
%pylab inline

Populating the interactive namespace from numpy and matplotlib


## Active Learning Results

In [2]:
# Load the first labelled batch and keep only the rows that actually have a label.
df1 = (
    pd.read_csv("raw_data_with_labels.csv", index_col=0)
    .dropna(subset=['y'])
)
df1.shape

(501, 16)

In [3]:
# Keep rows whose publish text is present and does not mention 'horas'
# (such text does not carry a "<day> de <month>. de <year>" date).
watch_text = df1['watch-time-text']
has_text = watch_text.notnull()
mentions_hours = watch_text.str.contains('horas')
df1 = df1[has_text & mentions_hours.eq(False)]
df1.shape

(473, 16)

In [4]:
# Load the second labelled batch, keep labelled rows only, and flag them with novo=1
# so they can be told apart from the first batch after concatenation.
df2 = (
    pd.read_csv("active_labels1_done.csv", index_col=0)
    .dropna(subset=['y'])
    .assign(novo=1)
)
df2.shape

(100, 18)

In [5]:
# Peek at one row to check the columns, including the score column 'p' and the 'novo' flag.
df2.head(1)

Unnamed: 0,watch-title,y,watch-view-count,watch-time-text,content_watch-info-tag-list,watch7-headline,watch7-user-header,watch8-sentiment-actions,og:image,og:image:width,og:image:height,og:description,og:video:width,og:video:height,og:video:tag,channel_link_0,p,novo
510,Lecture 11 - Introduction to Neural Networks |...,0,8.831 visualizações,Publicado em 17 de abr. de 2020,Educação,Lecture 11 - Introduction to Neural Networks |...,stanfordonline\nCarregando...\nCancelar inscri...,8.831 visualizações\n127\nGostou deste vídeo?\...,https://i.ytimg.com/vi/MfIjxPh6Pys/maxresdefau...,1280,720,Take an adapted version of this course as part...,1280.0,720.0,,/channel/UCBa5G_ESCn8Yd4vw5U-gIcg,0.512,1


In [6]:
from sklearn.metrics import roc_auc_score, average_precision_score
# Average precision and ROC AUC of the stored scores 'p' against the labels 'y'.
# NOTE(review): 'p' is presumably the model probability used when these rows were
# selected for labelling — confirm against the script that produced the CSV.
average_precision_score(df2['y'],df2['p']), roc_auc_score(df2['y'],df2['p'])

(0.1337192228346687, 0.5888888888888889)

In [7]:
# Stack both labelled batches; the score column 'p' exists only in the second batch
# and is dropped before stacking.
labelled_batches = [df1, df2.drop(columns=["p"])]
df = pd.concat(labelled_batches)

In [28]:
# Write the combined labelled frame to disk.
# NOTE(review): the execution count of this cell shows it was run out of order
# relative to its neighbours — confirm it is still needed in the final notebook.
df.to_csv("teste.csv")

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [9]:
# Start the cleaned frame from the title and the batch flag; rows without a
# 'novo' value get 0.
df_limpo = (
    df[['watch-title', 'novo']]
    .rename(columns={'watch-title': 'title'})
    .fillna({'novo': 0})
)

## 1. Date cleaner

In [10]:
# Portuguese month abbreviations, as they appear in 'watch-time-text', mapped to
# month numbers. Building the date from numbers avoids the "%b" directive, whose
# accepted month names depend on the locale of the machine running the notebook.
mapa_meses = {"jan": 1,
              "fev": 2,
              "mar": 3,
              "abr": 4,
              "mai": 5,
              "jun": 6,
              "jul": 7,
              "ago": 8,
              "set": 9,
              "out": 10,
              "nov": 11,
              "dez": 12}

# Pull day, month abbreviation and year out of text such as
# "Publicado em 17 de abr. de 2020".
clean_date = df['watch-time-text'].str.extract(r"(\d+) de ([a-z]+)\. de (\d+)")
clean_date.columns = ["day", "month", "year"]
clean_date["month"] = clean_date["month"].map(mapa_meses)

# Fail loudly, naming the offending values, when a row does not match the pattern
# or uses an unknown month abbreviation.
unparsed = clean_date.isnull().any(axis=1).values
if unparsed.any():
    raise ValueError(
        "Could not parse a publish date from: "
        + repr(df['watch-time-text'][unparsed].unique().tolist())
    )

# pd.to_datetime assembles timestamps from the year/month/day columns.
df_limpo['date'] = pd.to_datetime(clean_date.astype(int))

## 2. Views cleaner

In [11]:
# View counts use "." as the thousands separator (e.g. "8.831 visualizações").
# The pattern captures every dotted group, so counts of a million or more
# ("1.234.567") are not cut off after the first separator. The separator is then
# removed as a literal character (regex=False) and missing counts become 0.
views = (
    df['watch-view-count']
    .str.extract(r"(\d+(?:\.\d+)*)", expand=False)
    .str.replace(".", "", regex=False)
    .fillna(0)
    .astype(int)
)
df_limpo['views'] = views

## 3. Features

In [12]:
# Empty feature frame sharing the cleaned frame's index, plus an independent copy
# of the label column.
features = pd.DataFrame(index=df_limpo.index)
y = df['y'].copy()

In [13]:
# Age of each video in days, measured back from 2020-06-01. An age of exactly 0
# is bumped to 1 so the per-day rate below never divides by zero.
days_since_publish = (pd.to_datetime('2020-06-01') - df_limpo['date']) / np.timedelta64(1, 'D')
days_since_publish = days_since_publish.replace(0, 1)

# Keep the raw view count and the average views per day; the age itself is not
# kept as a feature.
features['views'] = df_limpo['views']
features['day_views'] = features['views'] / days_since_publish

In [14]:
# Quick look at the numeric features built so far.
features.head()

Unnamed: 0,views,day_views
0,33445,428.782051
1,4640,11.717172
2,1570,2.859745
3,699800,1065.144597
4,300,4.285714


## Aumenta validação (grow the validation set: newly labelled rows are kept out of training)

In [15]:
# Check dtypes and non-null counts of the cleaned frame before splitting.
df_limpo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 573 entries, 0 to 1060
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   title   573 non-null    object        
 1   novo    573 non-null    float64       
 2   date    573 non-null    datetime64[ns]
 3   views   573 non-null    int64         
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 22.4+ KB


In [16]:
# Time-based split: train on first-batch rows (novo == 0) published before the
# cutoff, validate on every row published on or after it.
publish_date = df_limpo['date']
from_first_batch = df_limpo['novo'] == 0

mask_train = (publish_date < "2019-09-01") & from_first_batch
mask_val = (publish_date >= "2019-09-01")

Xtrain = features[mask_train]
Xval = features[mask_val]
ytrain = y[mask_train]
yval = y[mask_val]
Xtrain.shape, Xval.shape, ytrain.shape, yval.shape

((237, 2), (280, 2), (237,), (280,))

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over titles, using only terms that occur in at least two training titles.
# The vocabulary is learned on the training split and reused for validation.
title_vec = TfidfVectorizer(min_df=2)

title_train = df_limpo.loc[mask_train, 'title']
title_val = df_limpo.loc[mask_val, 'title']

title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)


In [18]:
# (number of training titles, number of terms kept by the vectorizer)
title_bow_train.shape

(237, 194)

In [19]:
from scipy.sparse import hstack, vstack

# Append the sparse title TF-IDF columns to the right of the numeric feature columns.
Xtrain_wtitle = hstack([Xtrain, title_bow_train])
Xval_wtitle = hstack([Xval, title_bow_val])

In [20]:
# Both matrices must end up with the same number of columns.
Xtrain_wtitle.shape, Xval_wtitle.shape

((237, 196), (280, 196))

In [21]:
# Random forest with a fixed seed; class_weight="balanced" reweights the classes
# by their frequency in the training labels.
rf_params = dict(
    n_estimators=1000,
    random_state=0,
    class_weight="balanced",
    n_jobs=6,
)
mdl = RandomForestClassifier(**rf_params)
mdl.fit(Xtrain_wtitle, ytrain)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=6, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [22]:
# Second column of predict_proba, i.e. the probability of the second class in
# mdl.classes_ (the positive class if labels are 0/1 — TODO confirm).
p = mdl.predict_proba(Xval_wtitle)[:, 1]

In [23]:
from sklearn.metrics import roc_auc_score, average_precision_score

In [24]:
# Average precision and ROC AUC of the forest on the validation split.
average_precision_score(yval, p), roc_auc_score(yval, p)

(0.17864865126006127, 0.524942218798151)

## Aumenta treino (grow the training set: newly labelled rows are kept out of validation)


In [25]:
# Time-based split for this experiment: train on every row published before the
# cutoff, validate only on first-batch rows (novo == 0) published on or after it.
publish_date = df_limpo['date']
from_first_batch = df_limpo['novo'] == 0

mask_train = (publish_date < "2019-09-01")
mask_val = (publish_date >= "2019-09-01") & from_first_batch

In [26]:
# Re-run the whole pipeline (split -> title TF-IDF -> random forest -> metrics)
# using the current mask_train / mask_val.
Xtrain, Xval = features[mask_train], features[mask_val]
ytrain, yval = y[mask_train], y[mask_val]

# Checked with assertions because a bare expression in the middle of a cell is
# evaluated and then discarded, so it would never be displayed.
assert len(Xtrain) == len(ytrain) and len(Xval) == len(yval), "features and labels are misaligned"
assert len(Xtrain) > 0 and len(Xval) > 0, "empty train or validation split"

title_train = df_limpo.loc[mask_train, 'title']
title_val = df_limpo.loc[mask_val, 'title']

# The vocabulary is learned on the training titles only and reused for validation.
title_vec = TfidfVectorizer(min_df=2)
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

# Numeric features on the left, sparse title TF-IDF columns on the right.
Xtrain_wtitle = hstack([Xtrain, title_bow_train])
Xval_wtitle = hstack([Xval, title_bow_val])

mdl = RandomForestClassifier(n_estimators=1000, random_state=0, class_weight="balanced", n_jobs=6)
mdl.fit(Xtrain_wtitle, ytrain)

# Second column of predict_proba, i.e. the probability of the second class in
# mdl.classes_ (the positive class if labels are 0/1 — TODO confirm).
p = mdl.predict_proba(Xval_wtitle)[:, 1]

# Average precision and ROC AUC on the validation split.
average_precision_score(yval, p), roc_auc_score(yval, p)

(0.24096517017522276, 0.5729591836734694)