# Ensemble Model

````
# RDF
#  (0.39545173300116343, 0.6776220857432473)
# LightGBM HTuned
# (0.39010753360298556, 0.6728221334446366)
# Logistic
# (0.4295233599530305, 0.671194323534673)
````

In [1]:
import pandas as pd
import numpy as np
import re
import time

import bs4 as bs4
import json
import pickle
import glob
import tqdm

# Use the fully-qualified option key. The short pattern "max.columns" is
# treated as a regex and, on newer pandas releases, matches more than one
# option (e.g. "display.max_columns" and "styler.render.max_columns"),
# which makes set_option raise OptionError.
pd.set_option("display.max_columns", 131)

from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.preprocessing import MaxAbsScaler, StandardScaler
from scipy.sparse import csr_matrix
from scipy import sparse
import joblib as jb
from sklearn.feature_extraction.text import TfidfVectorizer
from lightgbm import LGBMClassifier

#https://strftime.org/
%matplotlib inline
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [23]:
# Load the labeled dataset (first CSV column is the index), then keep only
# the rows that actually carry a label in "y".
df = pd.read_csv("raw_data_all_labeled2.csv", index_col=0)
df = df.dropna(subset=["y"])
df.shape

(1410, 15)

In [24]:
df.head()

Unnamed: 0,uploader,title,y,upload_date,user,view_count,like_count,dislike_count,thumbnail,width,height,categories,tags,channel_url,description
0,Yanjun Qi,S0-Introduction-Module3: Deep Learning and AI ...,0.0,2020-08-25,UCHMYETgeGbNHVHLidZSV8BQ,22,,,https://i.ytimg.com/vi/LkPmTGw1jqo/hqdefault.j...,1280,672,Science & Technology,Machine Learning,http://www.youtube.com/channel/UCHMYETgeGbNHVH...,Course Web: \nhttps://qiyanjun.github.io/2020f...
1,Ciência dos Dados,Machine Learning no Ensino Médio,0.0,2020-08-25,UCd3ThZLzVDDnKSZMsbK0icg,3,,,https://i.ytimg.com/vi_webp/R_gBq8IfwJc/maxres...,1920,1080,Education,machine learning|data science,http://www.youtube.com/channel/UCd3ThZLzVDDnKS...,"A matemática, sempre ela....\n\nDe uma maneira..."
2,iKennyHD,NBA LIVE 22: EA COULD USE DEEP MACHINE LEARNIN...,0.0,2020-08-25,KennyCallOfDuty,47,,,https://i.ytimg.com/vi/Tix2xon9MSs/maxresdefau...,1920,1080,Gaming,iKennyHD|nba live20|nba live 20|nba 2k20|live2...,http://www.youtube.com/channel/UCGMtoj9V9Go_im...,Wanna Donate? paypal.me/iKennyYT is where you ...
3,Amazon Web Services,Amazon Aurora Machine Learning – SageMaker Int...,0.0,2020-08-25,AmazonWebServices,335,,,https://i.ytimg.com/vi/w-2ip78NxAw/maxresdefau...,1920,1080,Science & Technology,AWS|Amazon Web Services|Cloud|AWS Cloud|Cloud ...,http://www.youtube.com/channel/UCd6MoB9NC6uYN2...,Learn how you can turn relational data into in...
4,"GMRIT, Rajam, AP",Machine Learning and Deep Learning Implementat...,1.0,2020-08-25,UC8g7hz4oXFzXNryt8h1gRPw,1486,,,https://i.ytimg.com/vi/f6XIY_M7FlA/hqdefault.j...,1280,720,People & Blogs,,http://www.youtube.com/channel/UC8g7hz4oXFzXNr...,Resource Person\nMr.S.Aravinth Seshadri\nCerti...


In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [26]:
# Start the cleaned frame from the raw titles; it keeps df's index.
df_limpo = df[['title']].copy()

## 1. Limpeza da data

In [27]:
# Parse the upload date strings into datetimes.
df_limpo = df_limpo.assign(date=pd.to_datetime(df['upload_date']))

## 2. Limpeza de Views

In [28]:
# A missing view count is treated as zero views.
df_limpo['views'] = views = df['view_count'].fillna(0)

In [29]:
df_limpo.shape

(1410, 3)

## 3. Features

In [30]:
# Numeric features: raw view count and average views per day since upload.
features = pd.DataFrame(index=df_limpo.index)
y = df['y'].copy()

# pd.Timestamp is used because `datetime` is never imported explicitly in this
# notebook (it is presumably only in scope through `%pylab inline`).
# NOTE: the reference date is the day the cell runs, so `views_por_dia`
# changes from one run to the next.
today = pd.Timestamp.today().strftime("%Y-%m-%d")

# Age of each video in days, kept as a standalone Series so the feature frame
# never holds a helper column that has to be dropped in place afterwards.
tempo_desde_pub = (pd.to_datetime(today) - df_limpo['date']) / np.timedelta64(1, 'D')

features['views'] = df_limpo['views']
# NOTE(review): a video uploaded on the run date has an age of 0 days, so this
# division is not finite for it -- confirm such rows cannot occur.
features['views_por_dia'] = (features['views'] / tempo_desde_pub).round(3)

In [31]:
features.shape

(1410, 2)

In [32]:
# One 0/1 indicator column per distinct uploader name.
uploaders = df['uploader'].str.get_dummies()

In [33]:
# Append the uploader indicator columns to the three base columns. Selecting
# the base columns explicitly keeps the cell idempotent: re-running it does
# not append a second copy of the indicator columns.
df_limpo = pd.concat([df_limpo[['title', 'date', 'views']], uploaders], axis=1)

In [35]:
df_limpo.head(1)

Unnamed: 0,title,date,views,1littlecoder,365 Data Science,:CodeWebsDuh :,A.I.M Learning,AI Coding,AI Learner hub,AI era,AICamp,AIPRA,AP2V Academy,ASQStatsDivision,AWS Online Tech Talks,Aakash N S,Abacus AI,Abhishek Agarrwal,Abhishek Thakur,Abuzar Ghaffari,Academind,Accenture,Accenture in the Philippines,Acsia Technologies,Adriano Gianini,Adriano Marcos Rodrigues Figueiredo,Adventures in the Cloud,Affiliate World Conferences,Ahmed Nour,Akash Dash,Albert Coronado,Alberto Olla,Alex The Analyst,Alexander Amini,Algoritma Data Science School,Ali Nemati,All About GATE Exam,Alteryx,AltexSoft,Alura Cursos Online,AmandaLovesToAudit,Amanpour and Company,Amarpreet Singh,Amaze Lab,Amazon Web Services,Amod Sachintha,Analytics India Magazine,Analytics Vidhya,Andrew Schonfeld,Andrey Sozykin,André Furchner,Andy Jake,Aniedi Udo-Obong,Applied AI Course,Art of Engineer,Artificial Intelligence Tutorials,Arvind Kumawat,Ashutoshh Singh,Ashvin Nair,Association Quantum,Association for Computing Machinery (ACM),Asumsi,AtomsTalk,Australian Institute for Machine Learning,BEPEC - Career Transition Simplified,...,Whirldata,WhiteHatHacking,Wikitechy,XCTEQ Limited,Xuming Wang,YANGCOM Korea,Yanjun Qi,Yannic Kilcher,Yudi J,Yury Kashnitsky,Zach Star,ZaranTech,Zeeshan Usmani,a ydobon,akshay shekkari,ankitrathi.com,anıl Kaynar,ashish pondit,bespokeDS,biostatistique,codebasics,danny iskandar,datasciencearth,deeplizard,e-tube KTU by jasmin,eMaster Class Academy,eXtremegenerationIT,edX,edureka!,freeCodeCamp.org,geekbytes,iKennyHD,iNeuron iNtelligence,iT24Hrs,instituto cpfl,inzva team,jpmorgan,lakshay aggarwal,miracl6,nETSETOS,nejimakijima,njan,outcompete,pyGuru,sentdex,stanfordonline,stanley kan,study mart,suthichai live,teacher4u,techninja betechnical,upGrad,uΔΔTube,vcubingx,vectoroad,veryacademy,{‘フーテンのグラさん’:‘ジェントルふじふじ’},АйТиБорода,Продюсер будущего,Флесс,أكاديمية الفهرية Al-Fihriya Academy,へちやぼらけ・データサイエンティスト,エンジニアを目指すSESチャンネル,元専業Kaggler カレーちゃん,渡邉裕亮
0,S0-Introduction-Module3: Deep Learning and AI ...,2020-08-25,22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [36]:
# Copy every uploader indicator column into the feature frame with a single
# concat. Inserting the columns one at a time fragments the frame, which is
# what pandas' PerformanceWarning about repeated column inserts complains about.
uploader_cols = df_limpo.drop(['title', 'date', 'views'], axis=1).columns
features = pd.concat(
    [
        # Drop any indicator columns left from a previous run so that
        # re-executing the cell does not duplicate them.
        features.drop(columns=uploader_cols, errors='ignore'),
        df_limpo[uploader_cols],
    ],
    axis=1,
)

  features[uploader] = df_limpo[uploader]


In [37]:
# Time-based split: rows dated before 2020-03-10 go to training, rows dated
# on or after it go to validation.
mask_train = df_limpo['date'] < '2020-03-10'
mask_val = df_limpo['date'] >= '2020-03-10'

Xtrain, ytrain = features.loc[mask_train], y.loc[mask_train]
Xval, yval = features.loc[mask_val], y.loc[mask_val]
Xtrain.shape, Xval.shape, ytrain.shape, yval.shape

((714, 642), (696, 642), (714,), (696,))

In [38]:
features[mask_train].shape

(714, 642)

In [39]:
# The time-based split (masks, Xtrain/Xval, ytrain/yval) is already built by
# the earlier split cell; recomputing it here was a verbatim duplicate.
# Check its invariants instead: the masks must not overlap and the two parts
# must account for every row.
assert not (mask_train & mask_val).any()
assert len(Xtrain) + len(Xval) == len(features)
assert len(ytrain) == len(Xtrain) and len(yval) == len(Xval)
Xtrain.shape, Xval.shape, ytrain.shape, yval.shape

((714, 642), (696, 642), (714,), (696,))

In [40]:
# Titles on each side of the split.
title_train = df_limpo.loc[mask_train, 'title']
title_val = df_limpo.loc[mask_val, 'title']

# min_df: minimum number of titles a term must appear in to become a column.
title_vec = TfidfVectorizer(min_df=2, ngram_range=(1, 2))

# The vocabulary is learned from the training titles only and then reused
# unchanged on the validation titles.
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

In [41]:
title_bow_train.shape

(714, 1134)

In [43]:
# Concatenando as variávels numéricas com as geradas pelo TfidfVectorizer
Xtrain_wtitle = sparse.hstack([Xtrain, title_bow_train])
Xval_wtitle = sparse.hstack([Xval, title_bow_val])

In [44]:
Xtrain_wtitle.shape, Xval_wtitle.shape

((714, 1776), (696, 1776))

# 4 RF

In [45]:
Xtrain_wtitle.shape

(714, 1776)

In [46]:
# Random forest on the stacked features; class_weight='balanced' reweights the
# classes and random_state pins the seed.
mdl_rf = RandomForestClassifier(
    n_estimators=1000,
    min_samples_leaf=1,
    class_weight='balanced',
    random_state=0,
    n_jobs=4,
).fit(Xtrain_wtitle, ytrain)
mdl_rf

RandomForestClassifier(class_weight='balanced', n_estimators=1000, n_jobs=4,
                       random_state=0)

In [47]:
# Keep only the second probability column (the positive class) for scoring.
rf_proba_val = mdl_rf.predict_proba(Xval_wtitle)
p_rf = rf_proba_val[:, 1]

In [48]:
# The result may have come out different because of the random_state.
# Returns (average precision, ROC AUC) of the forest on the validation split.
average_precision_score(yval, p_rf), roc_auc_score(yval, p_rf)

(0.39605472026665745, 0.683620535448095)

In [76]:
# RDF
#  (0.39545173300116343, 0.6776220857432473)

# 5 LGBM

In [49]:
# Hyperparameters in positional order: learning rate, max depth,
# min child samples, subsample, colsample by tree, number of estimators,
# TF-IDF min_df, TF-IDF maximum n-gram length.
# (Presumably the output of a hyperparameter search -- the scores for this
# model are labeled "LightGBM HTuned"; confirm the origin.)
params = [
    0.003924937303997735,
    10,
    9,
    0.42896739513988846,
    0.08080324515701484,
    196,
    1,
    5,
]

(lr, max_depth, min_child_samples,
 subsample, colsample_bytree, n_estimators, min_df) = params[:7]
ngram_range = (1, params[7])

# NOTE: this re-fits `title_vec` and rebuilds `title_bow_*` / `X*_wtitle`,
# replacing the versions the random forest was trained on.
title_vec = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

Xtrain_wtitle = sparse.hstack([Xtrain, title_bow_train])
Xval_wtitle = sparse.hstack([Xval, title_bow_val])

mdl_lgbm = LGBMClassifier(
    learning_rate=lr,
    num_leaves=2 ** max_depth,
    max_depth=max_depth,
    min_child_samples=min_child_samples,
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    bagging_freq=1,
    n_estimators=n_estimators,
    random_state=0,
    class_weight="balanced",
    n_jobs=6,
)
mdl_lgbm.fit(Xtrain_wtitle, ytrain)

# Positive-class probabilities on the validation split.
p_lgbm = mdl_lgbm.predict_proba(Xval_wtitle)[:, 1]






In [50]:
average_precision_score(yval, p_lgbm), roc_auc_score(yval, p_lgbm)

(0.39010753360298556, 0.6728221334446366)

In [None]:
# LightGBM HTuned
# (0.39010753360298556, 0.6728221334446366)

# 7 Logistic Reg

Usamos `make_pipeline` para aplicar o scaler e, em seguida, o modelo, em sequência.

In [51]:
from sklearn.pipeline import make_pipeline

In [52]:
# CSR copies of the stacked matrices, used only by the logistic-regression pipeline.
Xtrain_wtitle2, Xval_wtitle2 = (
    csr_matrix(stacked.copy()) for stacked in (Xtrain_wtitle, Xval_wtitle)
)

# Scale first, then fit the L2-penalized logistic regression; the pipeline
# applies the same fitted scaler again at prediction time.
lr_pipeline = make_pipeline(
    MaxAbsScaler(),
    LogisticRegression(C=0.5, penalty='l2', n_jobs=6, random_state=0),
)
lr_pipeline.fit(Xtrain_wtitle2, ytrain)

Pipeline(steps=[('maxabsscaler', MaxAbsScaler()),
                ('logisticregression',
                 LogisticRegression(C=0.5, n_jobs=6, random_state=0))])

In [53]:
# Second probability column (positive class) from the logistic-regression
# pipeline on the validation matrix.
p_lr = lr_pipeline.predict_proba(Xval_wtitle2)[:, 1]

In [54]:
average_precision_score(yval, p_lr), roc_auc_score(yval, p_lr)

(0.4295233599530305, 0.671194323534673)

In [None]:
# Logistic
# (0.4295233599530305, 0.671194323534673)

# 8 Ensemble

(0.3969764593456903, 0.677681712479876) RF  
(0.4120975561037116, 0.677729413869179) LGBM  
(0.4176249062603176, 0.6702999224852424)LR  

(0.4050885250264036, 0.6983244887007335)LGBM NGRAM 2.3

````
# RAFAEL
# RDF
#  (0.39545173300116343, 0.6776220857432473)
# LightGBM HTuned
# (0.39010753360298556, 0.6728221334446366)
# Logistic
# (0.4295233599530305, 0.671194323534673)
````


In [55]:
# Simple ensemble: unweighted mean of the three models' validation
# probabilities, scored as (average precision, ROC AUC).
p = (p_lr + p_rf + p_lgbm)/3
average_precision_score(yval, p), roc_auc_score(yval, p)

(0.43406392130802807, 0.6988253532884146)

### Pearson Corr

O LightGBM tem pouca correlação com a Logistic e o RandomForest, indicando assim que eles preveem os dados de forma bem diferente entre si. Então, usando-os juntos, é sinal de que vai haver ganho, pois um vai puxar para cima onde o outro não puxaria.


In [56]:
pd.DataFrame({"LR": p_lr, "RF": p_rf, "LGBM": p_lgbm}).corr()

Unnamed: 0,LR,RF,LGBM
LR,1.0,0.734372,0.531961
RF,0.734372,1.0,0.629294
LGBM,0.531961,0.629294,1.0


In [61]:
# Tentao apenas 2 deles
p = 0.5*p_lgbm + 0.5*p_lr
average_precision_score(yval, p), roc_auc_score(yval, p)

(0.4427779169972088, 0.6897978653628287)

In [None]:
## RAFAEL - BEST SCORE
# (0.4427779169972088, 0.6897978653628287) 0.5/0.5

In [None]:
# RDF
#  (0.39545173300116343, 0.6776220857432473)
# LightGBM HTuned
# (0.39010753360298556, 0.6728221334446366)
# Logistic
# (0.4295233599530305, 0.671194323534673)

# 9 Salvar modelos

In [63]:
import joblib as jb

In [64]:
# Persist the two blended models, then the TF-IDF vectorizer both of them
# were trained with. The vectorizer goes last so its path list stays the
# cell's displayed value.
for fitted_model, out_path in (
    (mdl_lgbm, "lgbm_2021-12-11.pkl.z"),
    (lr_pipeline, "logistic_reg_2021-12-11.pkl.z"),
):
    jb.dump(fitted_model, out_path)
jb.dump(title_vec, "title_vectorizer_2021-12-11.pkl.z")

['title_vectorizer_2021-12-11.pkl.z']

## Observação

Fizemos, mas não testamos em test. Train/Test.

O mais interessante é pôr o modelo em produção o mais cedo possível e, de lá, testar se a solução está valendo ou não.

In [None]:
# CESAR EXECUTION

In [29]:
# Pickle the LightGBM model. The context manager closes the file even when
# dump() raises, which the manual open()/close() pair did not guarantee.
with open('lgbm_20200911', 'wb') as lgbm_20200911:
    pickle.dump(mdl_lgbm, lgbm_20200911)

In [30]:
# Pickle the random forest. The context manager closes the file even when
# dump() raises, which the manual open()/close() pair did not guarantee.
with open('random_forest_20200911', 'wb') as random_forest_20200911:
    pickle.dump(mdl_rf, random_forest_20200911)

In [31]:
# Pickle the TF-IDF vectorizer. The context manager closes the file even when
# dump() raises, which the manual open()/close() pair did not guarantee.
with open('title_vectorizer_20200911', 'wb') as title_vectorizer_20200911:
    pickle.dump(title_vec, title_vectorizer_20200911)

In [32]:
# Compressed joblib copies of the two tree models, then of the vectorizer
# (dumped last so its path list stays the cell's displayed value).
# NOTE(review): in the notebook as shown, `title_vec` is last re-fit in the
# LightGBM cell, while `mdl_rf` is trained on features from the earlier
# vectorizer -- confirm the saved forest/vectorizer pair actually match.
model_files = {
    "lgbm_20200911.pkl.z": mdl_lgbm,
    "random_forest_20200911.pkl.z": mdl_rf,
}
for out_path, fitted_model in model_files.items():
    jb.dump(fitted_model, out_path)
jb.dump(title_vec, "title_vectorizer_20200911.pkl.z")

['title_vectorizer_20200911.pkl.z']

In [33]:
# Reload the pickled forest, closing the file handle once it has been read
# (the bare open() inside pickle.load() left it open).
# pickle.load can execute arbitrary code: only use it on files this notebook
# wrote itself.
with open("random_forest_20200911", "rb") as rf_file:
    mdl_rf = pickle.load(rf_file)