In [1]:
import pandas as pd
import numpy as np
import re
import time

import bs4 as bs4
import json

import glob
import tqdm

# Show up to 131 columns when displaying wide frames.
# The full option key is used on purpose: the regex shorthand "max.columns"
# matches more than one registered option on recent pandas and raises OptionError.
pd.set_option("display.max_columns", 131)

# Reference for strftime/strptime format codes: https://strftime.org/
%matplotlib inline
# %pylab populates the interactive namespace from numpy and matplotlib
# (see the message printed below this cell).
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the labelled CSV (first column is the index) and keep only rows
# that have both a label ("y") and a title.
df = (
    pd.read_csv("labels_curso - to_label_2.csv", index_col=0)
    .dropna(subset=["y", "watch-title"])
)

In [3]:
# Fraction of rows that are exact duplicates of an earlier row.
df.duplicated().mean()

0.0

In [4]:
# Fraction of rows whose title repeats the title of an earlier row.
df.duplicated(['watch-title']).mean()

0.003108003108003108

In [5]:
# (rows, columns) after dropping rows without label or title.
df.shape

(1287, 16)

In [6]:
# Keep only rows that carry a publication-date string.
has_date = df['watch-time-text'].notnull()
df = df[has_date]

# Drop rows whose date text mentions "horas" (presumably relative
# "N hours ago" timestamps, which have no calendar date to parse).
mentions_hours = df['watch-time-text'].str.contains('horas')
df = df[~mentions_hours.astype(bool)]
df.shape

(1274, 16)

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [8]:
# Cleaned frame, indexed like df; starts with the raw title only and
# receives the parsed columns in the sections below.
df_limpo = pd.DataFrame(index=df.index).assign(title=df['watch-title'])

## 1. Date Cleaner

In [9]:
# Split strings shaped like "<day> de <month abbr>. de <year>" into three
# columns: 0 = day, 1 = month abbreviation, 2 = year.
clean_date = df['watch-time-text'].str.extract(r"(\d+) de ([a-z]+)\. de (\d+)")

# Portuguese month abbreviations -> English abbreviations parsed by "%b".
mapa_meses = {"jan": "Jan",
              "fev": "Feb",
              "mar": "Mar",
              "abr": "Apr",
              "mai": "May",
              "jun": "Jun",
              "jul": "Jul",
              "ago": "Aug",
              "set": "Sep",
              "out": "Oct",
              "nov": "Nov",
              "dez": "Dec"}

# Zero-pad single-digit days. str.zfill passes missing values through,
# whereas calling len() on a NaN (row that did not match the pattern) crashes.
clean_date[0] = clean_date[0].str.zfill(2)
clean_date[1] = clean_date[1].map(mapa_meses)

# Fail early, and readably, if any row did not match the expected format
# or uses a month abbreviation missing from mapa_meses.
unparsed = clean_date.isnull().any(axis=1)
if unparsed.any():
    examples = df.loc[unparsed, 'watch-time-text'].head().tolist()
    raise ValueError(
        "{} date string(s) could not be parsed, e.g. {}".format(int(unparsed.sum()), examples)
    )

# Re-assemble "DD Mon YYYY" and parse it.
clean_date = clean_date[0] + " " + clean_date[1] + " " + clean_date[2]
df_limpo['date'] = pd.to_datetime(clean_date, format="%d %b %Y")

## 2. Views Cleaner

In [10]:
# Parse the view-counter text into an integer number of views.
# The pattern captures every dot-separated digit group ("." is stripped below,
# i.e. it is treated as a thousands separator). The earlier pattern
# (\d+\.?\d*) stopped after the second group, so "1.234.567" became 1234.
views = (
    df['watch-view-count']
    .str.extract(r"(\d+(?:\.\d+)*)", expand=False)
    # regex=False: remove literal dots regardless of the pandas-version default.
    .str.replace(".", "", regex=False)
    # Rows without a parsable counter count as 0 views.
    .fillna(0)
    .astype(int)
)
df_limpo['views'] = views

## 3. Features

In [11]:
# Target labels, copied so later changes never write back into df.
y = df['y'].copy()
# Empty feature table sharing the index of the cleaned frame.
features = pd.DataFrame(index=df_limpo.index)


In [12]:
# Age of each video in days, measured against the fixed reference date 2020-06-01.
age_days = (pd.to_datetime('2020-06-01') - df_limpo['date']) / np.timedelta64(1, 'D')
# An age of exactly 0 days is counted as 1 to avoid dividing by zero.
age_days = age_days.replace(0, 1)

# Only the raw view count and the average views per day are kept as features;
# the age itself is just a helper and is not stored.
features['views'] = df_limpo['views']
features['day_views'] = features['views'] / age_days

In [13]:
# Time-based split: rows dated before the cutoff train the model,
# rows dated on or after it are held out for validation.
split_date = "2019-09-01"
published = df_limpo['date']
mask_train = published < split_date
mask_val = published >= split_date

Xtrain, ytrain = features[mask_train], y[mask_train]
Xval, yval = features[mask_val], y[mask_val]
Xtrain.shape, Xval.shape, ytrain.shape, yval.shape

((636, 2), (638, 2), (636,), (638,))

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
# Titles on each side of the time-based split.
title_train = df_limpo.loc[mask_train, 'title']
title_val = df_limpo.loc[mask_val, 'title']

# TF-IDF over titles; min_df=2 ignores terms seen in fewer than two training titles.
# Variant also tried: TfidfVectorizer(min_df=2, ngram_range=(1, 2)).
title_vec = TfidfVectorizer(min_df=2)
# The vocabulary is fitted on the training titles only, then reused for validation.
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)


In [16]:
# (training rows, vocabulary size) of the TF-IDF matrix.
title_bow_train.shape

(636, 474)

In [17]:
from scipy.sparse import hstack, vstack

# Final design matrices: numeric feature columns first, TF-IDF title columns after.
Xtrain_wtitle = hstack((Xtrain, title_bow_train))
Xval_wtitle = hstack((Xval, title_bow_val))

In [26]:
# Sanity check: both matrices must have the same number of columns.
Xtrain_wtitle.shape, Xval_wtitle.shape

((636, 476), (638, 476))

## 4. Random Forest

In [61]:
# Random forest baseline; class_weight="balanced" reweights the classes
# and random_state pins the result.
rf_params = dict(
    n_estimators=1000,
    min_samples_leaf=2,
    class_weight="balanced",
    random_state=0,
    n_jobs=6,
)
mdl = RandomForestClassifier(**rf_params)
mdl.fit(Xtrain_wtitle, ytrain)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=6, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [62]:
# Validation scores: second column of predict_proba (probability of the class at index 1).
p = mdl.predict_proba(Xval_wtitle)[:, 1]

In [63]:
from sklearn.metrics import roc_auc_score, average_precision_score

In [64]:
# Average precision of the random forest on the validation split.
average_precision_score(yval, p)

0.18187105538013426

In [65]:
# ROC AUC of the random forest on the validation split.
roc_auc_score(yval, p)

0.638463505412393

ap 0.14515110965950923 auc 0.6286993940458417 - min_df=2, min_samples_leaf=1

ap 0.18187105538013426 auc 0.638463505412393 - min_samples_leaf=2

ap 0.14627188339357072 auc 0.6273235561019876 - n_estimators=100, min_samples_leaf=2

ap 0.14662005939653658 auc 0.6072129036035245 - min_df=1, ngram_range=(1,2)

ap 0.1538379848663552 auc 0.6338807411961007 - min_df=2, ngram_range=(1,2)

RF: ap 0.18187105538013426 auc 0.638463505412393 - min_samples_leaf=2

## 5. LightGBM

In [67]:
from lightgbm import LGBMClassifier

In [68]:
# LightGBM with library defaults, apart from class balancing and a fixed seed.
lgbm_params = dict(
    class_weight="balanced",
    random_state=0,
    n_jobs=6,
)
mdl = LGBMClassifier(**lgbm_params)
mdl.fit(Xtrain_wtitle, ytrain)

LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
               colsample_bytree=1.0, importance_type='split', learning_rate=0.1,
               max_depth=-1, min_child_samples=20, min_child_weight=0.001,
               min_split_gain=0.0, n_estimators=100, n_jobs=6, num_leaves=31,
               objective=None, random_state=0, reg_alpha=0.0, reg_lambda=0.0,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

In [69]:
# Validation scores from the default LightGBM model (second predict_proba column).
p = mdl.predict_proba(Xval_wtitle)[:, 1]



In [70]:
# (average precision, ROC AUC) of the default LightGBM model on the validation split.
average_precision_score(yval, p), roc_auc_score(yval, p)

(0.1400434244002939, 0.624683921925164)

## 6. Bayesian Optimization

In [73]:
from skopt import forest_minimize

In [74]:
def tune_lgbm(params):
    """Objective for the hyper-parameter search: negative validation average precision.

    ``params`` is a positional sequence ordered as
    ``[learning_rate, max_depth, min_child_samples, subsample,
    colsample_bytree, n_estimators, min_df, ngram upper bound]``.

    The title TF-IDF vectorizer and the LightGBM classifier are refitted on
    the training split using the notebook-level ``title_train``, ``Xtrain``
    and ``ytrain``; scoring uses ``title_val``, ``Xval`` and ``yval``.
    The parameters and the validation ROC AUC are printed as a side effect.

    Returns the negated average precision, so a minimizer maximizes it.
    """
    print(params)
    (lr, max_depth, min_child_samples, subsample,
     colsample_bytree, n_estimators, min_df, max_ngram) = (params[i] for i in range(8))

    # Text features depend on the sampled min_df / n-gram range, so rebuild them.
    vectorizer = TfidfVectorizer(min_df=min_df, ngram_range=(1, max_ngram))
    bow_train = vectorizer.fit_transform(title_train)
    bow_val = vectorizer.transform(title_val)

    train_matrix = hstack([Xtrain, bow_train])
    val_matrix = hstack([Xval, bow_val])

    model = LGBMClassifier(
        learning_rate=lr,
        # Leaf budget tied to the depth: a full binary tree of that depth.
        num_leaves=2 ** max_depth,
        max_depth=max_depth,
        min_child_samples=min_child_samples,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        bagging_freq=1,
        n_estimators=n_estimators,
        random_state=0,
        class_weight="balanced",
        n_jobs=6,
    )
    model.fit(train_matrix, ytrain)

    val_scores = model.predict_proba(val_matrix)[:, 1]

    print(roc_auc_score(yval, val_scores))

    return -average_precision_score(yval, val_scores)


# Search space: one entry per positional argument read by tune_lgbm, in the same order.
space = [
    (1e-3, 1e-1, 'log-uniform'),  # lr
    (1, 10),                      # max_depth
    (1, 20),                      # min_child_samples
    (0.05, 1.),                   # subsample
    (0.05, 1.),                   # colsample_bytree
    (100, 1000),                  # n_estimators
    (1, 5),                       # min_df
    (1, 5),                       # upper bound of ngram_range
]

# 20 random evaluations first, then model-guided proposals up to 50 calls in total.
res = forest_minimize(
    tune_lgbm,
    space,
    n_calls=50,
    n_random_starts=20,
    random_state=160745,
    verbose=1,
)

Iteration No: 1 started. Evaluating function at random point.
[0.009944912110647982, 5, 1, 0.4677107511929402, 0.49263223036174764, 272, 3, 1]




0.6151660652896553
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.5293
Function value obtained: -0.1377
Current minimum: -0.1377
Iteration No: 2 started. Evaluating function at random point.
[0.053887464791860025, 1, 15, 0.7437489153990157, 0.8675167974293533, 549, 3, 4]




0.6034179049350797
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.5538
Function value obtained: -0.1545
Current minimum: -0.1545
Iteration No: 3 started. Evaluating function at random point.
[0.004151454520895999, 6, 20, 0.8682075103820793, 0.9491436163200662, 411, 4, 3]




0.6070829900275592
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 1.0932
Function value obtained: -0.1378
Current minimum: -0.1545
Iteration No: 4 started. Evaluating function at random point.
[0.0014099928811969545, 9, 9, 0.6502182010234373, 0.6866210554187129, 828, 5, 2]




0.5945108958149842
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 2.0631
Function value obtained: -0.1404
Current minimum: -0.1545
Iteration No: 5 started. Evaluating function at random point.
[0.08530558241838007, 8, 19, 0.2137736299768322, 0.1313765544201984, 961, 4, 1]




0.5334545557860044
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 0.6325
Function value obtained: -0.1040
Current minimum: -0.1545
Iteration No: 6 started. Evaluating function at random point.
[0.003567949451535685, 10, 19, 0.7232951768944309, 0.7298538828427115, 939, 4, 3]




0.6106486348268318
Iteration No: 6 ended. Evaluation done at random point.
Time taken: 2.0365
Function value obtained: -0.1433
Current minimum: -0.1545
Iteration No: 7 started. Evaluating function at random point.
[0.014828577273549474, 7, 1, 0.18428087097824575, 0.3261556557915816, 274, 1, 2]




0.6129783788391057
Iteration No: 7 ended. Evaluation done at random point.
Time taken: 1.3242
Function value obtained: -0.1347
Current minimum: -0.1545
Iteration No: 8 started. Evaluating function at random point.
[0.0015212976972079912, 3, 12, 0.44234694306528044, 0.399351303640462, 272, 3, 5]




0.5785436258772055
Iteration No: 8 ended. Evaluation done at random point.
Time taken: 0.3688
Function value obtained: -0.1276
Current minimum: -0.1545
Iteration No: 9 started. Evaluating function at random point.
[0.01946212855369041, 9, 18, 0.5235636153223084, 0.6728679300083596, 747, 4, 5]




0.5845668664943034
Iteration No: 9 ended. Evaluation done at random point.
Time taken: 1.2369
Function value obtained: -0.1368
Current minimum: -0.1545
Iteration No: 10 started. Evaluating function at random point.
[0.0012116790683302117, 3, 2, 0.06616307483844217, 0.23025600705315752, 677, 2, 5]




0.5363241185328295
Iteration No: 10 ended. Evaluation done at random point.
Time taken: 0.9376
Function value obtained: -0.1098
Current minimum: -0.1545
Iteration No: 11 started. Evaluating function at random point.
[0.0053139776214487944, 6, 9, 0.14251441334450304, 0.8175761405215897, 297, 1, 5]




0.5209534903542916
Iteration No: 11 ended. Evaluation done at random point.
Time taken: 0.5526
Function value obtained: -0.1035
Current minimum: -0.1545
Iteration No: 12 started. Evaluating function at random point.
[0.0068572961982704935, 10, 5, 0.2390386584472456, 0.49053406102209746, 176, 2, 4]




0.5925220899508481
Iteration No: 12 ended. Evaluation done at random point.
Time taken: 1.0181
Function value obtained: -0.1315
Current minimum: -0.1545
Iteration No: 13 started. Evaluating function at random point.
[0.00781968225875022, 3, 4, 0.7078936710077383, 0.31818755505678337, 275, 4, 4]




0.621885387959201
Iteration No: 13 ended. Evaluation done at random point.
Time taken: 0.4544
Function value obtained: -0.1637
Current minimum: -0.1637
Iteration No: 14 started. Evaluating function at random point.
[0.017293945600511968, 2, 15, 0.9007557574888567, 0.41026441194439994, 316, 5, 1]




0.6508083075262096
Iteration No: 14 ended. Evaluation done at random point.
Time taken: 0.2249
Function value obtained: -0.1594
Current minimum: -0.1637
Iteration No: 15 started. Evaluating function at random point.
[0.012250750764764855, 8, 6, 0.5976582413192033, 0.2474882432951916, 516, 4, 4]




0.5916413330681591
Iteration No: 15 ended. Evaluation done at random point.
Time taken: 1.1169
Function value obtained: -0.1571
Current minimum: -0.1637
Iteration No: 16 started. Evaluating function at random point.
[0.018353598126553926, 4, 3, 0.47305622526323254, 0.1404164811277527, 133, 4, 1]




0.5930619086853993
Iteration No: 16 ended. Evaluation done at random point.
Time taken: 0.3907
Function value obtained: -0.1609
Current minimum: -0.1637
Iteration No: 17 started. Evaluating function at random point.
[0.0010383234748454694, 9, 19, 0.9256771571832196, 0.9321438677645206, 312, 4, 3]




0.5944114555217774
Iteration No: 17 ended. Evaluation done at random point.
Time taken: 0.8832
Function value obtained: -0.1301
Current minimum: -0.1637
Iteration No: 18 started. Evaluating function at random point.
[0.004955229758078229, 5, 5, 0.06939551310802591, 0.4193273080472823, 725, 4, 1]




0.5523197999829531
Iteration No: 18 ended. Evaluation done at random point.
Time taken: 1.3287
Function value obtained: -0.1236
Current minimum: -0.1637
Iteration No: 19 started. Evaluating function at random point.
[0.0699516121742407, 9, 10, 0.6477856515609233, 0.8594430701440198, 616, 1, 1]




0.5892831775435405
Iteration No: 19 ended. Evaluation done at random point.
Time taken: 2.7411
Function value obtained: -0.1168
Current minimum: -0.1637
Iteration No: 20 started. Evaluating function at random point.
[0.0014752743467850462, 5, 4, 0.9747950537021096, 0.982207187458162, 909, 2, 4]




0.5837713441486491
Iteration No: 20 ended. Evaluation done at random point.
Time taken: 2.9088
Function value obtained: -0.1434
Current minimum: -0.1637
Iteration No: 21 started. Searching for the next optimal point.
[0.06678239559066271, 3, 3, 0.5944925708055352, 0.19153366949320433, 498, 5, 5]




0.5632298207233571
Iteration No: 21 ended. Search finished for the next optimal point.
Time taken: 0.9581
Function value obtained: -0.1684
Current minimum: -0.1684
Iteration No: 22 started. Searching for the next optimal point.
[0.0218706109432929, 1, 3, 0.3130381066286577, 0.16628854748923497, 392, 5, 4]




0.5949796857686733
Iteration No: 22 ended. Search finished for the next optimal point.
Time taken: 0.8183
Function value obtained: -0.1687
Current minimum: -0.1687
Iteration No: 23 started. Searching for the next optimal point.
[0.033034733270233485, 2, 2, 0.26560368628748854, 0.050588864414525224, 667, 5, 5]




0.5834588175128562
Iteration No: 23 ended. Search finished for the next optimal point.
Time taken: 0.8209
Function value obtained: -0.1760
Current minimum: -0.1760
Iteration No: 24 started. Searching for the next optimal point.
[0.04602461606959627, 3, 4, 0.11794534323486512, 0.33064224170859646, 972, 5, 4]




0.5924936784385033
Iteration No: 24 ended. Search finished for the next optimal point.
Time taken: 1.8213
Function value obtained: -0.1410
Current minimum: -0.1760
Iteration No: 25 started. Searching for the next optimal point.
[0.004411724909981534, 1, 1, 0.10838806301023426, 0.20779096317359774, 388, 5, 3]




0.5340654033014177
Iteration No: 25 ended. Search finished for the next optimal point.
Time taken: 0.9842
Function value obtained: -0.1092
Current minimum: -0.1760
Iteration No: 26 started. Searching for the next optimal point.
[0.027691873004735343, 1, 6, 0.25462975552371864, 0.17630015238258756, 177, 5, 4]




0.5896525272040231
Iteration No: 26 ended. Search finished for the next optimal point.
Time taken: 0.6716
Function value obtained: -0.1480
Current minimum: -0.1760
Iteration No: 27 started. Searching for the next optimal point.
[0.06867403550375935, 2, 2, 0.08714820916502644, 0.211145435457258, 612, 5, 4]




0.5068329687189249
Iteration No: 27 ended. Search finished for the next optimal point.
Time taken: 0.9356
Function value obtained: -0.1528
Current minimum: -0.1760
Iteration No: 28 started. Searching for the next optimal point.
[0.008568807086520799, 1, 4, 0.25891922811229284, 0.15960430102145823, 541, 5, 1]




0.6113589226354519
Iteration No: 28 ended. Search finished for the next optimal point.
Time taken: 0.8971
Function value obtained: -0.1308
Current minimum: -0.1760
Iteration No: 29 started. Searching for the next optimal point.
[0.019137783750985106, 4, 2, 0.3520445125141224, 0.10817752770075473, 843, 3, 5]




0.5668949058158367
Iteration No: 29 ended. Search finished for the next optimal point.
Time taken: 1.9523
Function value obtained: -0.1326
Current minimum: -0.1760
Iteration No: 30 started. Searching for the next optimal point.
[0.03151323293820827, 2, 2, 0.19510525967243225, 0.3184089653542606, 730, 5, 5]




0.5974088700741541
Iteration No: 30 ended. Search finished for the next optimal point.
Time taken: 1.0519
Function value obtained: -0.1922
Current minimum: -0.1922
Iteration No: 31 started. Searching for the next optimal point.
[0.09978037360271744, 3, 3, 0.13305841683236241, 0.36080714220572513, 868, 5, 5]




0.5328579140267636
Iteration No: 31 ended. Search finished for the next optimal point.
Time taken: 1.7176
Function value obtained: -0.1262
Current minimum: -0.1922
Iteration No: 32 started. Searching for the next optimal point.
[0.02109538685865611, 2, 5, 0.15854539991319244, 0.06504129113420354, 743, 5, 5]




0.5832883484387874
Iteration No: 32 ended. Search finished for the next optimal point.
Time taken: 1.0901
Function value obtained: -0.1766
Current minimum: -0.1922
Iteration No: 33 started. Searching for the next optimal point.
[0.010870859941957262, 2, 8, 0.1334290560807414, 0.9698081366511343, 674, 5, 5]




0.5614683069579793
Iteration No: 33 ended. Search finished for the next optimal point.
Time taken: 1.0704
Function value obtained: -0.1170
Current minimum: -0.1922
Iteration No: 34 started. Searching for the next optimal point.
[0.038361027685163404, 2, 1, 0.9251385602429641, 0.7348075383778321, 616, 5, 5]




0.5860584708924056
Iteration No: 34 ended. Search finished for the next optimal point.
Time taken: 1.9092
Function value obtained: -0.1681
Current minimum: -0.1922
Iteration No: 35 started. Searching for the next optimal point.
[0.03683291231200575, 2, 18, 0.12593276473604814, 0.4164507123600083, 759, 5, 5]




0.47050885018609545
Iteration No: 35 ended. Search finished for the next optimal point.
Time taken: 1.2487
Function value obtained: -0.0956
Current minimum: -0.1922
Iteration No: 36 started. Searching for the next optimal point.
[0.022521337968405487, 2, 1, 0.21975691968178762, 0.41946067312680463, 126, 5, 5]




0.5950365087933631
Iteration No: 36 ended. Search finished for the next optimal point.
Time taken: 0.8003
Function value obtained: -0.1583
Current minimum: -0.1922
Iteration No: 37 started. Searching for the next optimal point.
[0.017338576195684335, 2, 10, 0.2075350092711376, 0.08043867015811222, 782, 5, 5]




0.5076284910645793
Iteration No: 37 ended. Search finished for the next optimal point.
Time taken: 1.1951
Function value obtained: -0.1020
Current minimum: -0.1922
Iteration No: 38 started. Searching for the next optimal point.
[0.027035241802913952, 1, 3, 0.33397697274506905, 0.3140136337076569, 802, 5, 5]




0.6133761400119329
Iteration No: 38 ended. Search finished for the next optimal point.
Time taken: 1.7938
Function value obtained: -0.2142
Current minimum: -0.2142
Iteration No: 39 started. Searching for the next optimal point.
[0.01822314678946355, 2, 2, 0.28367140616017356, 0.9088323014260379, 956, 5, 5]




0.6066426115862147
Iteration No: 39 ended. Search finished for the next optimal point.
Time taken: 1.9790
Function value obtained: -0.1652
Current minimum: -0.2142
Iteration No: 40 started. Searching for the next optimal point.
[0.03703199532154493, 1, 6, 0.5830862734918894, 0.2513609753808816, 681, 5, 5]




0.605946529533767
Iteration No: 40 ended. Search finished for the next optimal point.
Time taken: 1.2742
Function value obtained: -0.1652
Current minimum: -0.2142
Iteration No: 41 started. Searching for the next optimal point.
[0.032438141813694026, 2, 5, 0.30181080840002394, 0.6945275234853964, 689, 5, 5]




0.5812427195499615
Iteration No: 41 ended. Search finished for the next optimal point.
Time taken: 1.2818
Function value obtained: -0.1457
Current minimum: -0.2142
Iteration No: 42 started. Searching for the next optimal point.
[0.031761906463518064, 2, 4, 0.16355582422222004, 0.0955735072107845, 797, 4, 5]




0.6143705429440008
Iteration No: 42 ended. Search finished for the next optimal point.
Time taken: 1.3128
Function value obtained: -0.1634
Current minimum: -0.2142
Iteration No: 43 started. Searching for the next optimal point.
[0.03248836984899965, 1, 2, 0.47082729434562953, 0.45729317006193193, 869, 4, 5]




0.6085745944256613
Iteration No: 43 ended. Search finished for the next optimal point.
Time taken: 1.3756
Function value obtained: -0.1710
Current minimum: -0.2142
Iteration No: 44 started. Searching for the next optimal point.
[0.023844084301267533, 1, 2, 0.14663846178515247, 0.06283971573719856, 710, 5, 5]




0.5550188936557093
Iteration No: 44 ended. Search finished for the next optimal point.
Time taken: 0.9516
Function value obtained: -0.1824
Current minimum: -0.2142
Iteration No: 45 started. Searching for the next optimal point.
[0.03129493300253853, 1, 2, 0.3247686541504322, 0.8140630433078869, 836, 3, 5]




0.6058612949967327
Iteration No: 45 ended. Search finished for the next optimal point.
Time taken: 1.4242
Function value obtained: -0.1635
Current minimum: -0.2142
Iteration No: 46 started. Searching for the next optimal point.
[0.025093086072000576, 1, 3, 0.1514753247952883, 0.253725583373641, 788, 4, 5]




0.5864420263090605
Iteration No: 46 ended. Search finished for the next optimal point.
Time taken: 1.0711
Function value obtained: -0.1748
Current minimum: -0.2142
Iteration No: 47 started. Searching for the next optimal point.
[0.027481350757176202, 1, 3, 0.6915054766001799, 0.07762599279047386, 823, 1, 4]




0.6065715828053527
Iteration No: 47 ended. Search finished for the next optimal point.
Time taken: 1.5396
Function value obtained: -0.1377
Current minimum: -0.2142
Iteration No: 48 started. Searching for the next optimal point.
[0.028130333329282612, 1, 1, 0.21913851535604434, 0.22160588766161043, 611, 5, 5]




0.5835724635622354
Iteration No: 48 ended. Search finished for the next optimal point.
Time taken: 1.1683
Function value obtained: -0.1761
Current minimum: -0.2142
Iteration No: 49 started. Searching for the next optimal point.
[0.019087344512738236, 1, 3, 0.6568054299161659, 0.46722745650919895, 200, 5, 5]




0.6205216353666506
Iteration No: 49 ended. Search finished for the next optimal point.
Time taken: 0.7609
Function value obtained: -0.1847
Current minimum: -0.2142
Iteration No: 50 started. Searching for the next optimal point.
[0.016238983884766554, 1, 1, 0.19764622784463604, 0.0686305109803104, 981, 5, 5]




0.5663834985936302
Iteration No: 50 ended. Search finished for the next optimal point.
Time taken: 1.1809
Function value obtained: -0.1222
Current minimum: -0.2142


LGBM: ap 0.2142 auc 0.6133761400119329


In [76]:
# Best parameter vector found by the search, in the order of `space`.
res.x

[0.027035241802913952,
 1,
 3,
 0.33397697274506905,
 0.3140136337076569,
 802,
 5,
 5]

## 7. Logistic Regression

In [77]:
from sklearn.preprocessing import MaxAbsScaler, StandardScaler
from scipy.sparse import csr_matrix

In [99]:
# Work on converted copies so the unscaled matrices stay untouched.
# LIL format is used while overwriting columns: assigning into a CSR slice
# triggers scipy's SparseEfficiencyWarning.
Xtrain_wtitle2 = csr_matrix(Xtrain_wtitle).tolil()
Xval_wtitle2 = csr_matrix(Xval_wtitle).tolil()

scaler = StandardScaler()
#scaler = MaxAbsScaler()

# Scale only the first two columns (views, day_views); the TF-IDF columns are left as is.
# .toarray() yields a plain ndarray; .todense() yields np.matrix, which
# recent scikit-learn versions refuse as input.
Xtrain_wtitle2[:, :2] = scaler.fit_transform(Xtrain_wtitle2[:, :2].toarray())
Xval_wtitle2[:, :2] = scaler.transform(Xval_wtitle2[:, :2].toarray())

# Back to CSR, the format the estimators below received before.
Xtrain_wtitle2 = Xtrain_wtitle2.tocsr()
Xval_wtitle2 = Xval_wtitle2.tocsr()

#Xtrain_wtitle2 = scaler.fit_transform(Xtrain_wtitle2)
#Xval_wtitle2 = scaler.transform(Xval_wtitle2)

  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)


In [100]:
# Shape check: scaling must not change the matrix dimensions.
Xval_wtitle2.shape

(638, 476)

In [122]:

# L2-regularised logistic regression on the scaled matrix (C=0.5).
logreg_params = dict(
    C=0.5,
    random_state=0,
    n_jobs=6,
)
mdl = LogisticRegression(**logreg_params)
mdl.fit(Xtrain_wtitle2, ytrain)

LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=6, penalty='l2', random_state=0,
                   solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

In [123]:
# Validation scores from the logistic regression (second predict_proba column).
p = mdl.predict_proba(Xval_wtitle2)[:, 1]

In [124]:
# (average precision, ROC AUC) of the logistic regression on the validation split.
average_precision_score(yval, p), roc_auc_score(yval, p)

(0.20804541108738267, 0.6431798164616301)

ap 0.20804541108738267 auc 0.6431798164616301 - StandardScaler

ap 0.18643772721623786 auc 0.6524419694860357 - MaxAbsScaler