In [3]:
# Import libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
data = pd.read_csv("raw_data1_with_labels.csv", index_col = 0)
data.head()

Unnamed: 0,title,y,upload_date,view_count,query
0,Assessing Federated Machine Learning's Potenti...,0,2020-10-26,1,machine+learning
1,Alexa: Which is the Best Instance to Run Machi...,1,2020-10-26,28,machine+learning
2,Tesla vs comma.ai approach to machine learning...,0,2020-10-26,1,machine+learning
3,Explore Machine Learning Models with Explainab...,1,2020-10-26,8,machine+learning
4,8. Deep Learning Tutorial (Bengali) | Padding ...,0,2020-10-26,16,machine+learning


In [5]:
# Getting the labels and the title column

y = data['y']
text = data['title']

In [6]:
# It'll be create functions to process the data like the data processing notebook

def getting_numeric_features(data):
    
    df = data.copy()
    features = pd.DataFrame(index = df.index)
    
    features['date'] = pd.to_datetime(df['upload_date'])
    features['views'] = df['view_count']
    
    features['time_since_pub'] = (pd.to_datetime("2020-12-12") - features['date']) / np.timedelta64(1, 'D') 
    features['views_per_day'] = features['views'] /  features['time_since_pub']
    features.drop(columns = ["time_since_pub", "date"], inplace = True)
    
    return features
    

In [7]:
features = getting_numeric_features(data)
features.head()

Unnamed: 0,views,views_per_day
0,1,0.021277
1,28,0.595745
2,1,0.021277
3,8,0.170213
4,16,0.340426


In [8]:
# splitting the data into train and validation

mask1 = data['upload_date'] <= "2020-08-31" # 63% - be use to train the model
mask2 = data['upload_date'] > "2020-08-31" # 27% - be use to test the model

X_train, X_val = features[mask1], features[mask2]
y_train, y_val = y[mask1], y[mask2]

X_train.shape, y_train.shape, X_val.shape, y_val.shape

((573, 2), (573,), (325, 2), (325,))

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

def getting_text_features(text, mask1, mask2, min_df = 2, ngram_range = (1, 3)):
    
    X_train_text = np.squeeze(text[mask1])
    X_val_text = np.squeeze(text[mask2])
    
    
    title_vec = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
    
    X_train_bow = title_vec.fit_transform(X_train_text)
    X_val_bow = title_vec.transform(X_val_text)
    
    return X_train_bow, X_val_bow, title_vec
        
    
def final_data(x_train, x_val, x_train_bow, x_val_bow):
    
    return hstack([x_train, x_train_bow]), hstack([x_val, x_val_bow])

In [11]:
x_train_bow, x_val_bow, title_vec_rf = getting_text_features(text, mask1, mask2)

x_train_bow.shape, x_val_bow.shape

((573, 1657), (325, 1657))

In [12]:
X_train_with_title, X_val_with_title = final_data(X_train, X_val, x_train_bow, x_val_bow)
X_train_with_title.shape, X_val_with_title.shape

((573, 1659), (325, 1659))

*After getting the data prepared to feed into the model, we are building some models to compare their metrics.*

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


### Random Forest Classifier

In [14]:
mdl1 = RandomForestClassifier(n_estimators = 1000, random_state = 0, n_jobs = -1, class_weight = "balanced", min_samples_leaf= 1)
mdl1.fit(X_train_with_title, y_train)

RandomForestClassifier(class_weight='balanced', n_estimators=1000, n_jobs=-1,
                       random_state=0)

In [15]:
from sklearn.metrics import roc_auc_score, average_precision_score

p_rf = mdl1.predict_proba(X_val_with_title)[:, 1]

average_precision_score(y_val, p_rf), roc_auc_score(y_val, p_rf)

(0.47479006692854603, 0.7139520398629711)

                                                    
           (0.4398304045590999, 0.6974981833281428)  - n_estimators = 1000 - min_samples_leaf= 1 - n_gram (1,2)
           (0.4281493555354776, 0.6715716806809924)  - n_estimators = 1000 - min_samples_leaf= 2 - n_gram (1,2)
           (0.41291598480012126, 0.6572978303747534) - n_estimators = 100  - min_samples_leaf= 2 - n_gram (1,2)
           (0.4390506690818257, 0.6829648084708814)  - n_estimators = 1000 - min_samples_leaf= 1 - n_gram (1,2)
           (0.47479006692854603, 0.7139520398629711) - n_estimators = 1000 - min_samples_leaf= 1 - n_gram (1,3) ~ best

### LGBM Classifier 

In [16]:
from lightgbm import LGBMClassifier

mdl2 = LGBMClassifier(random_state = 0, class_weight="balanced", n_jobs = -1)
mdl2.fit(X_train, y_train)

LGBMClassifier(class_weight='balanced', random_state=0)

In [17]:
p = mdl2.predict_proba(X_val)[:, 1]
average_precision_score(y_val, p), roc_auc_score(y_val, p)

(0.251331677093827, 0.5179590989307589)

In [18]:
# Now we will use the scikit-optimize library (skopt) to tune the lgbm classifier

from skopt import forest_minimize

In [19]:
title_train = np.squeeze(text[mask1])
title_val = np.squeeze(text[mask2])

In [20]:
def tune_lgbm(params):
    
    lr = params[0]
    max_depth = params[1]
    min_child_samples = params[2]
    subsample = params[3]
    colsample_bytree = params[4]
    n_estimators = params[5]
    min_df = params[6]
    ngram_range = (1, params[7])
    
    title_vec = TfidfVectorizer(min_df = min_df, ngram_range =  ngram_range)
    title_bow_train = title_vec.fit_transform(title_train)
    title_bow_val = title_vec.transform(title_val)
    
    X_train_with_title = hstack([X_train, title_bow_train])
    X_val_with_title = hstack([X_val, title_bow_val])
    
    mdl = LGBMClassifier(learning_rate = lr,  max_depth=max_depth,
                        min_child_samples=min_child_samples, subsample=subsample,
                        colsample_bytree=colsample_bytree, bagging_freq = 1, n_estimators= n_estimators,
                        random_state =0, class_weight = "balanced", n_jobs=-1)
    mdl.fit(X_train_with_title, y_train)
    
    p = mdl.predict_proba(X_val_with_title)[:, 1]
    
    print(roc_auc_score(y_val, p))
    
    return -average_precision_score(y_val, p)

In [21]:
space = [(1e-3, 1e-1, 'log-uniform'), #lr
        (1, 20), # max_depth
        (1, 20), #min_child_samples
        (0.05, 1.), #subsample
        (0.05, 1.), #colsample_bytree
        (100, 1000), # n_estimators
        (1, 5), # min_df
        (1, 5)] #n_gram range
    
res = forest_minimize(tune_lgbm, space, random_state = 160475, n_random_starts = 20, n_calls = 50, verbose = 1)

Iteration No: 1 started. Evaluating function at random point.
0.61463199418665
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.1855
Function value obtained: -0.3300
Current minimum: -0.3300
Iteration No: 2 started. Evaluating function at random point.



bagging_freq is set=1, subsample_freq=0 will be ignored. Current value: bagging_freq=1




0.6103238866396761
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.2730
Function value obtained: -0.4002
Current minimum: -0.4002
Iteration No: 3 started. Evaluating function at random point.




0.5756514066230665
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.3470
Function value obtained: -0.2933
Current minimum: -0.4002
Iteration No: 4 started. Evaluating function at random point.




0.6310339458112737
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 0.4130
Function value obtained: -0.3663
Current minimum: -0.4002
Iteration No: 5 started. Evaluating function at random point.
0.6162410464029897
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 0.1380
Function value obtained: -0.3426
Current minimum: -0.4002
Iteration No: 6 started. Evaluating function at random point.




0.5836447627945603
Iteration No: 6 ended. Evaluation done at random point.
Time taken: 0.6764
Function value obtained: -0.3431
Current minimum: -0.4002
Iteration No: 7 started. Evaluating function at random point.
0.5721737776393647
Iteration No: 7 ended. Evaluation done at random point.
Time taken: 0.1440
Function value obtained: -0.3171
Current minimum: -0.4002
Iteration No: 8 started. Evaluating function at random point.




0.6471244679746704
Iteration No: 8 ended. Evaluation done at random point.
Time taken: 0.2660
Function value obtained: -0.3820
Current minimum: -0.4002
Iteration No: 9 started. Evaluating function at random point.




0.6672116682238141
Iteration No: 9 ended. Evaluation done at random point.
Time taken: 0.4710
Function value obtained: -0.4566
Current minimum: -0.4566
Iteration No: 10 started. Evaluating function at random point.




0.6217429668846672
Iteration No: 10 ended. Evaluation done at random point.
Time taken: 0.3430
Function value obtained: -0.3522
Current minimum: -0.4566
Iteration No: 11 started. Evaluating function at random point.
0.6011626699885809
Iteration No: 11 ended. Evaluation done at random point.
Time taken: 0.1781
Function value obtained: -0.3545
Current minimum: -0.4566
Iteration No: 12 started. Evaluating function at random point.
0.6161891414927851
Iteration No: 12 ended. Evaluation done at random point.
Time taken: 0.1920
Function value obtained: -0.3550
Current minimum: -0.4566
Iteration No: 13 started. Evaluating function at random point.








0.5941036022007682
Iteration No: 13 ended. Evaluation done at random point.
Time taken: 0.2741
Function value obtained: -0.3064
Current minimum: -0.4566
Iteration No: 14 started. Evaluating function at random point.
0.6198743901173052
Iteration No: 14 ended. Evaluation done at random point.
Time taken: 0.1020
Function value obtained: -0.3367
Current minimum: -0.4566
Iteration No: 15 started. Evaluating function at random point.




0.6202896293989412
Iteration No: 15 ended. Evaluation done at random point.
Time taken: 0.2190
Function value obtained: -0.3249
Current minimum: -0.4566
Iteration No: 16 started. Evaluating function at random point.




0.613178656700924
Iteration No: 16 ended. Evaluation done at random point.
Time taken: 0.5070
Function value obtained: -0.3630
Current minimum: -0.4566
Iteration No: 17 started. Evaluating function at random point.




0.6175905740683069
Iteration No: 17 ended. Evaluation done at random point.
Time taken: 1.2910
Function value obtained: -0.3415
Current minimum: -0.4566
Iteration No: 18 started. Evaluating function at random point.
0.5398889234921623
Iteration No: 18 ended. Evaluation done at random point.
Time taken: 0.0900
Function value obtained: -0.2851
Current minimum: -0.4566
Iteration No: 19 started. Evaluating function at random point.




0.6220024914356899
Iteration No: 19 ended. Evaluation done at random point.
Time taken: 0.2100
Function value obtained: -0.3352
Current minimum: -0.4566
Iteration No: 20 started. Evaluating function at random point.
0.5532803903249247
Iteration No: 20 ended. Evaluation done at random point.
Time taken: 0.4582
Function value obtained: -0.2779
Current minimum: -0.4566
Iteration No: 21 started. Searching for the next optimal point.




0.6989255683587667
Iteration No: 21 ended. Search finished for the next optimal point.
Time taken: 0.7250
Function value obtained: -0.4504
Current minimum: -0.4566
Iteration No: 22 started. Searching for the next optimal point.




0.6335253815010901
Iteration No: 22 ended. Search finished for the next optimal point.
Time taken: 1.0340
Function value obtained: -0.3842
Current minimum: -0.4566
Iteration No: 23 started. Searching for the next optimal point.
0.6302034672480017




Iteration No: 23 ended. Search finished for the next optimal point.
Time taken: 0.4210
Function value obtained: -0.3790
Current minimum: -0.4566
Iteration No: 24 started. Searching for the next optimal point.




0.696070798297519
Iteration No: 24 ended. Search finished for the next optimal point.
Time taken: 0.6790
Function value obtained: -0.4283
Current minimum: -0.4566
Iteration No: 25 started. Searching for the next optimal point.




0.6893231599709333
Iteration No: 25 ended. Search finished for the next optimal point.
Time taken: 0.8200
Function value obtained: -0.4396
Current minimum: -0.4566
Iteration No: 26 started. Searching for the next optimal point.




0.6827831412851655
Iteration No: 26 ended. Search finished for the next optimal point.
Time taken: 0.8156
Function value obtained: -0.4367
Current minimum: -0.4566
Iteration No: 27 started. Searching for the next optimal point.




0.6487335201910102
Iteration No: 27 ended. Search finished for the next optimal point.
Time taken: 1.0060
Function value obtained: -0.3864
Current minimum: -0.4566
Iteration No: 28 started. Searching for the next optimal point.
0.5501141908024499




Iteration No: 28 ended. Search finished for the next optimal point.
Time taken: 0.4020
Function value obtained: -0.2903
Current minimum: -0.4566
Iteration No: 29 started. Searching for the next optimal point.




0.6574535451053669
Iteration No: 29 ended. Search finished for the next optimal point.
Time taken: 0.5540
Function value obtained: -0.3800
Current minimum: -0.4566
Iteration No: 30 started. Searching for the next optimal point.




0.6624364164849995
Iteration No: 30 ended. Search finished for the next optimal point.
Time taken: 0.5490
Function value obtained: -0.4184
Current minimum: -0.4566
Iteration No: 31 started. Searching for the next optimal point.




0.6446330322848541
Iteration No: 31 ended. Search finished for the next optimal point.
Time taken: 0.5980
Function value obtained: -0.4079
Current minimum: -0.4566
Iteration No: 32 started. Searching for the next optimal point.




0.6851188622443682
Iteration No: 32 ended. Search finished for the next optimal point.
Time taken: 1.0940
Function value obtained: -0.4307
Current minimum: -0.4566
Iteration No: 33 started. Searching for the next optimal point.




0.7038565348281947
Iteration No: 33 ended. Search finished for the next optimal point.
Time taken: 0.5960
Function value obtained: -0.4414
Current minimum: -0.4566
Iteration No: 34 started. Searching for the next optimal point.
0.7007941451261289




Iteration No: 34 ended. Search finished for the next optimal point.
Time taken: 0.5460
Function value obtained: -0.4413
Current minimum: -0.4566
Iteration No: 35 started. Searching for the next optimal point.




0.6876622028443891
Iteration No: 35 ended. Search finished for the next optimal point.
Time taken: 0.9650
Function value obtained: -0.4240
Current minimum: -0.4566
Iteration No: 36 started. Searching for the next optimal point.




0.7001193812934704
Iteration No: 36 ended. Search finished for the next optimal point.
Time taken: 0.6830
Function value obtained: -0.4262
Current minimum: -0.4566
Iteration No: 37 started. Searching for the next optimal point.




0.6229886847295755
Iteration No: 37 ended. Search finished for the next optimal point.
Time taken: 1.0880
Function value obtained: -0.3890
Current minimum: -0.4566
Iteration No: 38 started. Searching for the next optimal point.




0.6744264507422402
Iteration No: 38 ended. Search finished for the next optimal point.
Time taken: 0.7690
Function value obtained: -0.4407
Current minimum: -0.4566
Iteration No: 39 started. Searching for the next optimal point.
0.6722983494238555




Iteration No: 39 ended. Search finished for the next optimal point.
Time taken: 0.4650
Function value obtained: -0.4455
Current minimum: -0.4566
Iteration No: 40 started. Searching for the next optimal point.
0.6534049621094156




Iteration No: 40 ended. Search finished for the next optimal point.
Time taken: 0.4720
Function value obtained: -0.4281
Current minimum: -0.4566
Iteration No: 41 started. Searching for the next optimal point.




0.6983027094363127
Iteration No: 41 ended. Search finished for the next optimal point.
Time taken: 0.7690
Function value obtained: -0.4699
Current minimum: -0.4699
Iteration No: 42 started. Searching for the next optimal point.




0.6800840859545313
Iteration No: 42 ended. Search finished for the next optimal point.
Time taken: 0.6200
Function value obtained: -0.4163
Current minimum: -0.4699
Iteration No: 43 started. Searching for the next optimal point.




0.6133862763417419
Iteration No: 43 ended. Search finished for the next optimal point.
Time taken: 0.5380
Function value obtained: -0.3710
Current minimum: -0.4699
Iteration No: 44 started. Searching for the next optimal point.
0.6277120315581854




Iteration No: 44 ended. Search finished for the next optimal point.
Time taken: 0.3780
Function value obtained: -0.4320
Current minimum: -0.4699
Iteration No: 45 started. Searching for the next optimal point.




0.6765545520606249
Iteration No: 45 ended. Search finished for the next optimal point.
Time taken: 0.7030
Function value obtained: -0.4519
Current minimum: -0.4699
Iteration No: 46 started. Searching for the next optimal point.




0.6365358662929513
Iteration No: 46 ended. Search finished for the next optimal point.
Time taken: 0.5200
Function value obtained: -0.4083
Current minimum: -0.4699
Iteration No: 47 started. Searching for the next optimal point.




0.66186546247275
Iteration No: 47 ended. Search finished for the next optimal point.
Time taken: 0.4680
Function value obtained: -0.4148
Current minimum: -0.4699
Iteration No: 48 started. Searching for the next optimal point.




0.6884926814076612
Iteration No: 48 ended. Search finished for the next optimal point.
Time taken: 0.6470
Function value obtained: -0.4102
Current minimum: -0.4699
Iteration No: 49 started. Searching for the next optimal point.




0.6695473891830167
Iteration No: 49 ended. Search finished for the next optimal point.
Time taken: 0.7400
Function value obtained: -0.4250
Current minimum: -0.4699
Iteration No: 50 started. Searching for the next optimal point.
0.6986141388975398




Iteration No: 50 ended. Search finished for the next optimal point.
Time taken: 0.3970
Function value obtained: -0.4664
Current minimum: -0.4699


In [22]:
print(f'Best parameters: {res.x}')

Best parameters: [0.015286972843636785, 14, 1, 0.5605787546434184, 0.9289814346488457, 390, 4, 1]


### Best Model

In [24]:
x_train_bow, x_val_bow, title_vec_lgbm = getting_text_features(text, mask1, mask2, min_df=4, ngram_range=(1, 1))
X_train_with_title, X_val_with_title = final_data(X_train, X_val, x_train_bow, x_val_bow)
X_train_with_title.shape, 

((573, 209),)

In [25]:
mdl2 = LGBMClassifier(random_state = 0, class_weight="balanced", n_jobs = -1, learning_rate=0.015286972843636785, max_depth = 14,
                     min_child_samples = 1, subsample = 0.5605787546434184, colsample_bytree=0.9289814346488457,
                     n_estimators = 539)
mdl2.fit(X_train_with_title, y_train)



LGBMClassifier(class_weight='balanced', colsample_bytree=0.9289814346488457,
               learning_rate=0.015286972843636785, max_depth=14,
               min_child_samples=1, n_estimators=539, random_state=0,
               subsample=0.5605787546434184)

In [26]:
p_lgbm = mdl2.predict_proba(X_val_with_title)[:, 1]
average_precision_score(y_val, p_lgbm), roc_auc_score(y_val, p_lgbm)



(0.46428526009645316, 0.7138741824976643)

## Ensemble

LGBM Classifier -  (0.469143426580733, 0.715275615073186)


Random Forest - (0.47479006692854603, 0.7139520398629711)

In [27]:
pd.DataFrame({"LR" : p_rf, "LGBM": p_lgbm}).corr()

Unnamed: 0,LR,LGBM
LR,1.0,0.688484
LGBM,0.688484,1.0


*the low correlation value indicates that getting this two models together will improve our model*

In [28]:
p = 0.5*p_rf + 0.5*p_lgbm
average_precision_score(y_val, p), roc_auc_score(y_val, p)

(0.4776522568572185, 0.7224903975916122)

## Save our models

In [29]:
import joblib as jb

jb.dump(mdl2, "mdl_lgbm.pkl.z")
jb.dump(mdl1, "mdl_random_forest.pkl.z")

['mdl_random_forest.pkl.z']

In [30]:
jb.dump(title_vec_rf, "title_vectorizer_rf.pkl.z")
jb.dump(title_vec_lgbm, "title_vectorizer_lgbm.pkl.z")

['title_vectorizer_lgbm.pkl.z']