In [306]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import LabelBinarizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
import ast
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
import lightgbm as lgb

In [13]:
def load_feats(feats_dir, folder, file_type, load_func):
    dict_to_feed = {}
    full_path = os.path.join(feats_dir, folder)
    files = [f for f in os.listdir(full_path) 
             if os.path.isfile(os.path.join(full_path, f)) and f[-4:] =='.'+file_type]
    
    for f in files:
        dict_to_feed[f[:-4]] = load_func(os.path.join(full_path, f))
    return dict_to_feed

In [14]:
def load_feats_csv(feats_dir, folder):
    return load_feats(feats_dir, folder, 'csv', pd.read_csv)

In [15]:
def load_feats_npy(feats_dir, folder):
    return load_feats(feats_dir, folder, 'npy', np.load)

In [17]:
base_dir = r'D:\Machine Learning\Datasets\ted-talks\feats'

In [301]:
train_feats_pd = load_feats_csv(base_dir, 'train')
test_feats_pd = load_feats_csv(base_dir, 'test')
train_feats_np = load_feats_npy(base_dir, 'train')
test_feats_np = load_feats_npy(base_dir, 'test')

In [302]:
list(train_feats_pd.keys())

['date_feats',
 'events_ohe',
 'lsa_ch25',
 'lsa_ch50',
 'lsa_w25',
 'lsa_w50',
 'ratings',
 'ratings_sum',
 'related_talks_avg_views',
 'tags_gt10',
 'tags_gt50']

In [20]:
list(train_feats_np.keys())

['description_tw_emb',
 'description_wiki_emb',
 'speaker_occupation_tw_emb',
 'speaker_occupation_wiki_emb',
 'tags_tweet_emb',
 'tags_wiki_emb',
 'title_tw_emb',
 'title_wiki_emb']

In [21]:
train_feats_np['description_wiki_emb']

array([[12.855329  ,  4.5100203 ,  2.7005692 , ..., -2.2964358 ,
        -3.5763156 , -4.8642936 ],
       [ 9.889155  ,  6.263903  , -2.4831548 , ...,  0.02368131,
        -0.31124094, -3.4509144 ],
       [21.641573  , 15.473295  , -2.8141096 , ..., -6.4330144 ,
        -3.6041358 ,  3.9712694 ],
       ...,
       [18.981138  , 12.716001  , -2.0708969 , ...,  8.943259  ,
        -0.08483845,  5.255693  ],
       [12.534835  , 10.645477  , -0.9604598 , ...,  0.5883128 ,
        -1.3418088 ,  3.6585114 ],
       [ 9.509973  , 12.537871  , -6.684204  , ..., -1.0933027 ,
         2.0165484 , 10.728305  ]], dtype=float32)

In [22]:
X_train = pd.read_csv(r'D:\Machine Learning\Datasets\ted-talks\X_train.csv')
X_test = pd.read_csv(r'D:\Machine Learning\Datasets\ted-talks\X_test.csv')
y_train = pd.read_csv(r'D:\Machine Learning\Datasets\ted-talks\y_train.csv', header=None)
y_test = pd.read_csv(r'D:\Machine Learning\Datasets\ted-talks\y_test.csv', header=None)

# Base model using comments count and rating sum 

In [27]:
x_train_base = pd.concat([X_train.comments, train_feats_pd['ratings_sum']], axis=1)

In [28]:
sc = StandardScaler()
x_train_base_sd = sc.fit_transform(x_train_base)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [32]:
cv = cross_validate(LinearRegression(), x_train_base_sd, y_train, cv=5)

In [40]:
print('Train score: %f +/- %f'%(cv['train_score'].mean(), cv['train_score'].std()))

Train score: 0.752420 +/- 0.025718


In [41]:
print('Test score: %f +/- %f'%(cv['test_score'].mean(), cv['test_score'].std()))

Test score: 0.700262 +/- 0.153308


In [305]:
train_base_rl = pd.concat([x_train_base, train_feats_pd['related_talks_avg_views']], axis=1)
train_base_rl_sd = StandardScaler().fit_transform(train_base_rl)
cv = cross_validate(LinearRegression(), train_base_rl_sd, y_train, cv=5)
print('Train r2 score: %f +/- %f'%(cv['train_score'].mean(), cv['train_score'].std()))
print('Test  f2 score: %f +/- %f'%(cv['test_score'].mean(), cv['test_score'].std()))

Train r2 score: 0.752827 +/- 0.026050
Test  f2 score: 0.698179 +/- 0.155511


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [64]:
cv = cross_validate(LinearRegression(), x_train_base['ratings_sum'].values.reshape(-1,1), y_train, cv=5)
print('Train r2 score: %f +/- %f'%(cv['train_score'].mean(), cv['train_score'].std()))
print('Test  f2 score: %f +/- %f'%(cv['test_score'].mean(), cv['test_score'].std()))

Train r2 score: 0.750884 +/- 0.025324
Test  f2 score: 0.699794 +/- 0.150370




In [63]:
cv = cross_validate(LinearRegression(), x_train_base, y_train, cv=5, scoring='neg_mean_absolute_error')
print('Train nmae score: %f +/- %f'%(cv['train_score'].mean(), cv['train_score'].std()))
print('Test  nmae score: %f +/- %f'%(cv['test_score'].mean(), cv['test_score'].std()))

Train nmae score: -590235.197642 +/- 13929.964286
Test  nmae score: -596622.073571 +/- 44055.548158




In [None]:
rg = lightgbm.LGBMRegressor()

In [None]:
lgb_train_df = lgb.Dataset(data = x_train_base, label = y_train, feature_name = list(x_train_base))
params = {'objective': 'regression'}
cv_results = lgb.cv(
        params,
        lgb_train_df,
        num_boost_round=100,
        nfold=3,
        metrics='mae',
        early_stopping_rounds=10,
        stratified=False
        )

In [61]:
print('CV  mae score: %f +/- %f'%(np.mean(cv_results['l1-mean']), np.mean(cv_results['l1-stdv'])))

CV  mae score: 762191.869634 +/- 74082.586643


# Experimenting with different feats 

## Add Date feats

In [75]:
date_feats = train_feats_pd['date_feats']

In [85]:
month_lb = LabelBinarizer()

In [93]:
month_ohe = month_lb.fit_transform(date_feats.month)[:,:-1]

In [94]:
day_lb = LabelBinarizer()
day_ohe = day_lb.fit_transform(date_feats.day)[:,:-1]

In [96]:
date_feats = pd.concat([date_feats, pd.DataFrame(day_ohe),pd.DataFrame(month_ohe)], axis=1)

In [99]:
date_feats = date_feats.drop(['month', 'day'], axis=1)

In [110]:
date_sd = StandardScaler()

In [111]:
date_quant_sd = date_sd.fit_transform(date_feats[['year','publishing_lag','elapsed_time']])

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [119]:
date_and_base = np.concatenate([date_quant_sd, x_train_base_sd,
                                date_feats.iloc[:,3:].values], axis=1)

In [120]:
cv = cross_validate(LinearRegression(), date_and_base, y_train, cv=5)
print('Train r2 score: %f +/- %f'%(cv['train_score'].mean(), cv['train_score'].std()))
print('Test  f2 score: %f +/- %f'%(cv['test_score'].mean(), cv['test_score'].std()))

Train r2 score: 0.766093 +/- 0.025208
Test  f2 score: 0.708208 +/- 0.154050




In [122]:
cv = cross_validate(RandomForestRegressor(), date_and_base, y_train, cv=5, scoring='r2')
print('Train r2 score: %f +/- %f'%(cv['train_score'].mean(), cv['train_score'].std()))
print('Test  f2 score: %f +/- %f'%(cv['test_score'].mean(), cv['test_score'].std()))

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Train r2 score: 0.934495 +/- 0.020886
Test  f2 score: 0.616150 +/- 0.134729


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


In [146]:
date_and_base_tags50 = np.concatenate([date_and_base, train_feats_pd['tags_gt50'].values], axis=1)

In [143]:
cv = cross_validate(LinearRegression(), date_and_base_tags10, y_train, cv=5)
print('Train r2 score: %f +/- %f'%(cv['train_score'].mean(), cv['train_score'].std()))
print('Test  f2 score: %f +/- %f'%(cv['test_score'].mean(), cv['test_score'].std()))

Train r2 score: 0.781257 +/- 0.023662
Test  f2 score: 0.701642 +/- 0.154441




In [156]:
date_and_base_tags50_occupation = np.concatenate([date_and_base_tags50,
                                                  train_feats_np['speaker_occupation_wiki_emb'],
                                                 train_feats_np['description_wiki_emb']], axis=1)

In [157]:
cv = cross_validate(LinearRegression(), date_and_base_tags50_occupation, y_train, cv=5)
print('Train r2 score: %f +/- %f'%(cv['train_score'].mean(), cv['train_score'].std()))
print('Test  f2 score: %f +/- %f'%(cv['test_score'].mean(), cv['test_score'].std()))

Train r2 score: 0.800144 +/- 0.021282
Test  f2 score: 0.692808 +/- 0.152318




In [158]:
date_and_base_tags50_occupation_events = np.concatenate([date_and_base_tags50_occupation,
                                                        train_feats_pd['events_ohe'].values], axis=1)

In [160]:
cv = cross_validate(LinearRegression(), date_and_base_tags50_occupation_events, y_train, cv=5)
print('Train r2 score: %f +/- %f'%(cv['train_score'].mean(), cv['train_score'].std()))
print('Test  f2 score: %f +/- %f'%(cv['test_score'].mean(), cv['test_score'].std()))

Train r2 score: 0.802003 +/- 0.021146
Test  f2 score: 0.695036 +/- 0.151090




## Preparing to use LSA feats

In [168]:
X_train['url'] = X_train['url'].replace(r'\s+|\\n', '', regex=True)
X_test['url'] = X_test['url'].replace(r'\s+|\\n', '', regex=True)

In [206]:
lsa_samples = X_train.url.isin(train_feats_pd['lsa_ch25'].url.values)

In [207]:
cv = cross_validate(LinearRegression(), train_feats_pd['lsa_ch50'].iloc[:,:-1], y_train[lsa_samples], cv=5)
print('Train r2 score: %f +/- %f'%(cv['train_score'].mean(), cv['train_score'].std()))
print('Test  f2 score: %f +/- %f'%(cv['test_score'].mean(), cv['test_score'].std()))

Train r2 score: 0.025310 +/- 0.002182
Test  f2 score: -0.055998 +/- 0.037492




In [210]:
date_and_base_tags50_occupation_events_ch50 = np.concatenate([date_and_base_tags50_occupation_events[lsa_samples],
                                                        train_feats_pd['lsa_ch50'].iloc[:,:-1].values], axis=1)

In [211]:
cv = cross_validate(LinearRegression(), date_and_base_tags50_occupation_events_ch50, y_train[lsa_samples], cv=5)
print('Train r2 score: %f +/- %f'%(cv['train_score'].mean(), cv['train_score'].std()))
print('Test  f2 score: %f +/- %f'%(cv['test_score'].mean(), cv['test_score'].std()))

Train r2 score: 0.819739 +/- 0.018678
Test  f2 score: 0.697020 +/- 0.125493




In [212]:
date_and_base_tags50_occupation_events_ch25_w25 = np.concatenate([date_and_base_tags50_occupation_events[lsa_samples],
                                                        train_feats_pd['lsa_ch25'].iloc[:,:-1].values,
                                                        train_feats_pd['lsa_w25'].iloc[:,:-1].values], axis=1)

In [213]:
cv = cross_validate(LinearRegression(), date_and_base_tags50_occupation_events_ch25_w25, y_train[lsa_samples], cv=5)
print('Train r2 score: %f +/- %f'%(cv['train_score'].mean(), cv['train_score'].std()))
print('Test  f2 score: %f +/- %f'%(cv['test_score'].mean(), cv['test_score'].std()))

Train r2 score: 0.819963 +/- 0.018717
Test  f2 score: 0.699528 +/- 0.127246




In [214]:
date_and_base_tags50_occupation_events_ch50_w50 = np.concatenate([date_and_base_tags50_occupation_events[lsa_samples],
                                                        train_feats_pd['lsa_ch50'].iloc[:,:-1].values,
                                                        train_feats_pd['lsa_w50'].iloc[:,:-1].values], axis=1)

In [215]:
cv = cross_validate(LinearRegression(), date_and_base_tags50_occupation_events_ch50_w50, y_train[lsa_samples], cv=5)
print('Train r2 score: %f +/- %f'%(cv['train_score'].mean(), cv['train_score'].std()))
print('Test  f2 score: %f +/- %f'%(cv['test_score'].mean(), cv['test_score'].std()))

Train r2 score: 0.825482 +/- 0.018735
Test  f2 score: 0.682960 +/- 0.137494




In [227]:
cv = cross_validate(Ridge(alpha=150), date_and_base_tags50_occupation_events_ch25_w25, y_train[lsa_samples], cv=5)
print('Train r2 score: %f +/- %f'%(cv['train_score'].mean(), cv['train_score'].std()))
print('Test  f2 score: %f +/- %f'%(cv['test_score'].mean(), cv['test_score'].std()))

Train r2 score: 0.794779 +/- 0.021903
Test  f2 score: 0.717925 +/- 0.116946




In [230]:
cv = cross_validate(Ridge(alpha=95), date_and_base_tags50_occupation_events_ch50_w50, y_train[lsa_samples], cv=5)
print('Train r2 score: %f +/- %f'%(cv['train_score'].mean(), cv['train_score'].std()))
print('Test  f2 score: %f +/- %f'%(cv['test_score'].mean(), cv['test_score'].std()))

Train r2 score: 0.802390 +/- 0.020814
Test  f2 score: 0.722679 +/- 0.116986




In [250]:
cv = cross_validate(Lasso(alpha=70000), date_and_base_tags50_occupation_events_ch50_w50, y_train[lsa_samples], cv=5)
print('Train r2 score: %f +/- %f'%(cv['train_score'].mean(), cv['train_score'].std()))
print('Test  f2 score: %f +/- %f'%(cv['test_score'].mean(), cv['test_score'].std()))

Train r2 score: 0.779285 +/- 0.022140
Test  f2 score: 0.730100 +/- 0.121008




In [269]:
lassoCV = LassoCV(cv=5)

In [274]:
sfm = SelectFromModel(lassoCV, threshold=0.05)
sfm.fit(date_and_base_tags50_occupation_events_ch50_w50, y_train[lsa_samples].values)
n_features = sfm.transform(date_and_base_tags50_occupation_events_ch50_w50).shape[1]

  y = column_or_1d(y, warn=True)




In [276]:
cv = cross_validate(LinearRegression(),
                    sfm.transform(date_and_base_tags50_occupation_events_ch50_w50),
                    y_train[lsa_samples], cv=5)
print('Train r2 score: %f +/- %f'%(cv['train_score'].mean(), cv['train_score'].std()))
print('Test  f2 score: %f +/- %f'%(cv['test_score'].mean(), cv['test_score'].std()))

Train r2 score: 0.781996 +/- 0.021789
Test  f2 score: 0.731841 +/- 0.115826




In [257]:
lasso.fit(date_and_base_tags50_occupation_events_ch50_w50, y_train[lsa_samples])

Lasso(alpha=70000, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [288]:
cv = cross_validate(RandomForestRegressor(max_depth=3, n_estimators=1000,n_jobs=-1),
                    sfm.transform(date_and_base_tags50_occupation_events_ch50_w50),
                    y_train[lsa_samples], cv=5, scoring='r2')
print('Train r2 score: %f +/- %f'%(cv['train_score'].mean(), cv['train_score'].std()))
print('Test  f2 score: %f +/- %f'%(cv['test_score'].mean(), cv['test_score'].std()))

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Train r2 score: 0.810501 +/- 0.012342
Test  f2 score: 0.638242 +/- 0.081023




In [287]:
cv = cross_validate(RandomForestRegressor(max_depth=2, n_estimators=1000,n_jobs=-1),
                    sfm.transform(date_and_base_tags50_occupation_events_ch50_w50),
                    y_train[lsa_samples], cv=5, scoring='r2')
print('Train r2 score: %f +/- %f'%(cv['train_score'].mean(), cv['train_score'].std()))
print('Test  f2 score: %f +/- %f'%(cv['test_score'].mean(), cv['test_score'].std()))

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Train r2 score: 0.752296 +/- 0.017820
Test  f2 score: 0.622366 +/- 0.072708




In [289]:
cv = cross_validate(RandomForestRegressor(max_depth=5, n_estimators=1000, n_jobs=-1),
                    date_and_base_tags50_occupation_events_ch50_w50,
                    y_train[lsa_samples], cv=5, scoring='r2')
print('Train r2 score: %f +/- %f'%(cv['train_score'].mean(), cv['train_score'].std()))
print('Test  f2 score: %f +/- %f'%(cv['test_score'].mean(), cv['test_score'].std()))

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Train r2 score: 0.892416 +/- 0.008210
Test  f2 score: 0.585172 +/- 0.070678




In [290]:
cv = cross_validate(RandomForestRegressor(max_depth=2, n_estimators=1000, n_jobs=-1),
                    date_and_base_tags50_occupation_events_ch50_w50,
                    y_train[lsa_samples], cv=5, scoring='r2')
print('Train r2 score: %f +/- %f'%(cv['train_score'].mean(), cv['train_score'].std()))
print('Test  f2 score: %f +/- %f'%(cv['test_score'].mean(), cv['test_score'].std()))

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Train r2 score: 0.748497 +/- 0.016605
Test  f2 score: 0.569811 +/- 0.059720




# Feats Selection

## Feats concatenation

In [296]:
X_train.head()

Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url
0,837,Michael Shermer says the human tendency to bel...,1141,TED2010,1265760000,34,Michael Shermer,Michael Shermer: The pattern behind self-decep...,1,1276507380,"[{'id': 7, 'name': 'Funny', 'count': 475}, {'i...","[{'id': 22, 'hero': 'https://pe.tedcdn.com/ima...",Skeptic,"['God', 'faith', 'neuroscience', 'psychology',...",The pattern behind self-deception,https://www.ted.com/talks/michael_shermer_the_...
1,305,Collective compassion has meant an overall dec...,1328,TED2015,1426550400,30,Gary Haugen,Gary Haugen: The hidden reason for poverty the...,1,1429542970,"[{'id': 1, 'name': 'Beautiful', 'count': 195},...","[{'id': 644, 'hero': 'https://pe.tedcdn.com/im...",Human rights attorney,"['inequality', 'poverty', 'violence']",The hidden reason for poverty the world needs ...,https://www.ted.com/talks/gary_haugen_the_hidd...
2,289,Gayla Benefield was just doing her job -- unti...,878,TEDxDanubia,1363910400,30,Margaret Heffernan,Margaret Heffernan: The dangers of willful bli...,1,1376319512,"[{'id': 3, 'name': 'Courageous', 'count': 413}...","[{'id': 1533, 'hero': 'https://pe.tedcdn.com/i...",Management thinker,"['TEDx', 'corruption', 'culture', 'global issu...",The dangers of willful blindness,https://www.ted.com/talks/margaret_heffernan_t...
3,101,We often think of bias and prejudice as rooted...,983,TEDSalon NY2014,1389744000,26,Paul Bloom,Paul Bloom: Can prejudice ever be a good thing?,1,1404399605,"[{'id': 11, 'name': 'Longwinded', 'count': 35}...","[{'id': 1198, 'hero': 'https://pe.tedcdn.com/i...",Psychologist,"['behavioral economics', 'mind', 'psychology']",Can prejudice ever be a good thing?,https://www.ted.com/talks/paul_bloom_can_preju...
4,101,"Blind river dolphins, reclusive lemurs, a parr...",5256,University of California,989971200,0,Douglas Adams,"Douglas Adams: Parrots, the universe and every...",1,1268762040,"[{'id': 22, 'name': 'Fascinating', 'count': 29...","[{'id': 635, 'hero': 'https://pe.tedcdn.com/im...","Author, satirist","['biodiversity', 'biology', 'comedy', 'humor',...","Parrots, the universe and everything",https://www.ted.com/talks/douglas_adams_parrot...


In [303]:
list(train_feats_pd.keys())

['date_feats',
 'events_ohe',
 'lsa_ch25',
 'lsa_ch50',
 'lsa_w25',
 'lsa_w50',
 'ratings',
 'ratings_sum',
 'related_talks_avg_views',
 'tags_gt10',
 'tags_gt50']

In [294]:
list(train_feats_np.keys())

['description_tw_emb',
 'description_wiki_emb',
 'speaker_occupation_tw_emb',
 'speaker_occupation_wiki_emb',
 'tags_tweet_emb',
 'tags_wiki_emb',
 'title_tw_emb',
 'title_wiki_emb']

### Quantitative feats

In [316]:
X_train_quant = pd.DataFrame(X_train.comments)
X_test_quant  = pd.DataFrame(X_test.comments)

In [317]:
train_feats_pd['date_feats'].head()

Unnamed: 0,year,month,day,publishing_lag,elapsed_time
0,2010,February,Wednesday,-124,-2962
1,2015,March,Tuesday,-34,-4823
2,2013,March,Friday,-143,-4098
3,2014,January,Wednesday,-169,-4397
4,2001,May,Wednesday,-1000,230


In [318]:
date_quant_columns = ['year', 'publishing_lag', 'elapsed_time']

In [319]:
X_train_quant = pd.concat([X_train_quant,
                            train_feats_pd['date_feats'][date_quant_columns]],
                          axis=1)
X_test_quant = pd.concat([X_test_quant,
                            test_feats_pd['date_feats'][date_quant_columns]],
                          axis=1)

In [320]:
X_train_quant = pd.concat([X_train_quant,
                            train_feats_pd['ratings_sum']],
                          axis=1)
X_test_quant = pd.concat([X_test_quant,
                            test_feats_pd['ratings_sum']],
                          axis=1)

In [321]:
X_train_quant = pd.concat([X_train_quant,
                            train_feats_pd['related_talks_avg_views']],
                          axis=1)
X_test_quant = pd.concat([X_test_quant,
                            test_feats_pd['related_talks_avg_views']],
                          axis=1)

### OhE 

In [322]:
train_feats_pd['date_feats'].head()

Unnamed: 0,year,month,day,publishing_lag,elapsed_time
0,2010,February,Wednesday,-124,-2962
1,2015,March,Tuesday,-34,-4823
2,2013,March,Friday,-143,-4098
3,2014,January,Wednesday,-169,-4397
4,2001,May,Wednesday,-1000,230


In [323]:
day_lb = LabelBinarizer()
day_ohe_train = day_lb.fit_transform(train_feats_pd['date_feats'].day)[:,:-1]
day_ohe_test = day_lb.transform(test_feats_pd['date_feats'].day)[:,:-1]

In [324]:
month_lb = LabelBinarizer()
month_ohe_train = month_lb.fit_transform(train_feats_pd['date_feats'].month)[:,:-1]
month_ohe_test = month_lb.transform(test_feats_pd['date_feats'].month)[:,:-1]

In [325]:
X_train_ohe = pd.DataFrame(data=np.concatenate([day_ohe_train, month_ohe_train],axis=1))
X_test_ohe = pd.DataFrame(data=np.concatenate([day_ohe_test, month_ohe_test],axis=1))

In [326]:
X_train_ohe.shape

(2040, 17)

In [327]:
X_test_ohe.shape

(510, 17)

In [328]:
X_train_ohe = pd.concat([X_train_ohe, train_feats_pd['events_ohe']], axis=1)
X_test_ohe = pd.concat([X_test_ohe, test_feats_pd['events_ohe']], axis=1)

### Tags

In [329]:
X_train_tags = pd.DataFrame(train_feats_pd['tags_gt10'])
X_test_tags = pd.DataFrame(test_feats_pd['tags_gt10'])

### Embeddings 

In [330]:
X_train_emb = pd.DataFrame(np.concatenate([train_feats_np['description_wiki_emb'],
                                          train_feats_np['tags_wiki_emb'],
                                          train_feats_np['speaker_occupation_wiki_emb'],], axis=1))
X_test_emb = pd.DataFrame(np.concatenate([test_feats_np['description_wiki_emb'],
                                          test_feats_np['tags_wiki_emb'],
                                          test_feats_np['speaker_occupation_wiki_emb'],], axis=1))

### LSA

In [336]:
X_train['url'] = X_train['url'].replace(r'\s+|\\n', '', regex=True)
X_test['url'] = X_test['url'].replace(r'\s+|\\n', '', regex=True)
lsa_samples_train = X_train.url.isin(train_feats_pd['lsa_ch50'].url.values)
lsa_samples_test = X_test.url.isin(test_feats_pd['lsa_ch50'].url.values)

In [369]:
X_train_lsa = pd.DataFrame(np.concatenate([train_feats_pd['lsa_ch50'].iloc[:,:-1],
                                    train_feats_pd['lsa_w50'].iloc[:,:-1]], axis=1))
X_test_lsa = pd.DataFrame(np.concatenate([test_feats_pd['lsa_ch50'].iloc[:,:-1],
                                    test_feats_pd['lsa_w50'].iloc[:,:-1]], axis=1))

## Concatenate All 

In [333]:
sc = StandardScaler()

In [340]:
X_train_quant_sd = pd.DataFrame(sc.fit_transform(X_train_quant))
X_test_quant_sd = pd.DataFrame(sc.transform(X_test_quant))

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  


In [370]:
X_train_all = pd.concat([X_train_quant_sd, X_train_ohe, X_train_tags, X_train_emb], axis=1)
X_test_all = pd.concat([X_test_quant_sd, X_test_ohe, X_test_tags, X_test_emb], axis=1)

###  Leave only the samples that have  transcripts

In [371]:
X_train_all = pd.concat([X_train_all[lsa_samples_train].reset_index(),
                        X_train_lsa], axis=1)
X_test_all = pd.concat([X_test_all[lsa_samples_test].reset_index(),
                        X_test_lsa], axis=1)

In [372]:
X_train_all.shape

(1964, 553)

In [373]:
X_test_all.shape

(500, 553)

In [374]:
y_train_lsa = y_train[lsa_samples_train].values
y_test_lsa = y_test[lsa_samples_test].values

In [387]:
cv = cross_validate(LinearRegression(),
                    X_train_all,
                    y_train_lsa, cv=5)
print('Train r2 score: %f +/- %f'%(cv['train_score'].mean(), cv['train_score'].std()))
print('Test  f2 score: %f +/- %f'%(cv['test_score'].mean(), cv['test_score'].std()))

Train r2 score: 0.854646 +/- 0.016611
Test  f2 score: 0.594032 +/- 0.161329




## Feats selection with Lasso

In [380]:
lassoCV = LassoCV(cv=5)
sfm = SelectFromModel(lassoCV, threshold=0.05)
sfm.fit(X_train_all, y_train_lsa)
n_features = sfm.transform(X_train_all).shape[1]
n_features

  y = column_or_1d(y, warn=True)


34

In [381]:
cv = cross_validate(LinearRegression(),
                    sfm.transform(X_train_all),
                    y_train_lsa, cv=5)
print('Train r2 score: %f +/- %f'%(cv['train_score'].mean(), cv['train_score'].std()))
print('Test  f2 score: %f +/- %f'%(cv['test_score'].mean(), cv['test_score'].std()))

Train r2 score: 0.785887 +/- 0.022640
Test  f2 score: 0.728569 +/- 0.122812




In [382]:
lassoCV = LassoCV(cv=5)
sfm = SelectFromModel(lassoCV, threshold='mean')
sfm.fit(X_train_all, y_train_lsa)
n_features = sfm.transform(X_train_all).shape[1]
n_features

  y = column_or_1d(y, warn=True)


20

In [383]:
cv = cross_validate(LinearRegression(),
                    sfm.transform(X_train_all),
                    y_train_lsa, cv=5)
print('Train r2 score: %f +/- %f'%(cv['train_score'].mean(), cv['train_score'].std()))
print('Test  f2 score: %f +/- %f'%(cv['test_score'].mean(), cv['test_score'].std()))

Train r2 score: 0.783930 +/- 0.022846
Test  f2 score: 0.730714 +/- 0.122780




In [384]:
sfm = SelectFromModel(RandomForestRegressor(max_depth=3,
                                            n_estimators=1000,n_jobs=-1), threshold='mean')
sfm.fit(X_train_all, y_train_lsa)
n_features = sfm.transform(X_train_all).shape[1]
n_features

  self.estimator_.fit(X, y, **fit_params)


16

In [385]:
cv = cross_validate(LinearRegression(),
                    sfm.transform(X_train_all),
                    y_train_lsa, cv=5)
print('Train r2 score: %f +/- %f'%(cv['train_score'].mean(), cv['train_score'].std()))
print('Test  f2 score: %f +/- %f'%(cv['test_score'].mean(), cv['test_score'].std()))

Train r2 score: 0.772486 +/- 0.022453
Test  f2 score: 0.723166 +/- 0.120695




In [386]:
sfm = SelectFromModel(RandomForestRegressor(max_depth=3,
                                            n_estimators=1000,n_jobs=-1), threshold=0.05)
sfm.fit(X_train_all, y_train_lsa)
n_features = sfm.transform(X_train_all).shape[1]
print(n_features)
cv = cross_validate(LinearRegression(),
                    sfm.transform(X_train_all),
                    y_train_lsa, cv=5)
print('Train r2 score: %f +/- %f'%(cv['train_score'].mean(), cv['train_score'].std()))
print('Test  f2 score: %f +/- %f'%(cv['test_score'].mean(), cv['test_score'].std()))

  self.estimator_.fit(X, y, **fit_params)


1
Train r2 score: 0.766004 +/- 0.022495
Test  f2 score: 0.721430 +/- 0.117131




In [393]:
estimator = LinearRegression()
selector = RFE(estimator, 100, step=1)
selector = selector.fit(X_train_all, y_train_lsa)

  y = column_or_1d(y, warn=True)


In [None]:
cv = cross_validate(LinearRegression(),
                    selector.transform(X_train_all),
                    y_train_lsa, cv=5)
print('Train r2 score: %f +/- %f'%(cv['train_score'].mean(), cv['train_score'].std()))
print('Test  f2 score: %f +/- %f'%(cv['test_score'].mean(), cv['test_score'].std()))