In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import itertools

In [2]:
DATA_PATH = '../data/'

In [3]:
from os.path import isfile, join
from os import listdir

## keep only the snappy-compressed parquet part files.
## - sorted(): listdir() returns names in arbitrary order, sorting makes the
##   later concat (and therefore the row order) reproducible across runs/machines.
## - endswith(): a substring test would also accept sidecar files whose name merely
##   contains 'snappy.parquet' (e.g. '<name>.snappy.parquet.crc').
onlyfiles = sorted(
    f for f in listdir(DATA_PATH)
    if isfile(join(DATA_PATH, f)) and f.endswith('snappy.parquet')
)
onlyfiles

['part-00000-1b8fcd71-6348-4510-a9dc-bdd7dcf82f2d-c000.snappy.parquet',
 'part-00001-1b8fcd71-6348-4510-a9dc-bdd7dcf82f2d-c000.snappy.parquet',
 'part-00002-1b8fcd71-6348-4510-a9dc-bdd7dcf82f2d-c000.snappy.parquet',
 'part-00003-1b8fcd71-6348-4510-a9dc-bdd7dcf82f2d-c000.snappy.parquet',
 'part-00004-1b8fcd71-6348-4510-a9dc-bdd7dcf82f2d-c000.snappy.parquet']

In [4]:
## read every part file into its own DataFrame.
## join() builds the path correctly whether or not DATA_PATH ends with a separator.
dfs = [pd.read_parquet(join(DATA_PATH, file)) for file in onlyfiles]
len(dfs)

5

In [5]:
data = pd.concat(dfs, ignore_index=True)
data

Unnamed: 0,url,target,day
0,https://www.cdiscount.com/bricolage/electricit...,"[1831, 1751, 1192, 745, 1703]",4
1,https://www.mystalk.net/profile/vitoriafcorrea,"[847, 978, 582, 1381, 529]",4
2,https://www.lequipe.fr/Tennis/TennisFicheJoueu...,"[20, 1077, 294]",4
3,http://m.jeuxvideo.com/forums/42-32625-6018005...,"[381, 935, 1343, 622, 933]",4
4,https://context.reverso.net/traduction/espagno...,"[692, 1265, 725, 1264, 1266]",4
...,...,...,...
67590,https://www.jeu-concours.biz/gagner-cafetiere-...,"[1276, 65, 1113]",1
67591,https://www.sto.cx/book-186042-471.html,"[608, 617, 1033, 220, 1021]",1
67592,http://jeu.info/solution/4-images-1-mot-niveau...,"[381, 925, 622, 1494, 937]",16
67593,https://grossesse.aufeminin.com/forum/levres-g...,"[638, 253, 419, 558, 401]",1


In [6]:
## check null values
data.isna().sum()

url       0
target    0
day       0
dtype: int64

In [7]:
## check duplicated in url
data['url'].duplicated().any()

True

In [8]:
## show the rows whose url is flagged by duplicated(), i.e. repeated occurrences of a url
data[data['url'].duplicated()]

Unnamed: 0,url,target,day
8989,https://www.linternaute.fr/dictionnaire/fr/def...,"[692, 1265, 1494, 474, 3]",3
9229,https://www.capital.fr/votre-argent/taxe-puma-...,"[1161, 1283, 1137, 1705, 1428]",1
9853,https://fr.shopping.rakuten.com/mfp/5516127/sa...,"[1071, 1534, 390, 531, 18]",12
11080,http://ghislaine17.ek.la/gigot-d-agneau-laque-...,"[1538, 909, 1545, 122, 1551]",16
16296,https://www.cdiscount.com/bricolage/chauffage/...,"[1835, 1836, 1724, 828, 830]",17
...,...,...,...
66719,https://plomberie.ooreka.fr/fiche/voir/106367/...,"[1365, 1836, 828, 839, 830]",13
66753,https://www.senscritique.com/sc2/liste/754899/...,"[507, 34, 184, 1116, 358]",12
66786,http://forum.doctissimo.fr/grossesse-bebe/rama...,"[1513, 907, 122, 1573, 1526]",12
67141,https://www.marmiton.org/recettes/recette_cook...,"[1513, 907, 122, 1573, 1526]",17


In [9]:
# targets[0], targets[1]

In [10]:
# ### handle duplicated values
# for url in data[data['url'].duplicated()].url.values:
#     ## check if same labels are on both urls
#     targets = data[data['url'] == url]['target'].values
#     if targets[0] != targets[1]:
#         print(url)

In [11]:
url_example = data['url'].values[2]
url_example

'https://www.lequipe.fr/Tennis/TennisFicheJoueur1500000000003017.html'

In [12]:
import re
from setuptools.namespaces import flatten
from urllib.parse import urlparse, unquote_plus

In [13]:
def preprocess_url(url):
    """Turn a URL into a space-separated string of word-like tokens.

    Steps:
      1. Percent-decode the URL (``+`` becomes a space) and parse it.
      2. Concatenate netloc, path, params and query (scheme and fragment are ignored).
      3. Split on the delimiters ``- _ space % : , / . + = &``.
      4. Split camelCase words at each lower-to-upper boundary.
      5. Drop tokens that contain a digit or are 2 characters or shorter,
         then drop the boilerplate tokens ``www``, ``html``, ``com``, ``net``, ``org``.

    Parameters
    ----------
    url : str
        Raw URL.

    Returns
    -------
    str
        Remaining tokens joined by single spaces (original casing kept);
        an empty string when nothing survives the filters.
    """
    ## decode first so that encoded separators (%2F, %20, '+', ...) are visible to the tokenizer
    url_parsed = urlparse(unquote_plus(url))
    ## join the components with '/', a delimiter the tokenizer splits on; joining with ''
    ## glued the last word of one component to the first word of the next
    ## (e.g. '/search' + 'query=x' became the single token 'searchquery')
    url_text = '/'.join([url_parsed.netloc, url_parsed.path, url_parsed.params, url_parsed.query])
    ## split the url into tokens; '&' is included so consecutive query parameters are separated
    tokens = re.split(r'[-_ %:,/.+=&]', url_text)
    ## split camelCase: cut before an upper-case letter that does not follow
    ## another upper-case letter or a non-word character
    tokens = list(itertools.chain.from_iterable(
        re.split(r'(?<![A-Z\W])(?=[A-Z])', token) for token in tokens
    ))
    ## drop tokens containing a digit and tokens of length <= 2 (this also removes empty strings)
    tokens = [token for token in tokens if len(token) > 2 and not any(c.isdigit() for c in token)]
    ## drop url boilerplate
    tokens = [token for token in tokens if token not in ('www', 'html', 'com', 'net', 'org')]
    return ' '.join(tokens)

In [14]:
print(url_example)
preprocess_url(url_example)

https://www.lequipe.fr/Tennis/TennisFicheJoueur1500000000003017.html


'lequipe Tennis Tennis Fiche'

In [15]:
data['url_cleaned'] = data['url'].apply(preprocess_url)

In [16]:
data

Unnamed: 0,url,target,day,url_cleaned
0,https://www.cdiscount.com/bricolage/electricit...,"[1831, 1751, 1192, 745, 1703]",4,cdiscount bricolage electricite batterie plomb...
1,https://www.mystalk.net/profile/vitoriafcorrea,"[847, 978, 582, 1381, 529]",4,mystalk profile vitoriafcorrea
2,https://www.lequipe.fr/Tennis/TennisFicheJoueu...,"[20, 1077, 294]",4,lequipe Tennis Tennis Fiche
3,http://m.jeuxvideo.com/forums/42-32625-6018005...,"[381, 935, 1343, 622, 933]",4,jeuxvideo forums guilde fourmi legionnaire rec...
4,https://context.reverso.net/traduction/espagno...,"[692, 1265, 725, 1264, 1266]",4,context reverso traduction espagnol francais Para
...,...,...,...,...
67590,https://www.jeu-concours.biz/gagner-cafetiere-...,"[1276, 65, 1113]",1,jeu concours biz gagner cafetiere expresso
67591,https://www.sto.cx/book-186042-471.html,"[608, 617, 1033, 220, 1021]",1,sto book
67592,http://jeu.info/solution/4-images-1-mot-niveau...,"[381, 925, 622, 1494, 937]",16,jeu info solution images mot niveau
67593,https://grossesse.aufeminin.com/forum/levres-g...,"[638, 253, 419, 558, 401]",1,grossesse aufeminin forum levres gonflees acco...


In [17]:
# unique_labels = []
# for idx, row in data.iterrows():
#     for label in row['target']:
#         if label not in unique_labels: unique_labels.append(label)

In [18]:
## distinct label ids found across all rows' target lists
unique_labels = list(set(itertools.chain.from_iterable(data['target'].values)))

In [19]:
len(unique_labels)

1903

In [20]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

In [21]:
import itertools
## distinct label ids found across all rows' target lists
labels = list(set(itertools.chain.from_iterable(data["target"])))

In [22]:
from collections import Counter
## number of occurrences of each label over all rows' target lists.
## itertools.chain.from_iterable does the flattening, so this cell does not depend
## on the non-public `flatten` helper imported from setuptools.
label_count = dict(Counter(itertools.chain.from_iterable(data['target'])))

In [23]:
## a label is kept only if it occurs strictly more than `threshold` times
threshold = 300
labels_to_keep = [label for label in label_count if label_count[label] > threshold]
len(labels_to_keep)

267

In [24]:
def keep_selected_labels(labels, labels_to_keep):
    """Return the entries of `labels` that are also in `labels_to_keep`.

    The order of `labels` is preserved and repeated entries are kept.
    """
    kept = []
    for candidate in labels:
        if candidate in labels_to_keep:
            kept.append(candidate)
    return kept

In [25]:
## build the lookup set once: each `label in ...` test inside keep_selected_labels is then
## a hash lookup instead of a scan of the whole labels_to_keep list for every label of every row
labels_to_keep_lookup = set(labels_to_keep)
data["target_cleaned"] = data["target"].apply(lambda obj: keep_selected_labels(obj, labels_to_keep_lookup))

In [26]:
## drop the rows left without any label after filtering, then renumber the index from 0
data = data[data['target_cleaned'].apply(len) != 0].reset_index(drop=True)

In [27]:
data

Unnamed: 0,url,target,day,url_cleaned,target_cleaned
0,https://www.cdiscount.com/bricolage/electricit...,"[1831, 1751, 1192, 745, 1703]",4,cdiscount bricolage electricite batterie plomb...,[1192]
1,https://www.mystalk.net/profile/vitoriafcorrea,"[847, 978, 582, 1381, 529]",4,mystalk profile vitoriafcorrea,"[978, 582, 529]"
2,https://www.lequipe.fr/Tennis/TennisFicheJoueu...,"[20, 1077, 294]",4,lequipe Tennis Tennis Fiche,"[1077, 294]"
3,http://m.jeuxvideo.com/forums/42-32625-6018005...,"[381, 935, 1343, 622, 933]",4,jeuxvideo forums guilde fourmi legionnaire rec...,"[381, 935, 1343, 622, 933]"
4,https://context.reverso.net/traduction/espagno...,"[692, 1265, 725, 1264, 1266]",4,context reverso traduction espagnol francais Para,"[692, 1265, 725, 1264, 1266]"
...,...,...,...,...,...
63234,http://www.allocine.fr/personne/fichepersonne_...,"[1107, 1096, 1310, 1106, 184]",16,allocine personne fichepersonne gen cpersonne,"[1107, 1096, 1310, 1106, 184]"
63235,https://www.sto.cx/book-186042-471.html,"[608, 617, 1033, 220, 1021]",1,sto book,"[608, 617, 220, 1021]"
63236,http://jeu.info/solution/4-images-1-mot-niveau...,"[381, 925, 622, 1494, 937]",16,jeu info solution images mot niveau,"[381, 925, 622, 1494, 937]"
63237,https://grossesse.aufeminin.com/forum/levres-g...,"[638, 253, 419, 558, 401]",1,grossesse aufeminin forum levres gonflees acco...,"[253, 419, 558, 401]"


In [28]:
import pickle


## serialise the cleaned frame into the data directory with the newest pickle protocol
cleaned_path = DATA_PATH + 'data_cleaned.pickle'
with open(cleaned_path, 'wb') as out_file:
    pickle.dump(data, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [29]:
from platform import python_version
print("Current Python Version-", python_version())

Current Python Version- 3.8.3


In [30]:
# data.to_csv(DATA_PATH + 'data_cleaned.csv')