In [1]:
import gc
import os
import sys
import warnings

# OpenBLAS picks up OPENBLAS_NUM_THREADS when the library is first loaded, so
# the variable must be set before anything imports numpy. seaborn imports
# numpy, which is why this assignment comes before the third-party imports
# (previously it ran after `import seaborn` and was too late to take effect).
os.environ['OPENBLAS_NUM_THREADS'] = '1'

from tqdm import tqdm
import seaborn as sns

# Add the parent of the current working directory to the import path
# (presumably so the project-local packages imported later can be found).
sys.path.append(os.path.abspath("../"))
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings from pandas and friends.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import time
import polars as pl
import scipy
import implicit
import bisect
import sklearn.metrics as m
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibratedClassifierCV

In [3]:
import importlib

import modules
import fe_modules
import modules.memory_utils
import fe_modules.text_manipulation
import fe_modules.datetime_features
import fe_modules.preprocessing

# importlib.reload() re-executes only the module object it is given. Reloading
# the `modules` / `fe_modules` packages alone re-runs their __init__ but leaves
# the already-imported submodules cached, so edits to the helper files would
# not be picked up by the `from ... import ...` lines below. Reload the
# submodules explicitly first, then the packages themselves.
for _submodule in (
    modules.memory_utils,
    fe_modules.text_manipulation,
    fe_modules.datetime_features,
    fe_modules.preprocessing,
):
    importlib.reload(_submodule)

importlib.reload(modules)
importlib.reload(fe_modules)

# Re-bind the helper names so they point at the freshly reloaded definitions.
from modules.memory_utils import pandas_reduce_mem_usage, pandas_string_to_cat
from fe_modules.text_manipulation import get_domain
from fe_modules.datetime_features import get_timestamp, get_relative_time, part_of_day_to_hour, add_hour_to_date
from fe_modules.preprocessing import clean_os_type

# Load data

In [4]:
# Seed constant; presumably intended for reproducible train/test splitting
# (not referenced in the cells visible here — confirm against later cells).
SPLIT_SEED = 42
# Relative location of the local data directory.
LOCAL_DATA_PATH = "../data/"

In [5]:
# Read the site table from the Excel workbook, using the sheet's first column
# as the DataFrame index.
df = pd.read_excel(
    io='../external_data/sites_final.xlsx',
    index_col=0,
)
# Last expression of the cell: render a preview of the first rows.
df.head()

Unnamed: 0,url_host,vpn_only,tod,is_broken
0,http://googleads.g.doubleclick.net,1,Google Marketing Platform offers an enterprise...,0
1,http://yandex.ru,0,Найдётся всё,0
2,http://i.ytimg.com,0,����_x0000__x0010_JFIF_x0000__x0001__x0001__x0...,0
3,http://vk.com,0,ВКонтакте – универсальное средство для общения...,0
4,http://avatars.mds.yandex.net,0,410 Gone410 Gonenginx,1


In [10]:
# Render the whole frame as the cell output; the recorded output below shows
# the leading and trailing rows with the middle elided.
df

Unnamed: 0,url_host,vpn_only,tod,is_broken
0,http://googleads.g.doubleclick.net,1,Google Marketing Platform offers an enterprise...,0
1,http://yandex.ru,0,Найдётся всё,0
2,http://i.ytimg.com,0,����_x0000__x0010_JFIF_x0000__x0001__x0001__x0...,0
3,http://vk.com,0,ВКонтакте – универсальное средство для общения...,0
4,http://avatars.mds.yandex.net,0,410 Gone410 Gonenginx,1
...,...,...,...,...
9995,http://tolstoy-lit.ru,1,Ëåâ ÒîëñòîéËåâ Íèêîëàåâè÷ ÒîëñòîéÑî÷èíåíèÿ:Àíí...,0
9996,http://neprizyvnoi.ru,0,Все об армии и призыве: информационная помощь ...,0
9997,http://wik-end.com,0,Информационный портал Weekend-Тверь - всегда с...,0
9998,http://probeg.org,1,ПроБЕГ в России и миреКалендариПолныйМосква и ...,0


In [13]:
# Export the hosts whose `tod` text contains 'Ë' for manual checking.
# NOTE(review): such rows look like Cyrillic pages decoded with the wrong
# charset (appears to be Windows-1251 read as Latin-1) — confirm.
# `na=False` treats missing `tod` values as "no match" directly, giving a
# plain boolean mask without a separate `.fillna(False)` pass.
df.loc[df.tod.str.contains('Ë', na=False)].url_host.to_csv('to_check.csv')

In [14]:
# Display the rows whose `tod` text contains 'Ë' (a marker of text that looks
# mis-decoded in the recorded output). `na=False` makes rows with a missing
# `tod` evaluate to False, so the mask is a proper boolean Series and no
# `.fillna(False)` step is required.
df.loc[df.tod.str.contains('Ë', na=False)]

Unnamed: 0,url_host,vpn_only,tod,is_broken
1178,http://wroom.ru,1,Àâòîìîáèëüíûé ñàéò Wroom.ruËåíòàÊàòàëîãÎòçûâûÌ...,0
1534,http://pandia.ru,1,"Ïëàòôîðìà ìàòåðèàëîâ Pandia.ru. Àâòîðñêèå, ýíö...",0
1819,http://autolada.ru,1,Ãëàâíàÿ :: AUTOLADA.RUÂñå êàòåãîðèèÂñå êàòåãîð...,0
1934,http://dayname.ru,1,Êàðòèíêè äëÿ ïîçäðàâëåíèÿ ñ Äíåì Ðîæäåíèÿ. Ìîæ...,0
2470,http://fapl.ru,1,FAPL.ru - Àíãëèéñêèé ôóòáîëFAPL.ru — Àíãëèéñêà...,0
2654,http://m.good-menu.ru,1,Êóëèíàðíûé ñàéò Good-Menu.Ru. Êóëèíàðíûå ðåöåï...,0
3163,http://h5.cc.lerjin.com,1,_x0008__x0008_=z^_x0000__x0003_index.html_x0...,0
3557,http://helpiks.org,1,Õåëïèêñ - Èíòåðíåò ïîìîùíèêÃëàâíàÿÊàòåãîðèèÊîí...,0
3587,http://rockgig.net,1,Àôèøà | Ìîñêâà | RockGig×âõîäðåãèñòðàöèÿêîíöåð...,0
4351,http://ftour.otzyv.ru,1,"Òóðôîðóì, òóðèñòè÷åñêèé ôîðóìÐåãèñòðàöèÿÇàáûëè...",0
