In [28]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder

#!pip install six
!pip install catboost

Defaulting to user installation because normal site-packages is not writeable


In [29]:
def explore(dataframe):
    # Shape
    print("Total Records: ", dataframe.shape[0])
          
    #Check Missing/Null
    x = dataframe.columns[dataframe.isnull().any()].tolist()   
    if not x:
        print("No Missing/Null Records")
    else:        
        print("Found Missing Records")

In [30]:
data = pd.read_csv("train.csv")
data.describe()

Unnamed: 0,Id,Predicted
count,185910.0,185910.0
mean,92954.5,0.501721
std,53667.73861,0.499998
min,0.0,0.0
25%,46477.25,0.0
50%,92954.5,1.0
75%,139431.75,1.0
max,185909.0,1.0


In [31]:
data.head()

Unnamed: 0,Id,url,Predicted
0,0,http://banqsuepoy.temp.swtest.ru/pb/assistance...,1
1,1,https://my.mail.ru/community/mir24.tv/,0
2,2,https://rmailidtrack-b484fa.ingress-bonde.ewp....,1
3,3,http://tjvodxie.cn.b2b168.com/m296765p1/,0
4,4,https://articulate.com/360/review,0


In [32]:
data["Predicted"].value_counts()

1    93275
0    92635
Name: Predicted, dtype: int64

In [33]:
data["url"].describe()

count                                                185910
unique                                               175579
top       https://ceska-posta-be61a7.ingress-erytho.ewp....
freq                                                     12
Name: url, dtype: object

In [34]:
explore(data)

Total Records:  185910
No Missing/Null Records


In [35]:
data["url"].value_counts()

https://ceska-posta-be61a7.ingress-erytho.ewp.live/verifici/manage/                                                     12
https://events-hype-subscribe.club/                                                                                     10
https://ads2list.com/m&t.verified/                                                                                      10
https://idsvssavorg.weebly.com/                                                                                         10
http://siphen.com/afi/upload                                                                                            10
                                                                                                                        ..
https://www.evga.com/support/faq/afmhome.aspx                                                                            1
http://xiaomivietnam.org/RetailInternetPortal../                                                                         1
https://ruliweb.

In [36]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [37]:
X = data[['url']].copy()
y = data.Predicted.copy()

In [38]:
tokenizer = RegexpTokenizer(r'[A-Za-z]+')
stemmer = SnowballStemmer("english")
cv = CountVectorizer()

In [39]:
df_test = pd.read_csv("test.csv", sep=",")
X_test = df_test[['url']].copy()

In [40]:
X_test

Unnamed: 0,url
0,http://fb-ads-manager.multimo.co.id/immobilien...
1,https://www.hamdogs.net/login/wellsfargo/login...
2,https://help.ubuntu.com/community/UpgradeNotes
3,https://silverberrygroup.com/wp-admin/network/...
4,https://af.mil
...,...
46473,https://opensoul.me
46474,https://compag.cz/wp-content/upgrade/redirect/...
46475,https://66law.cn/www.66law.cn/ganxian/
46476,https://forum.guns.ru/forumtopics/155.html


In [41]:
def prepare_data(X) :
    X['text_tokenized'] = X.url.map(lambda t: tokenizer.tokenize(t))
    X['text_stemmed'] = X.text_tokenized.map(lambda t: [stemmer.stem(word) for word in t])
    X['text_sent'] = X.text_stemmed.map(lambda t: ' '.join(t))
    features = cv.fit_transform(X.text_sent)
    return X, features

In [42]:
def prepare_data_transform(X) :
    X['text_tokenized'] = X.url.map(lambda t: tokenizer.tokenize(t))
    X['text_stemmed'] = X.text_tokenized.map(lambda t: [stemmer.stem(word) for word in t])
    X['text_sent'] = X.text_stemmed.map(lambda t: ' '.join(t))
    features = cv.transform(X.text_sent)
    return X, features

In [43]:
_, _ = prepare_data(X_test)

In [44]:
X, features = prepare_data_transform(X)

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer


X_test['clean_url']=X_test.url.astype(str)
X_test.clean_url=X_test.clean_url.map(lambda x: tokenizer.tokenize(x))

nltk.download('omw-1.4')
wnl = WordNetLemmatizer()


X_test['lem_url'] = X_test['clean_url'].map(lambda x: [wnl.lemmatize(word) for word in x])


word_vectorizer = TfidfVectorizer(ngram_range=(1,1), max_features =1000)
word_vectorizer.fit(X_test['lem_url'].astype('str'))

data['clean_url']=data.url.astype(str)
data.clean_url=data.clean_url.map(lambda x: tokenizer.tokenize(x))
data['lem_url'] = data['clean_url'].map(lambda x: [wnl.lemmatize(word) for word in x])

X_test['clean_url']=X_test.url.astype(str)
X_test.clean_url=X_test.clean_url.map(lambda x: tokenizer.tokenize(x))
X_test['lem_url'] = X_test['clean_url'].map(lambda x: [wnl.lemmatize(word) for word in x])

unigramdataGet= word_vectorizer.transform(data['lem_url'].astype('str'))
unigramdataGet = unigramdataGet.toarray()
vocab = word_vectorizer.get_feature_names_out ()
x_tf=pd.DataFrame(np.round(unigramdataGet, 1), columns=vocab)
x_tf[x_tf>0] = 1

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\p_shebarshin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\p_shebarshin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [46]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import StackingClassifier
from sklearn import model_selection
import catboost
from catboost import CatBoostClassifier
from sklearn.svm import LinearSVC
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [47]:
cbc = CatBoostClassifier()

In [48]:
trainX, testX, trainY, testY = train_test_split(features, y, test_size=1-0.7, stratify=y, random_state=42)
cbc.fit(trainX, trainY)
predY = cbc.predict(testX)
accuracy = accuracy_score(testY, predY)
precision = precision_score(testY, predY, pos_label=1)
recall = recall_score(testY, predY, pos_label=1)

print(f"accuracy = {accuracy} precision={precision} recall= {recall}")

Learning rate set to 0.082372
0:	learn: 0.6546340	total: 172ms	remaining: 2m 51s
1:	learn: 0.6229473	total: 306ms	remaining: 2m 32s
2:	learn: 0.5963042	total: 435ms	remaining: 2m 24s
3:	learn: 0.5742892	total: 558ms	remaining: 2m 18s
4:	learn: 0.5558140	total: 688ms	remaining: 2m 16s
5:	learn: 0.5412358	total: 817ms	remaining: 2m 15s
6:	learn: 0.5280143	total: 945ms	remaining: 2m 14s
7:	learn: 0.5171515	total: 1.07s	remaining: 2m 12s
8:	learn: 0.5083267	total: 1.2s	remaining: 2m 12s
9:	learn: 0.5010951	total: 1.33s	remaining: 2m 11s
10:	learn: 0.4945117	total: 1.46s	remaining: 2m 10s
11:	learn: 0.4888736	total: 1.59s	remaining: 2m 10s
12:	learn: 0.4836342	total: 1.72s	remaining: 2m 10s
13:	learn: 0.4794819	total: 1.86s	remaining: 2m 11s
14:	learn: 0.4763132	total: 1.99s	remaining: 2m 10s
15:	learn: 0.4730629	total: 2.11s	remaining: 2m 9s
16:	learn: 0.4690753	total: 2.26s	remaining: 2m 10s
17:	learn: 0.4661009	total: 2.4s	remaining: 2m 10s
18:	learn: 0.4627453	total: 2.52s	remaining: 2m

158:	learn: 0.3531207	total: 19.9s	remaining: 1m 45s
159:	learn: 0.3526818	total: 20s	remaining: 1m 45s
160:	learn: 0.3523475	total: 20.1s	remaining: 1m 44s
161:	learn: 0.3518950	total: 20.3s	remaining: 1m 44s
162:	learn: 0.3514979	total: 20.4s	remaining: 1m 44s
163:	learn: 0.3511722	total: 20.5s	remaining: 1m 44s
164:	learn: 0.3507760	total: 20.6s	remaining: 1m 44s
165:	learn: 0.3503428	total: 20.7s	remaining: 1m 44s
166:	learn: 0.3498913	total: 20.9s	remaining: 1m 44s
167:	learn: 0.3494645	total: 21s	remaining: 1m 43s
168:	learn: 0.3490969	total: 21.1s	remaining: 1m 43s
169:	learn: 0.3488178	total: 21.2s	remaining: 1m 43s
170:	learn: 0.3484906	total: 21.3s	remaining: 1m 43s
171:	learn: 0.3481098	total: 21.4s	remaining: 1m 43s
172:	learn: 0.3476727	total: 21.6s	remaining: 1m 43s
173:	learn: 0.3473808	total: 21.7s	remaining: 1m 42s
174:	learn: 0.3469569	total: 21.8s	remaining: 1m 42s
175:	learn: 0.3466309	total: 21.9s	remaining: 1m 42s
176:	learn: 0.3463692	total: 22.1s	remaining: 1m 4

314:	learn: 0.3124507	total: 39s	remaining: 1m 24s
315:	learn: 0.3123005	total: 39.1s	remaining: 1m 24s
316:	learn: 0.3120098	total: 39.3s	remaining: 1m 24s
317:	learn: 0.3119125	total: 39.4s	remaining: 1m 24s
318:	learn: 0.3116869	total: 39.5s	remaining: 1m 24s
319:	learn: 0.3115123	total: 39.6s	remaining: 1m 24s
320:	learn: 0.3113622	total: 39.7s	remaining: 1m 24s
321:	learn: 0.3111306	total: 39.9s	remaining: 1m 23s
322:	learn: 0.3109540	total: 40s	remaining: 1m 23s
323:	learn: 0.3107657	total: 40.1s	remaining: 1m 23s
324:	learn: 0.3106728	total: 40.2s	remaining: 1m 23s
325:	learn: 0.3104656	total: 40.4s	remaining: 1m 23s
326:	learn: 0.3102137	total: 40.5s	remaining: 1m 23s
327:	learn: 0.3100485	total: 40.6s	remaining: 1m 23s
328:	learn: 0.3097754	total: 40.7s	remaining: 1m 23s
329:	learn: 0.3096113	total: 40.9s	remaining: 1m 23s
330:	learn: 0.3094664	total: 41s	remaining: 1m 22s
331:	learn: 0.3092854	total: 41.2s	remaining: 1m 22s
332:	learn: 0.3090867	total: 41.3s	remaining: 1m 22s

471:	learn: 0.2897074	total: 59.5s	remaining: 1m 6s
472:	learn: 0.2896125	total: 59.6s	remaining: 1m 6s
473:	learn: 0.2895018	total: 59.8s	remaining: 1m 6s
474:	learn: 0.2893929	total: 59.9s	remaining: 1m 6s
475:	learn: 0.2892925	total: 1m	remaining: 1m 6s
476:	learn: 0.2891892	total: 1m	remaining: 1m 5s
477:	learn: 0.2890111	total: 1m	remaining: 1m 5s
478:	learn: 0.2889093	total: 1m	remaining: 1m 5s
479:	learn: 0.2887657	total: 1m	remaining: 1m 5s
480:	learn: 0.2885760	total: 1m	remaining: 1m 5s
481:	learn: 0.2884821	total: 1m	remaining: 1m 5s
482:	learn: 0.2883431	total: 1m 1s	remaining: 1m 5s
483:	learn: 0.2882015	total: 1m 1s	remaining: 1m 5s
484:	learn: 0.2881165	total: 1m 1s	remaining: 1m 5s
485:	learn: 0.2880692	total: 1m 1s	remaining: 1m 4s
486:	learn: 0.2879612	total: 1m 1s	remaining: 1m 4s
487:	learn: 0.2878304	total: 1m 1s	remaining: 1m 4s
488:	learn: 0.2876972	total: 1m 1s	remaining: 1m 4s
489:	learn: 0.2876094	total: 1m 2s	remaining: 1m 4s
490:	learn: 0.2875209	total: 1m 2

629:	learn: 0.2742040	total: 1m 19s	remaining: 46.6s
630:	learn: 0.2740964	total: 1m 19s	remaining: 46.5s
631:	learn: 0.2740030	total: 1m 19s	remaining: 46.4s
632:	learn: 0.2739180	total: 1m 19s	remaining: 46.2s
633:	learn: 0.2738726	total: 1m 19s	remaining: 46.1s
634:	learn: 0.2737920	total: 1m 19s	remaining: 46s
635:	learn: 0.2736964	total: 1m 20s	remaining: 45.8s
636:	learn: 0.2736231	total: 1m 20s	remaining: 45.7s
637:	learn: 0.2735920	total: 1m 20s	remaining: 45.6s
638:	learn: 0.2734763	total: 1m 20s	remaining: 45.4s
639:	learn: 0.2734285	total: 1m 20s	remaining: 45.3s
640:	learn: 0.2733338	total: 1m 20s	remaining: 45.2s
641:	learn: 0.2733039	total: 1m 20s	remaining: 45s
642:	learn: 0.2732271	total: 1m 20s	remaining: 44.9s
643:	learn: 0.2731007	total: 1m 21s	remaining: 44.8s
644:	learn: 0.2730121	total: 1m 21s	remaining: 44.7s
645:	learn: 0.2729387	total: 1m 21s	remaining: 44.5s
646:	learn: 0.2729020	total: 1m 21s	remaining: 44.4s
647:	learn: 0.2727881	total: 1m 21s	remaining: 44.

785:	learn: 0.2627966	total: 1m 37s	remaining: 26.7s
786:	learn: 0.2627369	total: 1m 38s	remaining: 26.5s
787:	learn: 0.2626600	total: 1m 38s	remaining: 26.4s
788:	learn: 0.2625993	total: 1m 38s	remaining: 26.3s
789:	learn: 0.2625087	total: 1m 38s	remaining: 26.2s
790:	learn: 0.2624220	total: 1m 38s	remaining: 26s
791:	learn: 0.2623402	total: 1m 38s	remaining: 25.9s
792:	learn: 0.2622689	total: 1m 38s	remaining: 25.8s
793:	learn: 0.2621526	total: 1m 38s	remaining: 25.7s
794:	learn: 0.2621018	total: 1m 39s	remaining: 25.5s
795:	learn: 0.2620179	total: 1m 39s	remaining: 25.4s
796:	learn: 0.2619805	total: 1m 39s	remaining: 25.3s
797:	learn: 0.2619010	total: 1m 39s	remaining: 25.2s
798:	learn: 0.2618736	total: 1m 39s	remaining: 25s
799:	learn: 0.2618340	total: 1m 39s	remaining: 24.9s
800:	learn: 0.2617397	total: 1m 39s	remaining: 24.8s
801:	learn: 0.2616707	total: 1m 40s	remaining: 24.7s
802:	learn: 0.2615897	total: 1m 40s	remaining: 24.6s
803:	learn: 0.2615300	total: 1m 40s	remaining: 24.

941:	learn: 0.2534269	total: 1m 58s	remaining: 7.28s
942:	learn: 0.2533648	total: 1m 58s	remaining: 7.15s
943:	learn: 0.2533040	total: 1m 58s	remaining: 7.03s
944:	learn: 0.2532715	total: 1m 58s	remaining: 6.9s
945:	learn: 0.2532179	total: 1m 58s	remaining: 6.78s
946:	learn: 0.2531334	total: 1m 58s	remaining: 6.66s
947:	learn: 0.2530954	total: 1m 59s	remaining: 6.53s
948:	learn: 0.2530453	total: 1m 59s	remaining: 6.41s
949:	learn: 0.2530141	total: 1m 59s	remaining: 6.29s
950:	learn: 0.2529937	total: 1m 59s	remaining: 6.16s
951:	learn: 0.2529676	total: 1m 59s	remaining: 6.04s
952:	learn: 0.2528932	total: 1m 59s	remaining: 5.91s
953:	learn: 0.2528703	total: 2m	remaining: 5.79s
954:	learn: 0.2528150	total: 2m	remaining: 5.66s
955:	learn: 0.2527878	total: 2m	remaining: 5.54s
956:	learn: 0.2527529	total: 2m	remaining: 5.41s
957:	learn: 0.2526990	total: 2m	remaining: 5.28s
958:	learn: 0.2526293	total: 2m	remaining: 5.16s
959:	learn: 0.2525647	total: 2m	remaining: 5.03s
960:	learn: 0.2525289	

In [49]:
X_test, features_test = prepare_data_transform(X_test)

In [50]:
pred_test = cbc.predict(features_test)

In [51]:
len(pred_test)

46478

In [52]:
submit = pd.DataFrame(pred_test, columns=["Predicted"])

In [53]:
submit.to_csv(f"sample_submit.csv",index_label="Id")