In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

#!pip install six
#!pip install catboost

In [9]:
def explore(dataframe):
    # Shape
    print("Total Records: ", dataframe.shape[0])
          
    #Check Missing/Null
    x = dataframe.columns[dataframe.isnull().any()].tolist()   
    if not x:
        print("No Missing/Null Records")
    else:        
        print("Found Missing Records")

In [10]:
data = pd.read_csv("train.csv")
data.describe()

Unnamed: 0,Id,Predicted
count,185910.0,185910.0
mean,92954.5,0.501721
std,53667.73861,0.499998
min,0.0,0.0
25%,46477.25,0.0
50%,92954.5,1.0
75%,139431.75,1.0
max,185909.0,1.0


In [11]:
explore(data)

Total Records:  185910
No Missing/Null Records


In [12]:
data["url"].value_counts()

https://ceska-posta-be61a7.ingress-erytho.ewp.live/verifici/manage/                                                     12
https://events-hype-subscribe.club/                                                                                     10
https://ads2list.com/m&t.verified/                                                                                      10
https://idsvssavorg.weebly.com/                                                                                         10
http://siphen.com/afi/upload                                                                                            10
                                                                                                                        ..
https://www.evga.com/support/faq/afmhome.aspx                                                                            1
http://xiaomivietnam.org/RetailInternetPortal../                                                                         1
https://ruliweb.

In [13]:
X = data[['url']].copy()
y = data.Predicted.copy()

In [14]:
tokenizer = RegexpTokenizer(r'[A-Za-z]+')
stemmer = SnowballStemmer("english")
cv = CountVectorizer()

In [15]:
df_test = pd.read_csv("test.csv", sep=",")
X_test = df_test[['url']].copy()

In [16]:
X_test

Unnamed: 0,url
0,http://fb-ads-manager.multimo.co.id/immobilien...
1,https://www.hamdogs.net/login/wellsfargo/login...
2,https://help.ubuntu.com/community/UpgradeNotes
3,https://silverberrygroup.com/wp-admin/network/...
4,https://af.mil
...,...
46473,https://opensoul.me
46474,https://compag.cz/wp-content/upgrade/redirect/...
46475,https://66law.cn/www.66law.cn/ganxian/
46476,https://forum.guns.ru/forumtopics/155.html


In [17]:
def prepare_data(X) :
    X['text_tokenized'] = X.url.map(lambda t: tokenizer.tokenize(t))
    X['text_stemmed'] = X.text_tokenized.map(lambda t: [stemmer.stem(word) for word in t])
    X['text_sent'] = X.text_stemmed.map(lambda t: ' '.join(t))
    features = cv.fit_transform(X.text_sent)
    return X, features

In [18]:
def prepare_data_transform(X) :
    X['text_tokenized'] = X.url.map(lambda t: tokenizer.tokenize(t))
    X['text_stemmed'] = X.text_tokenized.map(lambda t: [stemmer.stem(word) for word in t])
    X['text_sent'] = X.text_stemmed.map(lambda t: ' '.join(t))
    features = cv.transform(X.text_sent)
    return X, features

In [19]:
_, _ = prepare_data(X_test)

In [20]:
X, features = prepare_data_transform(X)

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

X_test['clean_url']=X_test.url.astype(str)
X_test.clean_url=X_test.clean_url.map(lambda x: tokenizer.tokenize(x))

nltk.download('omw-1.4')
wnl = WordNetLemmatizer()

X_test['lem_url'] = X_test['clean_url'].map(lambda x: [wnl.lemmatize(word) for word in x])

word_vectorizer = TfidfVectorizer(ngram_range=(1,1), max_features =1000)
word_vectorizer.fit(X_test['lem_url'].astype('str'))

data['clean_url']=data.url.astype(str)
data.clean_url=data.clean_url.map(lambda x: tokenizer.tokenize(x))
data['lem_url'] = data['clean_url'].map(lambda x: [wnl.lemmatize(word) for word in x])

X_test['clean_url']=X_test.url.astype(str)
X_test.clean_url=X_test.clean_url.map(lambda x: tokenizer.tokenize(x))
X_test['lem_url'] = X_test['clean_url'].map(lambda x: [wnl.lemmatize(word) for word in x])

unigramdataGet= word_vectorizer.transform(data['lem_url'].astype('str'))
unigramdataGet = unigramdataGet.toarray()
vocab = word_vectorizer.get_feature_names_out ()
x_tf=pd.DataFrame(np.round(unigramdataGet, 1), columns=vocab)
x_tf[x_tf>0] = 1

[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:992)>
[nltk_data] Error loading omw-1.4: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:992)>


LookupError: 
**********************************************************************
  Resource [93mwordnet[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('wordnet')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/wordnet[0m

  Searched in:
    - '/Users/pavel/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.11/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.11/share/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.11/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************
