In [4]:
import numpy as np
import pandas as pd
import re

import os

data_path = "./drive/MyDrive/Datathon"

os.chdir(data_path)

In [5]:
data = pd.read_csv("datathon_train.csv",header=None)

In [6]:
data = data.rename({0:"URL",1:"fishy"},axis=1)

In [7]:

data.dropna(inplace=True)

In [8]:
data["fishy"] = data["fishy"].astype(int)

In [9]:
data.head()

Unnamed: 0,URL,fishy
0,https://www.peoplescollection.wales/discover/w...,0
1,http://yasli-sad.ru/css/chase/chaseall%20newin...,1
2,http://denizkent.net/wp-admin/js/login.alibaba...,1
3,http://www.marketbiz.net/mbz/wp-includes/js/jq...,1
4,http://guardiaoitau30horas.uniclassdispositivo...,1


#### Protocol

URLS with https://

In [10]:
data[data["URL"].apply(lambda x: True if "https://" in x else False )]["fishy"].value_counts()

0    27441
1     6834
Name: fishy, dtype: int64

URLs with http://

In [11]:
data[data["URL"].apply(lambda x: True if "http://" in x else False )]["fishy"].value_counts()

0    40918
1    25833
Name: fishy, dtype: int64

URLs having neither https nor http

In [12]:
data[data["URL"].apply(lambda x: True if ("https://" not in x and "http://" not in x) else False )]

Unnamed: 0,URL,fishy
35896,ftp://test1234567:test1234567@www.doorcomponen...,1


In [13]:
def hadHttps(x):
  if "https://" in x:
    return 1
  return 0 

In [14]:
data["SecuredProtocol"] = data["URL"].apply(hadHttps)

In [15]:
def removeProtocol(x):
  x = x.replace("https://","")
  x = x.replace("http://","")
  return x

Make Lowercase URLs

In [16]:
data["URL"] = data["URL"].apply(removeProtocol)

In [17]:
def make_lower(x):
  return x.lower()

In [18]:
data["URL"] = data["URL"].apply(make_lower)

no. of "." in url

In [19]:
def noOfdots(x):
  return x.count(".")

In [20]:
data["NoOfDots"] = data["URL"].apply(noOfdots)

In [21]:
data.head()

Unnamed: 0,URL,fishy,SecuredProtocol,NoOfDots
0,www.peoplescollection.wales/discover/what/70/q...,0,1,2
1,yasli-sad.ru/css/chase/chaseall%20newinfo_ad_3...,1,0,2
2,denizkent.net/wp-admin/js/login.alibaba.com,1,0,3
3,www.marketbiz.net/mbz/wp-includes/js/jquery/ui...,1,0,3
4,guardiaoitau30horas.uniclassdispositivos.com/i...,1,0,3


Digit Count

In [22]:
def digitcount(x):
  return len(re.findall("[0-9]",x))

In [23]:
data["DigitCount"] = data["URL"].apply(digitcount)

Checking for @ Symbol in URL

In [24]:
def hadat(x):
  if '@' in x:
    return 1 
  return 0

In [25]:
data["have_at_symbol"] = data["URL"].apply(hadat)

In [26]:
data[data["have_at_symbol"]==1]["fishy"].value_counts()

1    1711
0      28
Name: fishy, dtype: int64

Depth of URL

In [27]:
def depth(x):
  return len(re.findall("[/]",x))

In [28]:
data["DepthofURL"] = data["URL"].apply(depth)

## Tokenizing

In [29]:
import spacy
nlp = spacy.blank("en")

In [30]:
def tokenize(x):
  x=x.replace("/"," ")
  x=x.replace("."," ")
  doc = nlp(x)
  res = [token.text for token in doc]
  return " ".join(res)

In [31]:
data["Tokens"]=data["URL"].apply(tokenize)

In [32]:
data.head()

Unnamed: 0,URL,fishy,SecuredProtocol,NoOfDots,DigitCount,have_at_symbol,DepthofURL,Tokens
0,www.peoplescollection.wales/discover/what/70/q...,0,1,2,2,0,5,www peoplescollection wales discover what 70 q...
1,yasli-sad.ru/css/chase/chaseall%20newinfo_ad_3...,1,0,2,4,0,5,yasli - sad ru css chase chaseall%20newinfo_ad...
2,denizkent.net/wp-admin/js/login.alibaba.com,1,0,3,0,0,3,denizkent net wp - admin js login alibaba com
3,www.marketbiz.net/mbz/wp-includes/js/jquery/ui...,1,0,3,1,0,7,www marketbiz net mbz wp - includes js jquery ...
4,guardiaoitau30horas.uniclassdispositivos.com/i...,1,0,3,3,0,1,guardiaoitau30horas uniclassdispositivos com i...


In [33]:
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer

In [34]:
cv = CountVectorizer()

In [35]:
X = cv.fit_transform(data['Tokens'])

In [36]:
from sklearn.model_selection import train_test_split

In [37]:
X_train,X_test,y_train,y_test = train_test_split(X,data["fishy"],test_size=0.3)

In [38]:
from sklearn.naive_bayes import MultinomialNB

In [39]:
nb = MultinomialNB()

In [40]:
nb.fit(X_train,y_train)

MultinomialNB()

In [41]:
y_pred = nb.predict(X_test)

In [42]:
from sklearn.metrics import accuracy_score

In [43]:
accuracy_score(y_test,y_pred)

0.9271597594024721

In [44]:
from sklearn.metrics import matthews_corrcoef

In [45]:
matthews_corrcoef(y_test,y_pred)

0.8311777838060167

In [46]:
tf = TfidfVectorizer()

In [47]:
X = tf.fit_transform(data["Tokens"])

In [48]:
print(X)

  (0, 106183)	0.4395243991209628
  (0, 129920)	0.34281222186728333
  (0, 21598)	0.3321223729239921
  (0, 165065)	0.29645849334209695
  (0, 60130)	0.3761784302982199
  (0, 163469)	0.3880117094875416
  (0, 122233)	0.4395243991209628
  (0, 166864)	0.07655180403310587
  (1, 131884)	0.35804931922953526
  (1, 123295)	0.11932404563550983
  (1, 146244)	0.3488342485702775
  (1, 159836)	0.28036794746008903
  (1, 8429)	0.35125078744880767
  (1, 49091)	0.35125078744880767
  (1, 49086)	0.289042740729902
  (1, 55203)	0.23453564886692757
  (1, 135386)	0.15787995443932212
  (1, 136228)	0.3476803708642997
  (1, 168073)	0.35518489547117493
  (2, 52179)	0.09312211558307526
  (2, 30567)	0.3696706226374355
  (2, 100759)	0.2621253601158558
  (2, 91860)	0.3425189318353403
  (2, 28472)	0.31299244248667146
  (2, 166479)	0.2582622148952411
  :	:
  (100857, 55535)	0.2946877148970459
  (100857, 77037)	0.2592166536391463
  (100857, 106848)	0.24476051646585112
  (100857, 123547)	0.2145936428166786
  (100857, 13468)

In [49]:
X_train,X_test,y_train,y_test = train_test_split(X,data["fishy"],test_size=0.2)

In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [62]:
log = RandomForestClassifier(n_estimators=20)

In [65]:
import pickle as pkl

In [67]:
pkl.dump(cv,open("./App/Models/CV.pkl","wb"))

In [68]:
pkl.dump(tf,open("./App/Models/TFIDF.pkl","wb"))