In [12]:
import numpy as np
import pandas as pd
import re

import os

# data_path = "./drive/MyDrive/Datathon"

# os.chdir(data_path)

In [13]:
data = pd.read_csv("datathon_train.csv",header=None)

In [14]:
data = data.rename({0:"URL",1:"fishy"},axis=1)

In [15]:

data.dropna(inplace=True)

In [16]:
data["fishy"] = data["fishy"].astype(int)

In [17]:
data.head()

Unnamed: 0,URL,fishy
0,https://www.peoplescollection.wales/discover/w...,0
1,http://yasli-sad.ru/css/chase/chaseall%20newin...,1
2,http://denizkent.net/wp-admin/js/login.alibaba...,1
3,http://www.marketbiz.net/mbz/wp-includes/js/jq...,1
4,http://guardiaoitau30horas.uniclassdispositivo...,1


#### Protocol

URLS with https://

In [18]:
data[data["URL"].apply(lambda x: True if "https://" in x else False )]["fishy"].value_counts()

0    27441
1     6834
Name: fishy, dtype: int64

URLs with https: are less often phishy

URLs with http://

In [19]:
data[data["URL"].apply(lambda x: True if "http://" in x else False )]["fishy"].value_counts()

0    40918
1    25833
Name: fishy, dtype: int64

URLs with http are phishier than URLs with https

URLs having neither https nor http

In [20]:
data[data["URL"].apply(lambda x: True if ("https://" not in x and "http://" not in x) else False )]

Unnamed: 0,URL,fishy
35896,ftp://test1234567:test1234567@www.doorcomponen...,1


Only one URL without http or http

Adding a feature to represent whether it has https or not

In [21]:
def hadHttps(x):
  if "https://" in x:
    return 1
  return 0 

In [22]:
data["SecuredProtocol"] = data["URL"].apply(hadHttps)

In [23]:
def removeProtocol(x):
  x = x.replace("https://","")
  x = x.replace("http://","")
  return x

Make Lowercase URLs

In [24]:
data["URL"] = data["URL"].apply(removeProtocol)

In [25]:
def make_lower(x):
  return x.lower()

In [26]:
data["URL"] = data["URL"].apply(make_lower)

no. of "." in url

In [27]:
def noOfdots(x):
  return x.count(".")

In [28]:
data["NoOfDots"] = data["URL"].apply(noOfdots)

In [29]:
data.head()

Unnamed: 0,URL,fishy,SecuredProtocol,NoOfDots
0,www.peoplescollection.wales/discover/what/70/q...,0,1,2
1,yasli-sad.ru/css/chase/chaseall%20newinfo_ad_3...,1,0,2
2,denizkent.net/wp-admin/js/login.alibaba.com,1,0,3
3,www.marketbiz.net/mbz/wp-includes/js/jquery/ui...,1,0,3
4,guardiaoitau30horas.uniclassdispositivos.com/i...,1,0,3


Digit Count

In [30]:
def digitcount(x):
  return len(re.findall("[0-9]",x))

In [31]:
data["DigitCount"] = data["URL"].apply(digitcount)

Checking for @ Symbol in URL

In [32]:
def hadat(x):
  if '@' in x:
    return 1 
  return 0

In [33]:
data["have_at_symbol"] = data["URL"].apply(hadat)

In [34]:
data[data["have_at_symbol"]==1]["fishy"].value_counts()

1    1711
0      28
Name: fishy, dtype: int64

any URL having @ is more likely to be phishy

Depth of URL

In [35]:
def depth(x):
  return len(re.findall("[/]",x))

In [36]:
data["DepthofURL"] = data["URL"].apply(depth)

URLs having more depth are likely to be phishy

## Tokenizing

In [37]:
import spacy
nlp = spacy.blank("en")

In [38]:
def tokenize(x):
  x=x.replace("/"," ")
  x=x.replace("."," . ")
  doc = nlp(x)
  res = [token.text for token in doc]
  return " ".join(res)

In [39]:
data["Tokens"]=data["URL"].apply(tokenize)

In [40]:
data.head()

Unnamed: 0,URL,fishy,SecuredProtocol,NoOfDots,DigitCount,have_at_symbol,DepthofURL,Tokens
0,www.peoplescollection.wales/discover/what/70/q...,0,1,2,2,0,5,www . peoplescollection . wales discover what ...
1,yasli-sad.ru/css/chase/chaseall%20newinfo_ad_3...,1,0,2,4,0,5,yasli - sad . ru css chase chaseall%20newinfo_...
2,denizkent.net/wp-admin/js/login.alibaba.com,1,0,3,0,0,3,denizkent . net wp - admin js login . alibaba ...
3,www.marketbiz.net/mbz/wp-includes/js/jquery/ui...,1,0,3,1,0,7,www . marketbiz . net mbz wp - includes js jqu...
4,guardiaoitau30horas.uniclassdispositivos.com/i...,1,0,3,3,0,1,guardiaoitau30horas . uniclassdispositivos . c...


In [41]:
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer

In [42]:
cv = CountVectorizer()
tf = TfidfVectorizer()

In [43]:
Xcv = cv.fit_transform(data['Tokens'])
Xtf = tf.fit_transform(data['Tokens'])

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler

Count Vectorizer

In [45]:
X_train,X_test,y_train,y_test = train_test_split(Xcv,data["fishy"],test_size=0.3)

In [46]:
Cnb = MultinomialNB()

In [47]:
Cnb.fit(X_train,y_train)

MultinomialNB()

In [49]:
y_pred = Cnb.predict(X_test)
print(accuracy_score(y_test,y_pred))
print(matthews_corrcoef(y_test,y_pred))

0.9296714918368696
0.836853125759391


In [None]:
# --------------------------------------------------------------

In [53]:
from sklearn.ensemble import RandomForestClassifier

In [56]:
rf = RandomForestClassifier(n_estimators=6)

In [57]:
rf.fit(X_train,y_train)

RandomForestClassifier(n_estimators=6)

In [58]:
y_pred = rf.predict(X_test)
print(accuracy_score(y_test,y_pred))
print(matthews_corrcoef(y_test,y_pred))

0.9130477890144755
0.7987725182066882


In [None]:
# ---------------------------------------------------

TF IDF

In [45]:
Tnb = MultinomialNB()
Tnb.fit(Xtf,y)
y_pred2 = Tnb.predict(X_test)

In [47]:
accuracy_score(y_test,y_pred2)

0.9536320972965827

In [48]:
matthews_corrcoef(y_test,y_pred2)

0.892862905610268

Testing data


In [105]:
test = pd.read_csv('testing_data.csv',header=None)

In [106]:
test.head()

Unnamed: 0,0
0,http://www.skofija-novomesto.si/
1,http://ferlafashion.com/saint/www.loginalibaba...
2,http://spatrendonline.hu
3,http://defensasur.com.ar
4,http://www.digitalvd.de/dvds/37399


In [107]:
test = test.rename({0:"URL"},axis=1)

In [108]:
def preprocess(text):
  text = text.replace("https://","")
  text = text.replace("http://","")
  return tokenize(text)

In [109]:
XT = test["URL"].apply(preprocess)

In [54]:
print(XT)

0                           www . skofija - novomesto . si
1        ferlafashion . com saint www . loginalibaba . ...
2                                      spatrendonline . hu
3                                    defensasur . com . ar
4                          www . digitalvd . de dvds 37399
                               ...                        
25214                                      mohaseban . org
25215                               www . la - croix . com
25216                                     bit . ly 2CbFTLv
25217    beautyatjessicas . com . au wp - content uploa...
25218    www . la - taverne - des - aventuriers . com t...
Name: URL, Length: 25219, dtype: object


In [56]:
cxtp = cv.transform(XT)

In [115]:
Cnb.fit(Xcv,y)

MultinomialNB()

In [116]:
res = Cnb.predict(cxtp)

In [117]:
df = pd.DataFrame()

In [118]:
df["a"] = res

In [119]:
df.to_csv('SSSS.csv',index=False)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
cv = CountVectorizer()
tf = TfidfVectorizer()

In [None]:
Xcv = cv.fit_transform(data['Tokens'])
Tcv = tf.fit_transform(data['Tokens'])

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
Cnb = MultinomialNB()
Tnb = MultinomialNB()

In [None]:
y = data["fishy"]

In [None]:
Cnb.fit(X,y)
Tnb.fit(X,y)

MultinomialNB()

In [None]:
y_pred1 = Cnb.predict(X_test)
y_pred2 = Tnb.predict(X_test)

In [49]:
import pickle as pkl

In [109]:
pkl.dump(cv,open("./App/Models/CVTransformer.pkl","wb"))
pkl.dump(tf,open("./App/Models/TFTransformer.pkl","wb"))

In [110]:
pkl.dump(Cnb,open("./App/Models/CV_NB.pkl","wb"))

In [111]:
pkl.dump(Tnb,open("./App/Models/TV_NB.pkl","wb"))