In [4]:
import pickle

import pandas as pd
from nltk import RegexpTokenizer
from nltk import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline



In [5]:
#Data Loading

phish_data = pd.read_csv('dataset/phishing_site_urls.csv')

In [6]:
phish_data.head()

Unnamed: 0,URL,Label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad
3,mail.printakid.com/www.online.americanexpress....,bad
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad


In [7]:
phish_data.tail()


Unnamed: 0,URL,Label
549341,23.227.196.215/,bad
549342,apple-checker.org/,bad
549343,apple-iclods.org/,bad
549344,apple-uptoday.org/,bad
549345,apple-search.info,bad


In [8]:
phish_data.isnull()

Unnamed: 0,URL,Label
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
549341,False,False
549342,False,False
549343,False,False
549344,False,False


In [9]:
phish_data.size

1098692

In [10]:
#Lets use Tokenizer first to gather words
# then use countVectorizer to Vectorize all words

tokenizer = RegexpTokenizer(r'[A-Za-z]+')
phish_data['text_tokenized'] = phish_data.URL.map(lambda t: tokenizer.tokenize(t))

In [11]:
# Snow Ball Streamer for getting root words

stemmer = SnowballStemmer(language='english')
phish_data['text_stemmed'] = phish_data['text_tokenized'].map(lambda l:
                                                              [stemmer.stem(word) for word in l])


In [12]:
# Joining the stemmed text corpus

phish_data['text_sent'] = phish_data['text_stemmed'].map(lambda l: ' '.join(l))

In [13]:
# Using Count Vectorization to vectorize the text corpus

cv = CountVectorizer()
feature = cv.fit_transform(phish_data['text_sent'])


In [14]:
# Spliting training data and testing data

trainX, testX, trainY, testY = train_test_split(feature, phish_data.Label)

In [15]:
# Using Logistic Regression to train the model

lr = LogisticRegression()
lr.fit(trainX, trainY)
print(lr.score(testX, testY))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9651077277062992


In [16]:
# Used to Predict the user input

def user_data_predict(user_data):
    res = lr.predict(user_data)[0]
    return res

In [17]:
# Takes User Input and applies data pre-processing
# tasks before predicting

def user_input(data):
    tk_data = tokenizer.tokenize(data)
    stm_data = None
    join_data = []
    for word in tk_data:
        stm_data = stemmer.stem(word)
        join_data.append(stm_data)
    join_data = ' '.join(join_data)
    print(join_data)
    user_data = cv.transform([join_data])
    res = user_data_predict(user_data)
    return res

In [20]:
# Pipelining the code

pipeline_lr = make_pipeline(CountVectorizer(tokenizer = RegexpTokenizer(r'[A-Za-z]+').tokenize,stop_words='english'), LogisticRegression())
trainX, testX, trainY, testY = train_test_split(phish_data.URL, phish_data.Label)
pipeline_lr.fit(trainX,trainY)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9659159585545046
good


In [21]:
# pickling the model
pickle.dump(pipeline_lr,open('model.pkl','wb'))