
#### Phishing is a method of gathering personal information, such as login credentials or credit card details, by means of deceptive e-mails or websites.

#### Phishing websites are created to dupe unsuspecting users into thinking they are on a legitimate site. The criminals spend a lot of time making the site seem as credible as possible, and many such sites appear almost indistinguishable from the real thing.

In [None]:
# importing some useful libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns  
import time 

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from nltk.tokenize import RegexpTokenizer  
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer  
from sklearn.pipeline import make_pipeline

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import pickle 

In [None]:
# Load the phishing-URL dataset from the CSV file into a DataFrame.
df= pd.read_csv("../input/phishing-site-urls/phishing_site_urls.csv")

In [None]:
# Preview the first rows of the loaded DataFrame.
df.head()

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes).
df.info()

In [None]:
# (number of rows, number of columns) of the DataFrame.
df.shape

### About data
#### The dataset consists of 549346 rows and 2 columns. The first column contains website URLs and the second column states whether each site is good or bad (phishing).

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Bar chart of how many rows carry each value of the Label column.
sns.countplot(data=df, x="Label")

### PREPROCESSING

#### Now we can vectorize the URLs. We can gather words from the URLs using a tokenizer.
### RegexpTokenizer
#### We can extract tokens from a string using a regular expression with the RegexpTokenizer class.

In [None]:
# Tokenizer whose pattern matches runs of ASCII letters, so digits and
# punctuation inside a URL act as token separators.
tokenizer = RegexpTokenizer(r'[A-Za-z]+')

In [None]:
# Sanity check: show the alphabetic tokens extracted from the first URL.
tokenizer.tokenize(df['URL'][0])

In [None]:
# Tokenize every URL into a list of alphabetic tokens and report how long
# the pass over the whole column took.
print('Getting words tokenized ...')
t0 = time.perf_counter()
df['text_tokenized'] = df['URL'].map(tokenizer.tokenize)
t1 = time.perf_counter() - t0
print('Time taken', t1, 'sec')

In [None]:
# Inspect 5 randomly sampled rows, including the new text_tokenized column.
df.sample(5)

### SnowballStemmer
#### Snowball is a small string-processing language for writing stemming algorithms; the SnowballStemmer reduces each word to its root (stem).

In [None]:
# Stemmer used below to reduce each URL token to its stem.
stemmer = SnowballStemmer("english") # use the English stemming rules

In [None]:
# Stem every token of every tokenized URL and report the elapsed time.
print('Getting words stemmed ...')
t0 = time.perf_counter()
df['text_stemmed'] = df['text_tokenized'].map(
    lambda tokens: list(map(stemmer.stem, tokens))
)
t1 = time.perf_counter() - t0
print('Time taken', t1, 'sec')

In [None]:
# Inspect 5 randomly sampled rows, including the new text_stemmed column.
df.sample(5)

In [None]:
# Join each row's stemmed tokens into one space-separated string (the input
# format a text vectorizer expects) and report the elapsed time.
print('Get joiningwords ...')
t0 = time.perf_counter()
df['text_sent'] = df['text_stemmed'].map(' '.join)
t1 = time.perf_counter() - t0
print('Time taken', t1, 'sec')

In [None]:
# Separate the rows by label so each class can be inspected on its own.
bad_sites = df.loc[df['Label'] == 'bad']
good_sites = df.loc[df['Label'] == 'good']

In [None]:
# Preview the first rows labelled 'bad'.
bad_sites.head()

In [None]:
# Preview the first rows labelled 'good'.
good_sites.head()

In [None]:
# Preview the DataFrame with the tokenized, stemmed and joined text columns.
df.head()

## Creating Model
### CountVectorizer - converts a collection of text documents to a matrix of token counts

In [None]:
# Bag-of-words vectorizer with default settings.
cv = CountVectorizer()

In [None]:
# Learn the vocabulary from the stemmed, space-joined URL text and encode
# every row as a sparse vector of token counts.
feature = cv.fit_transform(df['text_sent'])

In [None]:
# Densify only the first 5 rows for display; the full matrix stays sparse.
feature[:5].toarray() # convert the sparse slice into an array to print the transformed features

In [None]:
from sklearn.model_selection import train_test_split


In [None]:
# Hold out part of the data for evaluation (train_test_split keeps 25% as the
# test set by default). A fixed random_state makes the shuffle, and therefore
# every score reported below, reproducible across runs.
trainX, testX, trainY, testY = train_test_split(feature, df.Label, random_state=42)

### LogisticRegression
#### Logistic Regression is a Machine Learning classification algorithm that is used to predict the probability of a categorical dependent variable. In logistic regression, the dependent variable is a binary variable that contains data coded as 1 (yes, success, etc.) or 0 (no, failure, etc.). In other words, the logistic regression model predicts P(Y=1) as a function of X.

In [None]:
from sklearn.linear_model import LogisticRegression


In [None]:
# Train a logistic-regression classifier with default hyperparameters on the
# training split.
lr = LogisticRegression()
lr.fit(X=trainX, y=trainY)

In [None]:
# Mean accuracy of the logistic-regression model on the held-out test split.
lr.score(testX,testY)

#### Logistic Regression provides about 96% accuracy. Now we will store the score in a dictionary so that we can find which model performs best.


In [None]:
# Start the model-comparison table with the logistic-regression test
# accuracy, rounded to two decimals.
Scores_ml = {
    'Logistic Regression': np.round(lr.score(testX, testY), decimals=2),
}

In [None]:
# Evaluate logistic regression: accuracy, per-class report and confusion matrix.
print('Training Accuracy :',lr.score(trainX,trainY))
print('Testing Accuracy :',lr.score(testX,testY))

# Predict once and reuse the result for both the matrix and the report.
lr_test_pred = lr.predict(testX)

# scikit-learn metrics take (y_true, y_pred) in that order: confusion-matrix
# rows are the actual classes and columns the predicted ones, which is what
# the index/column labels below claim. Passing the predictions first would
# transpose the matrix and swap precision with recall in the report.
con_mat = pd.DataFrame(confusion_matrix(testY, lr_test_pred),
            columns = ['Predicted:Bad', 'Predicted:Good'],
            index = ['Actual:Bad', 'Actual:Good'])


print('\nCLASSIFICATION REPORT\n')
print(classification_report(testY, lr_test_pred,
                            target_names =['Bad','Good']))

print('\nCONFUSION MATRIX')
plt.figure(figsize= (6,4))
sns.heatmap(con_mat, annot = True,fmt='d',cmap="YlGnBu")

## MultinomialNB
#### The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.

In [None]:
from sklearn.naive_bayes import MultinomialNB 


In [None]:
# Create a multinomial Naive Bayes classifier with default hyperparameters.
mnb = MultinomialNB()

In [None]:
# Fit the Naive Bayes model on the training split.
mnb.fit(trainX,trainY)

In [None]:
# Mean accuracy of the Naive Bayes model on the held-out test split.
mnb.score(testX,testY)

#### MultinomialNB provides about 95% accuracy, so we store its score in the dictionary as well.

In [None]:
# Record the Naive Bayes test accuracy (two decimals) for the comparison.
Scores_ml['MultinomialNB'] = np.round(mnb.score(testX, testY), decimals=2)

In [None]:
# Evaluate multinomial Naive Bayes: accuracy, per-class report and confusion matrix.
print('Training Accuracy :',mnb.score(trainX,trainY))
print('Testing Accuracy :',mnb.score(testX,testY))

# Predict once and reuse the result for both the matrix and the report.
mnb_test_pred = mnb.predict(testX)

# scikit-learn metrics take (y_true, y_pred) in that order: confusion-matrix
# rows are the actual classes and columns the predicted ones, which is what
# the index/column labels below claim. Passing the predictions first would
# transpose the matrix and swap precision with recall in the report.
con_mat = pd.DataFrame(confusion_matrix(testY, mnb_test_pred),
            columns = ['Predicted:Bad', 'Predicted:Good'],
            index = ['Actual:Bad', 'Actual:Good'])


print('\nCLASSIFICATION REPORT\n')
print(classification_report(testY, mnb_test_pred,
                            target_names =['Bad','Good']))

print('\nCONFUSION MATRIX')
plt.figure(figsize= (6,4))
sns.heatmap(con_mat, annot = True,fmt='d',cmap="YlGnBu")

In [None]:
# Compare the test accuracy of the models recorded in Scores_ml.
acc = pd.DataFrame.from_dict(Scores_ml,orient = 'index',columns=['Accuracy'])
sns.set_style('darkgrid')
# x and y must be passed by keyword: seaborn >= 0.12 no longer accepts the
# data vectors as positional arguments.
sns.barplot(x=acc.index, y=acc.Accuracy)

#### So, Logistic Regression is the better-fitting model. Now let's build an sklearn pipeline using Logistic Regression.

In [None]:
# End-to-end model that goes from raw URL strings to a prediction:
# count alphabetic tokens (English stop words removed), then classify with
# logistic regression.
url_vectorizer = CountVectorizer(
    tokenizer=RegexpTokenizer(r'[A-Za-z]+').tokenize,
    stop_words='english',
)
pipeline_ls = make_pipeline(url_vectorizer, LogisticRegression())

In [None]:
# Re-split on the raw URL strings, since the pipeline does its own
# vectorizing. A fixed random_state makes the shuffle, and therefore the
# pipeline's reported scores, reproducible across runs.
trainX, testX, trainY, testY = train_test_split(df.URL, df.Label, random_state=42)

In [None]:
# Fit the whole pipeline (vectorizer + classifier) on the raw training URLs.
pipeline_ls.fit(trainX,trainY)

In [None]:
# Mean accuracy of the pipeline on the held-out raw URLs.
pipeline_ls.score(testX,testY)

In [None]:
# Evaluate the pipeline: accuracy, per-class report and confusion matrix.
print('Training Accuracy :',pipeline_ls.score(trainX,trainY))
print('Testing Accuracy :',pipeline_ls.score(testX,testY))

# Predict once and reuse the result for both the matrix and the report.
pipeline_test_pred = pipeline_ls.predict(testX)

# scikit-learn metrics take (y_true, y_pred) in that order: confusion-matrix
# rows are the actual classes and columns the predicted ones, which is what
# the index/column labels below claim. Passing the predictions first would
# transpose the matrix and swap precision with recall in the report.
con_mat = pd.DataFrame(confusion_matrix(testY, pipeline_test_pred),
            columns = ['Predicted:Bad', 'Predicted:Good'],
            index = ['Actual:Bad', 'Actual:Good'])


print('\nCLASSIFICATION REPORT\n')
print(classification_report(testY, pipeline_test_pred,
                            target_names =['Bad','Good']))

print('\nCONFUSION MATRIX')
plt.figure(figsize= (6,4))
sns.heatmap(con_mat, annot = True,fmt='d',cmap="YlGnBu")

### Let's dump the model with pickle.

In [None]:
# Persist the fitted pipeline. The context manager closes the file handle
# (flushing the data to disk) even if pickling raises.
with open('phishing.pkl','wb') as model_file:
    pickle.dump(pipeline_ls, model_file)

In [None]:
# Reload the pickled pipeline and confirm it still scores the test split.
# NOTE: pickle.load can execute arbitrary code, so only load pickle files
# that come from a trusted source.
with open('phishing.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.score(testX,testY)
print(result)

#### That's it. The pkl file can now be deployed to Heroku and used to build an app.
#### If you like the notebook, do upvote.