In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.tokenize import RegexpTokenizer  
from nltk.stem.snowball import SnowballStemmer
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer  
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

In [None]:
# Location of the phishing URL dataset inside the Kaggle input directory.
DATA_PATH = "../input/phishing-site-urls/phishing_site_urls.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

In [None]:
# Count missing values per column (isna is the canonical spelling of isnull).
df.isna().sum()

#### Removing Duplicates in the Dataset

The data set contains some duplicate data. You should probably remove them. Duplicates are an extreme case of nonrandom sampling, and they bias your fitted model. Including them will essentially lead to the model overfitting this subset of points.

In [None]:
# Summary statistics for the frame. For text (object) columns pandas reports
# count / unique / top / freq, so a count larger than unique hints at duplicates.
df.describe()

In [None]:
# Keep only the first occurrence of each fully duplicated row (pandas defaults, spelled out).
df = df.drop_duplicates(subset=None, keep="first")

#### Let's study the data!

The dataset contains a total of 507,196 unique rows and 2 columns. The data consists of URLs, and each URL has a label: 'bad' denotes a phishy URL and 'good' denotes a non-phishy URL.

In [None]:
# Bar chart of how many URLs carry each label (class balance check).
sns.countplot(data=df, x="Label")

### Preprocessing

Now we have to gather all the proper words (tokens) from the URLs using the RegexpTokenizer() class from the nltk library.

We pass "r'[A-Za-z]+'" so that only alphabetic characters are considered when forming tokens.

In [None]:
# Tokenizer that extracts maximal runs of ASCII letters; digits and punctuation act as separators.
tokenizer = RegexpTokenizer(pattern=r'[A-Za-z]+')

In [None]:
# Split every URL into its alphabetic tokens; the bound method is applied element-wise.
df['text_tokenized'] = df['URL'].map(tokenizer.tokenize)

In [None]:
# Inspect the new text_tokenized column next to the original URL.
df.head()

### SnowballStemmer

Snowball is a small string-processing language for writing stemming algorithms; the Snowball stemmer reduces each word to its root form.

Difference Between Porter Stemmer and Snowball Stemmer:
* Snowball Stemmer is more aggressive than Porter Stemmer.
* Some issues in Porter Stemmer were fixed in Snowball Stemmer.
* There is only a little difference in the working of these two.

In [None]:
root_words = SnowballStemmer("english")


def _stem_tokens(tokens):
    """Return a list with the English Snowball stem of each token in ``tokens``."""
    return [root_words.stem(token) for token in tokens]


# Stem every token list produced by the tokenizer step.
df['root_words'] = df['text_tokenized'].map(_stem_tokens)

In [None]:
# Inspect the stemmed tokens in the new root_words column.
df.head()

### Joining all the root words into a sentence
This is done to pass into CountVectorizer function later on.

In [None]:
# Collapse each list of stems into one space-separated string for CountVectorizer.
df['text_sent'] = df['root_words'].map(' '.join)
df.head()

### Visualizing Data
Splitting the data into URLs with bad labels and URLs with good labels.

In [None]:
# Boolean masks selecting each class; the two subsets feed the word clouds below.
is_bad = df['Label'] == 'bad'
is_good = df['Label'] == 'good'
bad_sites = df[is_bad]
good_sites = df[is_good]
bad_sites.head()


In [None]:
# Preview the rows labelled 'good'.
good_sites.head()

In [None]:
# Show ten entries of wordcloud's built-in STOPWORDS collection.
# NOTE(review): if STOPWORDS is a set, which ten entries appear is not guaranteed to be stable across runs.
print(list(STOPWORDS)[:10])

Wordcloud for the good urls.

In [None]:
# Word-cloud corpus: the stemmed text of EVERY good URL, joined into one string.
# str(Series) must not be used here: it returns pandas' truncated repr (a few
# head/tail rows plus index numbers and the "Name/Length/dtype" footer), so the
# cloud would be built from about ten URLs and pandas metadata words.
# reset_index without inplace returns a fresh Series and leaves good_sites untouched.
data = good_sites.text_sent.reset_index(drop=True)
text = ' '.join(data)

# Ignore generic URL fragments in addition to wordcloud's default stop words.
stopwords = set(STOPWORDS).union({'com','http','www'})
wordcloud = WordCloud(width = 800, height = 800, background_color ='white', stopwords = stopwords, max_words = 400, min_font_size = 10).generate(text)

# Render the cloud on a square, axis-free figure.
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.title("Most common words used in Good Urls", fontdict={'size': 20, 'color': 'navy', 'verticalalignment': 'bottom'})
plt.axis("off")
plt.tight_layout(pad = 0)

plt.show()

Wordcloud for the bad urls.

In [None]:
# Word-cloud corpus: the stemmed text of EVERY bad URL, joined into one string.
# str(Series) must not be used here: it returns pandas' truncated repr (a few
# head/tail rows plus index numbers and the "Name/Length/dtype" footer), so the
# cloud would be built from about ten URLs and pandas metadata words.
# reset_index without inplace returns a fresh Series and leaves bad_sites untouched.
data = bad_sites.text_sent.reset_index(drop=True)
text = ' '.join(data)

# Ignore generic URL fragments in addition to wordcloud's default stop words.
stopwords = set(STOPWORDS).union({'com','http','www'})
wordcloud = WordCloud(width = 800, height = 800, background_color ='white', stopwords = stopwords, max_words = 400, min_font_size = 10).generate(text)

# Render the cloud on a square, axis-free figure.
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.title("Most common words used in Bad Urls", fontdict={'size': 20, 'color': 'navy', 'verticalalignment': 'bottom'})
plt.axis("off")
plt.tight_layout(pad = 0)

plt.show()

### Model Creation
#### CountVectorizer

CountVectorizer tokenizes the text (tokenization means breaking a sentence, paragraph, or any other text into words) and performs basic preprocessing such as removing punctuation marks and converting all words to lowercase.


In [None]:
# Learn the vocabulary from the stemmed sentences and build the sparse
# document-term count matrix in one pass.
c = CountVectorizer()
cv = c.fit_transform(df['text_sent'])

The text has been preprocessed, tokenized (word-level tokenization means each word is a separate token), and represented as a sparse matrix. The best part is that single-character tokens such as "I" and "a" are ignored during tokenization.

In [None]:
# Peek at ten terms of the learned vocabulary (vocabulary_ maps term -> column index).
print(list(c.vocabulary_.keys())[:10])

In [None]:
# Vocabulary size = number of distinct terms = number of columns in cv.
# len(c.vocabulary_) is used instead of c.get_feature_names(), which was
# deprecated in scikit-learn 1.0 and removed in 1.2.
print('The length of vocabulary', len(c.vocabulary_))
# cv has one row per URL and one column per vocabulary term.
print('The shape is', cv.shape)

#This means the dataset contains 507196 unique URLs (rows) and 350837 unique words (vocabulary size)

#### Splitting the dataset

In [None]:
# Hold out 30% of the vectorised URLs for evaluation; the fixed seed keeps the split reproducible.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    cv,
    df['Label'],
    test_size=0.3,
    random_state=5,
)

#### KNN (K-Nearest Neighbor Algorithm)

In [None]:
# Build and train the 2-nearest-neighbour classifier in one step (fit returns the estimator).
model = KNeighborsClassifier(n_neighbors=2).fit(Xtrain, Ytrain)
print(model)

In [None]:
# Score the KNN model on the held-out test split (for sklearn classifiers this is mean accuracy).
model.score(Xtest, Ytest)

In [None]:
# Predicted labels for the test split; used by the confusion matrix and report below.
ypred = model.predict(Xtest)

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are the actual classes and
# columns the predicted ones, in sorted label order ('bad', 'good').
# Passing the arguments the other way round transposes the matrix relative to
# the "Actual"/"Predicted" labels used here.
con_mat = pd.DataFrame(confusion_matrix(Ytest, ypred),columns = ['Predicted:Bad', 'Predicted:Good'],index = ['Actual:Bad', 'Actual:Good'])
plt.figure(figsize= (6,4))
sns.heatmap(con_mat, annot = True,fmt='g', cmap="Blues", annot_kws = {'size': 14})

In [None]:
print('\nCLASSIFICATION REPORT\n')
# classification_report expects (y_true, y_pred); with the arguments swapped,
# precision and recall trade places and support counts predictions, not true labels.
print(classification_report(Ytest, ypred, target_names =['Bad','Good']))

#### Logistic Regression
Logistic Regression is a classification algorithm. It's a technique for predicting a binary outcome from a series of independent variables.

A binary outcome is one in which there are only two options: the occurrence occurs (1) or it does not occur (0). Independent variables are variables or factors that have the ability to affect the result (or dependent variable).

When dealing with binary data, the best method of analysis to use is logistic regression. When the performance or dependent variable is dichotomous or categorical in nature (e.g., "yes" or "no," "pass" or "fail," and so on), you're dealing with binary results.


In [None]:
# Very large iteration cap so the solver is not stopped before it converges.
# NOTE(review): 507197 looks like it was derived from the dataset row count; confirm.
LR_MAX_ITER = 507197
lr = LogisticRegression(max_iter=LR_MAX_ITER)
lr.fit(X=Xtrain, y=Ytrain)

In [None]:
# Score the logistic-regression model on the held-out test split (mean accuracy for sklearn classifiers).
lr.score(Xtest,Ytest)

In [None]:
# Predicted labels from logistic regression; this overwrites the KNN predictions held in ypred.
ypred = lr.predict(Xtest)

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are the actual classes and
# columns the predicted ones, in sorted label order ('bad', 'good').
# Passing the arguments the other way round transposes the matrix relative to
# the "Actual"/"Predicted" labels used here.
con_mat = pd.DataFrame(confusion_matrix(Ytest, ypred),columns = ['Predicted:Bad', 'Predicted:Good'],index = ['Actual:Bad', 'Actual:Good'])
plt.figure(figsize= (6,4))
sns.heatmap(con_mat, annot = True,fmt='g', cmap="Blues", annot_kws = {'size': 14})

In [None]:
print('\nCLASSIFICATION REPORT\n')
# classification_report expects (y_true, y_pred); with the arguments swapped,
# precision and recall trade places and support counts predictions, not true labels.
print(classification_report(Ytest, ypred, target_names =['Bad','Good']))

#### Conclusion
From this we can see that Logistic Regression gives better accuracy when compared to the K-Nearest Neighbor algorithm. Hence Logistic Regression is the appropriate algorithm to use for classification of the URLs.

In [None]:
# Re-split on the raw URL strings (same 70/30 ratio and seed) so that the
# pipeline below can do its own vectorisation. This overwrites the earlier split variables.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    df['URL'],
    df['Label'],
    test_size=0.3,
    random_state=5,
)

In [None]:
# Step 1: bag-of-words over alphabetic tokens, with English stop words removed.
url_vectorizer = CountVectorizer(
    tokenizer=RegexpTokenizer(r'[A-Za-z]+').tokenize,
    stop_words='english',
)
# Step 2: logistic regression with the same iteration cap as the standalone model.
url_classifier = LogisticRegression(max_iter=507197)

# Chain both steps so raw URL strings can be fed straight to fit/predict.
pipeline_ls = make_pipeline(url_vectorizer, url_classifier)

In [None]:
# Fit the vectoriser and classifier together on the raw training URLs.
pipeline_ls.fit(Xtrain,Ytrain)

In [None]:
# Hand-picked URLs expected to be classified as phishing ...
bad = [
    'yeniik.com.tr/wp-admin/js/login.alibaba.com/login.jsp.php',
    'fazan-pacir.rs/temp/libraries/ipad',
    'tubemoviez.exe',
    'svision-online.de/mgfi/administrator/components/com_babackup/classes/fx29id1.txt',
]
# ... and URLs expected to be classified as legitimate.
good = [
    'youtube.com/',
    'youtube.com/watch?v=qI0TQJI3vdU',
    'bestbuy.com/',
    'restorevisioncenters.com/html/technology.html',
]

# Run the fitted pipeline over both samples, in the same order as before.
result1, result2 = [pipeline_ls.predict(urls) for urls in (bad, good)]

for predictions in (result1, result2):
    print(predictions)