In [1]:
from functools import wraps
from flask import Flask
from flask import request, Response
from subprocess import call
from flask import render_template

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import random
from sklearn.feature_extraction.text import TfidfVectorizer
import sys
import os
from sklearn.linear_model import LogisticRegression

import math
from collections import Counter

In [2]:

def entropy(s):
	p, lns = Counter(s), float(len(s))
	return -sum( count/lns * math.log(count/lns, 2) for count in p.values())


def getTokens(input):
	tokensBySlash = str(input).split('/')	#get tokens after splitting by slash
	allTokens = []
	for i in tokensBySlash:
		tokens = str(i).split('-')	#get tokens after splitting by dash
		tokensByDot = []
		for j in range(0,len(tokens)):
			tempTokens = str(tokens[j]).split('.')	#get tokens after splitting by dot
			tokensByDot = tokensByDot + tempTokens
		allTokens = allTokens + tokens + tokensByDot
	allTokens = list(set(allTokens))	#remove redundant tokens
	if 'com' in allTokens:
		allTokens.remove('com')	#removing .com since it occurs a lot of times and it should not be included in our features
	
	return allTokens



In [10]:
 allurls = 'ScrubM_MPA_Approved_Sites_20210414.csv'	#path to our all urls file
#allurls = './data/data.csv'
allurlscsv = pd.read_csv(allurls,',',error_bad_lines=False)	#reading file
allurlsdata = np.array(allurlscsv)	#converting it into an array
allurlsdata


array([['filmux.online', 'www.filmux.online', 'Pirate Site'],
       ['ahmedateeqzia.xyz', 'www1.ahmedateeqzia.xyz', 'Pirate Site'],
       ['123movies-stream.com', 'www.123movies-stream.com',
        'Pirate Site'],
       ...,
       ['fullhdcizgifilmizle.com', 'fullhdcizgifilmizle.com',
        'Infringing Site'],
       ['moviesclub.live', 'www.moviesclub.live', 'Infringing Site'],
       ['movietimes.show', 'movietimes.show', 'Infringing Site']],
      dtype=object)

In [13]:
random.shuffle(allurlsdata)	#shuffling
y = [d[2] for d in allurlsdata]	#all labels 
corpus = [d[1] for d in allurlsdata]	#all urls corresponding to a label (either good or bad)
vectorizer = TfidfVectorizer(tokenizer=getTokens)	#get a vector for each url but use our customized tokenizer
print(y)
#print(corpus)
#print(vectorizer)


['Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate Site', 'Pirate S

In [14]:
X = vectorizer.fit_transform(corpus)	#get the X vector
#print(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)	#split into training and testing set 80/20 ratio



In [15]:
lgs = LogisticRegression(max_iter=900)	#using logistic regression
lgs.fit(X_train, y_train)
print(lgs.score(X_test, y_test))

1.0


In [16]:
lgs.predict_proba(X_test)

array([[2.18516652e-04, 3.14111841e-04, 1.88490651e-04, 6.28621016e-04,
        9.97988760e-01, 6.61500111e-04],
       [2.15947034e-04, 3.10260732e-04, 1.86303529e-04, 6.19908478e-04,
        9.98015388e-01, 6.52192009e-04],
       [2.15617648e-04, 3.09772067e-04, 1.86022939e-04, 6.18813124e-04,
        9.98018973e-01, 6.50801070e-04],
       ...,
       [2.07703574e-04, 2.97918372e-04, 1.79287569e-04, 5.92009008e-04,
        9.98100564e-01, 6.22517142e-04],
       [1.90393849e-04, 2.72072365e-04, 1.64541559e-04, 5.34387813e-04,
        9.98277347e-01, 5.61257519e-04],
       [2.15107683e-04, 3.07848830e-04, 1.85775827e-04, 6.63072086e-04,
        9.97991136e-01, 6.37059980e-04]])


Testing the Prediction


In [17]:

#X_predict = ['wikipedia.com','google.com/search=faizanahad','pakistanifacebookforever.com/getpassword.php/','www.radsport-voggel.de/wp-admin/includes/log.exe','ahrenhei.without-transfer.ru/nethost.exe','www.itidea.it/centroesteticosothys/img/_notes/gum.exe']
#X_predict.append(str(path))
X_predict = ['breakmovies.com','123moviesz.pro','suppsenvamu.myq-see.com','google.com']
X_predict = vectorizer.transform(X_predict)
y_Predict = lgs.predict(X_predict)
print(str(y_Predict))


['Pirate Site' 'Pirate Site' 'Pirate Site' 'Pirate Site']
