In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
import re

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
from imblearn.over_sampling import SMOTE

In [6]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB

In [8]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [9]:
import pickle

In [10]:
from sklearn.preprocessing import StandardScaler

In [11]:
df=pd.read_csv('data.csv')

In [12]:
df.head()

Unnamed: 0,url,label
0,diaryofagameaddict.com,bad
1,espdesign.com.au,bad
2,iamagameaddict.com,bad
3,kalantzis.net,bad
4,slightlyoffcenter.net,bad


In [13]:
df.shape

(420464, 2)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420464 entries, 0 to 420463
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     420464 non-null  object
 1   label   420464 non-null  object
dtypes: object(2)
memory usage: 6.4+ MB


In [15]:
df.isnull().sum()

url      0
label    0
dtype: int64

## Changing Values of Class 

In [16]:
def change_class(i):
    if i=='good':
        return 0
    else:
        return 1

In [17]:
df['label']=df['label'].apply(change_class)

In [18]:
df.head()

Unnamed: 0,url,label
0,diaryofagameaddict.com,1
1,espdesign.com.au,1
2,iamagameaddict.com,1
3,kalantzis.net,1
4,slightlyoffcenter.net,1


In [19]:
df['label'].value_counts()

0    344821
1     75643
Name: label, dtype: int64

## Tokenizing The Url

In [20]:
def tokenizer(i):
    p=''
    x=re.split('[/-]',i)
    if '' in x:
        x.remove('')
    for j in x:
        if j.find('.')>=0:
            y=j.split('.')
            if 'com' in y:
                y.remove('com')
            x=x+y
    return x

## Using Vectorizer

In [21]:
cf=CountVectorizer(tokenizer=tokenizer)

In [22]:
tf=TfidfVectorizer(tokenizer=tokenizer)

In [23]:
with open('vector','wb') as f:
    pickle.dump(tf,f)

In [24]:
x1=cf.fit_transform(df['url'])

In [25]:
x2=tf.fit_transform(df['url'])

In [26]:
y=df['label']
y

0         1
1         1
2         1
3         1
4         1
         ..
420459    1
420460    1
420461    1
420462    1
420463    1
Name: label, Length: 420464, dtype: int64

## As it is unbalanced Data set trying oversampling

In [27]:
sm=SMOTE(random_state=42)

In [28]:
x1_new,y1_new=sm.fit_resample(x1,y)

In [29]:
x2_new,y2_new=sm.fit_resample(x2,y)

In [30]:
cv=ShuffleSplit(n_splits=10,test_size=0.3,random_state=42)

## Trying different ML algorithms to check accuracy

### Count Vectorizer  without oversampling 

In [31]:
x=cross_val_score(LogisticRegression(max_iter=500),x1,y,cv=cv,scoring='accuracy')
print(x)

[0.97020771 0.97074679 0.97072301 0.97047725 0.96957349 0.97084192
 0.97061995 0.97055652 0.97034248 0.9707785 ]


In [32]:
print(cross_val_score(MultinomialNB(),x1,y,cv=cv,scoring='accuracy'))

[0.92057238 0.91998573 0.92058824 0.92042968 0.92146821 0.9205486
 0.92096876 0.92019978 0.92134929 0.92061995]


### Count_vectorizer with oversampling 

In [33]:
print(cross_val_score(LogisticRegression(max_iter=500),x1_new,y1_new,cv=cv,scoring='accuracy'))

[0.94081965 0.94099849 0.93993514 0.94038464 0.94040398 0.93951946
 0.94046198 0.94009464 0.93996897 0.94042814]


In [34]:
print(cross_val_score(MultinomialNB(),x1_new,y1_new,cv=cv,scoring='accuracy'))

[0.86360583 0.8639055  0.94012847 0.94075198 0.94024931 0.86259081
 0.94024931 0.9399303  0.86378466 0.86328682]


### Tfidf Vectorizer without oversampling

In [35]:
print(cross_val_score(MultinomialNB(),x2,y,cv=cv,scoring='accuracy'))

[0.96208181 0.96286666 0.96285873 0.9626209  0.96159822 0.96239099
 0.96216902 0.96170921 0.96191533 0.96251784]


In [36]:
print(cross_val_score(LogisticRegression(max_iter=500),x2,y,cv=cv,scoring='accuracy'))

[0.95726177 0.95837165 0.957785   0.95795148 0.95750753 0.95786428
 0.95825274 0.9579277  0.95719835 0.95791977]


### Tfidf Vectorizer with oversampling

In [37]:
print(cross_val_score(MultinomialNB(),x2_new,y2_new,cv=cv,scoring='accuracy'))

[0.98506958 0.98522908 0.9845379  0.98510341 0.98486657 0.98451373
 0.98479407 0.98497291 0.98517108 0.98491007]


In [38]:
print(cross_val_score(LogisticRegression(max_iter=500),x2_new,y2_new,cv=cv,scoring='accuracy'))

[0.98127051 0.98114001 0.98027966 0.98030866 0.980386   0.9806905
 0.98064217 0.98085484 0.98002349 0.98057933]


## Creating and saving the best model

In [39]:
nb=MultinomialNB()

In [40]:
x_train,x_test,y_train,y_test=train_test_split(x2_new,y2_new,test_size=0.3,random_state=42)

In [41]:
nb.fit(x_train,y_train)

In [42]:
prediction=nb.predict(x_test)

In [43]:
nb.score(x_train,y_train)

0.9966110753207154

In [44]:
confusion_matrix(prediction,y_test)

array([[100499,    322],
       [  2767, 103305]], dtype=int64)

In [45]:
print(classification_report(prediction,y_test))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98    100821
           1       1.00      0.97      0.99    106072

    accuracy                           0.99    206893
   macro avg       0.99      0.99      0.99    206893
weighted avg       0.99      0.99      0.99    206893



### Model is 99% accurate

### Saving file 

In [46]:
with open('url','wb') as f:
    pickle.dump(nb,f)

### Trying new Predcition 

In [47]:
with open('url','rb') as f:
    model=pickle.load(f)

In [48]:
with open('vector','rb') as f:
    vectorizer=pickle.load(f)

In [49]:
def predict(a):
    v=vectorizer.transform([a])
    p=model.predict(v)
    return p

In [None]:
predict('wikipedia.com')

In [None]:
predict('www.itidea.it/centroesteticosothys/img/_notes/gum.exe')

array([1])