In [106]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [107]:
import re

In [108]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [109]:
from imblearn.over_sampling import SMOTE

In [110]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split

In [111]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB

In [112]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [113]:
import pickle

In [114]:
from sklearn.preprocessing import StandardScaler

In [115]:
df=pd.read_csv('data.csv')

In [116]:
df.head()

Unnamed: 0,url,label
0,diaryofagameaddict.com,bad
1,espdesign.com.au,bad
2,iamagameaddict.com,bad
3,kalantzis.net,bad
4,slightlyoffcenter.net,bad


In [117]:
df.shape

(420464, 2)

In [118]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420464 entries, 0 to 420463
Data columns (total 2 columns):
url      420464 non-null object
label    420464 non-null object
dtypes: object(2)
memory usage: 6.4+ MB


In [119]:
df.isnull().sum()

url      0
label    0
dtype: int64

## Changing Values of Class 

In [120]:
def change_class(i):
    if i=='good':
        return 0
    else:
        return 1

In [121]:
df['label']=df['label'].apply(change_class)

In [122]:
df.head()

Unnamed: 0,url,label
0,diaryofagameaddict.com,1
1,espdesign.com.au,1
2,iamagameaddict.com,1
3,kalantzis.net,1
4,slightlyoffcenter.net,1


In [123]:
df['label'].value_counts()

0    344821
1     75643
Name: label, dtype: int64

## Tokenizing The Url

In [124]:
def tokenizer(i):
    p=''
    x=re.split('[/-]',i)
    if '' in x:
        x.remove('')
    for j in x:
        if j.find('.')>=0:
            y=j.split('.')
            if 'com' in y:
                y.remove('com')
            x=x+y
    return x

## Using Vectorizer

In [125]:
cf=CountVectorizer(tokenizer=tokenizer)

In [126]:
tf=TfidfVectorizer(tokenizer=tokenizer)

In [155]:
with open('vector','wb') as f:
    pickle.dump(tf,f)

In [128]:
x1=cf.fit_transform(df['url'])

In [129]:
x2=tf.fit_transform(df['url'])

In [130]:
y=df['label']
y

0         1
1         1
2         1
3         1
4         1
         ..
420459    1
420460    1
420461    1
420462    1
420463    1
Name: label, Length: 420464, dtype: int64

## As it is unbalanced Data set trying oversampling

In [131]:
sm=SMOTE(random_state=42)

In [132]:
x1_new,y1_new=sm.fit_resample(x1,y)

In [133]:
x2_new,y2_new=sm.fit_resample(x2,y)

In [134]:
cv=ShuffleSplit(n_splits=10,test_size=0.3,random_state=42)

## Trying different ML algorithms to check accuracy

### Count Vectorizer  without oversampling 

In [135]:
x=cross_val_score(LogisticRegression(max_iter=500),x1,y,cv=cv,scoring='accuracy')
print(x)

[0.97020771 0.97073886 0.97072301 0.97046139 0.96957349 0.97084192
 0.97061995 0.97055652 0.9703504  0.9707785 ]


In [136]:
print(cross_val_score(MultinomialNB(),x1,y,cv=cv,scoring='accuracy'))

[0.92057238 0.91998573 0.92058824 0.92042968 0.92146821 0.9205486
 0.92096876 0.92019978 0.92134929 0.92061995]


### Count_vectorizer with oversampling 

In [137]:
print(cross_val_score(LogisticRegression(max_iter=500),x1_new,y1_new,cv=cv,scoring='accuracy'))

[0.94082448 0.94099365 0.93995447 0.94038464 0.94039914 0.93951463
 0.94046198 0.94009947 0.93996414 0.94041848]


In [138]:
print(cross_val_score(MultinomialNB(),x1_new,y1_new,cv=cv,scoring='accuracy'))

[0.86360583 0.8639055  0.94012847 0.94075198 0.94024931 0.86259081
 0.94024931 0.9399303  0.86378466 0.86328682]


### Tfidf Vectorizer without oversampling

In [139]:
print(cross_val_score(MultinomialNB(),x2,y,cv=cv,scoring='accuracy'))

[0.96208181 0.96286666 0.96285873 0.9626209  0.96159822 0.96239099
 0.96216902 0.96170921 0.96191533 0.96251784]


In [140]:
print(cross_val_score(LogisticRegression(max_iter=500),x2,y,cv=cv,scoring='accuracy'))

[0.95725384 0.95837165 0.957785   0.95795941 0.95751546 0.95786428
 0.95814967 0.95790392 0.95719042 0.95794355]


### Tfidf Vectorizer with oversampling

In [141]:
print(cross_val_score(MultinomialNB(),x2_new,y2_new,cv=cv,scoring='accuracy'))

[0.98506958 0.98522908 0.9845379  0.98510341 0.98486657 0.98451373
 0.98479407 0.98497291 0.98517108 0.98491007]


In [142]:
print(cross_val_score(LogisticRegression(max_iter=500),x2_new,y2_new,cv=cv,scoring='accuracy'))

[0.98128018 0.98114484 0.98028449 0.9803135  0.980386   0.98067117
 0.98053583 0.98085484 0.98003316 0.98057933]


## Creating and saving the best model

In [143]:
nb=MultinomialNB()

In [144]:
x_train,x_test,y_train,y_test=train_test_split(x2_new,y2_new,test_size=0.3,random_state=42)

In [145]:
nb.fit(x_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [146]:
prediction=nb.predict(x_test)

In [147]:
nb.score(x_train,y_train)

0.9966110753207154

In [148]:
confusion_matrix(prediction,y_test)

array([[100499,    322],
       [  2767, 103305]])

In [149]:
print(classification_report(prediction,y_test))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98    100821
           1       1.00      0.97      0.99    106072

    accuracy                           0.99    206893
   macro avg       0.99      0.99      0.99    206893
weighted avg       0.99      0.99      0.99    206893



### Model is 99% accurate

### Saving file 

In [150]:
with open('url','wb') as f:
    pickle.dump(nb,f)

### Trying new Predcition 

In [151]:
with open('url','rb') as f:
    model=pickle.load(f)

In [156]:
with open('vector','rb') as f:
    vectorizer=pickle.load(f)

In [157]:
def predict(a):
    v=vectorizer.transform([a])
    p=model.predict(v)
    return p

In [176]:
predict('wikipedia.com')

array([0])

In [177]:
predict('www.itidea.it/centroesteticosothys/img/_notes/gum.exe')

array([1])