## **Importing Libraries**

In [1]:
import pandas as pd
import numpy as np
import random 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

### Exploratory Data Analysis

In [2]:
df = pd.read_csv("valid&fake_urls.csv")
df.head()

Unnamed: 0,urls,labels
0,chouftv.ma,0
1,india.com,0
2,google.cl,0
3,merdeka.com,0
4,yandex.com.tr,0


In [3]:
df.shape

(2000, 2)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   urls    2000 non-null   object
 1   labels  2000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 31.4+ KB


In [5]:
df["labels"].value_counts()

0    1000
1    1000
Name: labels, dtype: int64

0 ===> not a fake site

1 ===> fake site

#### Shuffling the Dataset

In [6]:
df = df.sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,urls,labels
0,couriermailpost.com,1
1,teacuppuppyfamily.com,1
2,eibae.net,1
3,kingpethouse.com,1
4,apexinvestbank.com,1


### Data Vectorization using TfidfVectorizer

In [7]:
def customtkns(t):
    tkns_byslash = str(t.encode("utf-8")).split("/")
    total_tokens = []
    for i in tkns_byslash:
        tokens = str(i).split("-")
        tkns_bydot = []
        for j in range(0,len(tokens)):
            temp_tkns = str(tokens[j]).split(".")
            tkns_bydot = tkns_bydot + temp_tkns
        total_tokens = total_tokens + tokens + tkns_bydot
    total_tokens = list(set(total_tokens))
    if "com" in total_tokens:
        total_tokens.remove("com")
    return total_tokens

### Vectorizing the urls

In [8]:
df = np.array(df)
random.shuffle(df)

In [9]:
x = [d[0] for d in df]
y = [d[1] for d in df]

In [10]:
#x = df["urls"]
#y = df["labels"]

In [11]:
vectorizer = TfidfVectorizer(tokenizer=customtkns)

In [12]:
X = vectorizer.fit_transform(x)

In [38]:
xtrain, xtest,ytrain,ytest = train_test_split(X,y,stratify=y,test_size = 33, random_state=1)

### LogisticRegression

In [39]:
log = LogisticRegression()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
svm = SVC(kernel='rbf', C=100)

##### Train Accuracy

In [40]:
def modeltrain(model,name):
    model.fit(xtrain, ytrain)
    xtrain_predict = model.predict(xtrain)
    train_accuracy = accuracy_score(xtrain_predict,ytrain)
    print(name,"Train Data Accuracy Score: ",train_accuracy)
modeltrain(log,"LogisticRegression")
modeltrain(dt,"DecisionTreeClassifier")
modeltrain(rf,"RandomForestClassifier")
modeltrain(svm,"SupportVectorClassifier")

LogisticRegression Train Data Accuracy Score:  0.9989832231825114
DecisionTreeClassifier Train Data Accuracy Score:  1.0
RandomForestClassifier Train Data Accuracy Score:  1.0
SupportVectorClassifier Train Data Accuracy Score:  1.0


##### Test Accuracy

In [41]:
def modeltest(model,name):
    xtest_predict = model.predict(xtest)
    test_accuracy = accuracy_score(xtest_predict,ytest)
    print(name,"Test Data Accuracy Score: ",test_accuracy)
modeltest(log,"LogisticRegression")
modeltest(dt,"DecisionTreeClassifier")
modeltest(rf,"RandomForestClassifier")
modeltest(svm,"SupportVectorClassifier")

LogisticRegression Test Data Accuracy Score:  1.0
DecisionTreeClassifier Test Data Accuracy Score:  0.9696969696969697
RandomForestClassifier Test Data Accuracy Score:  1.0
SupportVectorClassifier Test Data Accuracy Score:  0.9696969696969697


### Model_Deployment

In [42]:
link = ["javatpoint.com/","https://github.com/","https://www.axiomtrustcitybank.com","http://www.melofrenchbulldog.com","https://www.ngi-sdnbhd.com","https://www.programiz.com/"]

In [43]:
link = np.array(link)
X_predict1 = vectorizer.transform(link)
xp = dt.predict(X_predict1)
print(xp)

[0 1 1 1 1 1]


In [19]:
#Predict Model
Predict_url = np.array(link)
Predict_url = vectorizer.transform(Predict_url)
value = dt.predict(Predict_url)
if value == [0]:
    return f'{url}: The url is a Legit Site'
elif value == [1]:
    return f'{url}: The url is a Fake Site'
else:
    return "Enter a valid url"
    
urls = [input(str("Enter a url:" ))]


SyntaxError: 'return' outside function (<ipython-input-19-ce32d81298bf>, line 6)

In [None]:
    link = np.array(url)
    X_predict = vectorizer.transform(link)
    value = dt.predict(X_predict1)
    if value == [0]:
        return f'{url}: The url is a Legit Site'
    elif value == [1]:
        return f'{url}: The url is a Fake Site'
    else:
        return "Enter a valid url"

get_url = [input(str("Enter a url:" ))]

FakeSiteDetection(get_urls)