<a href="https://colab.research.google.com/github/sarkar-sayan/URL-Classification/blob/main/POC_1_URL_Classification_mod.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive (Colab-only import); the dataset loaded
# later in this notebook is read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Trial using keyword matching and language translation

In [None]:
pip install langdetect googletrans==4.0.0-rc1

In [3]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from random import choice
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
import string
from langdetect import detect
from googletrans import Translator

# Initialize Google Translator
# Shared googletrans client used by translate_text_if_needed.
translator = Translator()

# Initialize the TF-IDF vectorizer (not a tokenizer). It is module-level so the
# vocabulary fitted in create_feature_matrix is the same one classify_url
# later uses to transform new pages.
vectorizer = TfidfVectorizer()

In [4]:
# List of user-agents for rotation
# get_metadata_from_url picks one of these at random for each request's
# User-Agent header.
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0',
    # Add more user-agents if needed
]

# Download the stopwords
# The corpus must be downloaded before stopwords.words() can read it.
nltk.download('stopwords')
# English stopwords as a set; preprocess_text filters tokens against it.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
# Sample keywords for productive and non-productive classification
# count_keywords compares these against whole whitespace-separated tokens, so
# only exact matches count (e.g. 'game' does not match the token 'games').
productive_keywords = ['study', 'research', 'education', 'work', 'project', 'python']
non_productive_keywords = ['game', 'social', 'fun', 'entertainment', 'video']

In [7]:
# Function to get metadata from URL
def get_metadata_from_url(url, timeout=10):
    """Fetch a page and extract its title, description, image and paragraph text.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without a timeout a single unresponsive
        host would block the whole scraping run indefinitely.

    Returns
    -------
    dict
        Keys ``title``, ``description``, ``image``, ``url`` and ``text``.
        Missing fields use the placeholders ``'No title'``,
        ``'No description'`` and ``'No image'``. If the request fails
        (including a timeout or an HTTP error status), ``title`` is
        ``'Error'``, ``description`` holds the exception message and
        ``text`` is empty.
    """
    try:
        headers = {'User-Agent': choice(user_agents)}
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        title_tag = soup.find('title')
        title = title_tag.get_text() if title_tag else 'No title'

        # Prefer <meta name="description">, then og:description. A tag that
        # exists but carries no usable ``content`` is skipped, so the result is
        # always a string and never None.
        description = 'No description'
        for meta_attrs in ({'name': 'description'}, {'property': 'og:description'}):
            meta_tag = soup.find('meta', attrs=meta_attrs)
            content = meta_tag.get('content') if meta_tag else None
            if content:
                description = content
                break

        # Same fallback chain for the preview image: og:image, then
        # <link rel="image_src">.
        image = 'No image'
        for tag_name, tag_attrs, value_attr in (
            ('meta', {'property': 'og:image'}, 'content'),
            ('link', {'rel': 'image_src'}, 'href'),
        ):
            image_tag = soup.find(tag_name, attrs=tag_attrs)
            value = image_tag.get(value_attr) if image_tag else None
            if value:
                image = value
                break

        text = ' '.join(p.get_text() for p in soup.find_all('p'))

        return {
            "title": title,
            "description": description,
            "image": image,
            "url": url,
            "text": text
        }
    except requests.RequestException as e:
        # Best-effort scraping: report the failure in the record instead of
        # raising, so one bad URL does not abort a DataFrame-wide apply().
        return {
            "title": "Error",
            "description": str(e),
            "image": "No image",
            "url": url,
            "text": ""
        }

In [8]:
# Preprocess text function
def preprocess_text(text):
    """Lower-case ``text``, delete punctuation and drop English stopwords.

    Falsy input (``None`` or ``""``) yields ``""``. Punctuation characters are
    deleted rather than replaced by spaces, so words joined only by
    punctuation are merged into a single token.
    """
    if not text:
        return ""
    punctuation_deleter = str.maketrans('', '', string.punctuation)
    normalized = text.lower().translate(punctuation_deleter)
    kept_words = filter(lambda word: word not in stop_words, normalized.split())
    return ' '.join(kept_words)

# Translate text if not in English
def translate_text_if_needed(text):
    """Return ``text`` in English, translating it when another language is detected.

    This is best-effort: if language detection or translation raises for any
    reason, the input is returned unchanged.
    """
    try:
        if detect(text) == 'en':
            return text
        return translator.translate(text, dest='en').text
    except Exception:
        # Any failure falls back to the untranslated text.
        return text

# Preprocess metadata function
def preprocess_metadata(metadata):
    """Build one cleaned text string from a metadata dict.

    The ``title``, ``description`` and ``text`` entries (missing keys count as
    empty) are each translated to English if needed and cleaned with
    ``preprocess_text``. They are then joined in that order with single spaces
    and stripped of leading and trailing whitespace.
    """
    cleaned_fields = [
        preprocess_text(translate_text_if_needed(metadata.get(field, '')))
        for field in ('title', 'description', 'text')
    ]
    return ' '.join(cleaned_fields).strip()

In [9]:
# Function to extract domain from URL
def extract_domain(url):
    """Return the host part of ``url`` (including any port or user-info).

    The scheme is optional. Only a leading ``scheme://`` (or a bare leading
    ``//``) is treated as the scheme separator, so a ``//`` that occurs later
    in the path or query string (e.g. a redirect target) no longer changes the
    result. The host ends at the first ``/``, ``?`` or ``#``.
    """
    prefix, separator, remainder = url.partition('://')
    # A real scheme cannot contain URL delimiters; if the text before '://'
    # does, that '://' belongs to the path or query, not to the scheme.
    if not separator or any(delimiter in prefix for delimiter in '/?#'):
        remainder = url[2:] if url.startswith('//') else url
    for delimiter in '/?#':
        remainder = remainder.split(delimiter, 1)[0]
    return remainder

# Keyword matching function
def count_keywords(text, keywords):
    """Count the whitespace-separated tokens of ``text`` that appear in ``keywords``.

    Every occurrence counts, so a keyword that appears twice adds two.
    """
    matches = 0
    for token in text.split():
        if token in keywords:
            matches += 1
    return matches

# Function to create feature matrix
def create_feature_matrix(df, fit=True):
    """Build the dense feature matrix: TF-IDF columns followed by the two keyword counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``clean_content``,
        ``productive_keyword_count`` and ``non_productive_keyword_count``.
    fit : bool, optional
        When True (the default, and the original behaviour) the shared
        module-level ``vectorizer`` is refitted on ``df['clean_content']``.
        Pass False to reuse the vocabulary that is already fitted, e.g. for
        held-out rows, so they get the same columns as the training matrix
        and do not influence the vocabulary or IDF weights.

    Returns
    -------
    numpy.ndarray
        One row per row of ``df``.
    """
    if fit:
        tfidf_matrix = vectorizer.fit_transform(df['clean_content'])
    else:
        tfidf_matrix = vectorizer.transform(df['clean_content'])

    keyword_counts = df[['productive_keyword_count', 'non_productive_keyword_count']].values
    # Keyword counts go last; classify_url assembles its vector in the same order.
    return np.hstack((tfidf_matrix.toarray(), keyword_counts))

In [10]:
# Load Dataset
# The CSV lives on the mounted Google Drive; keep the location in one place.
DATASET_PATH = '/content/drive/MyDrive/Sayan RP files/Datasets/URL_Dataset(Sheet1).csv'
data = pd.read_csv(DATASET_PATH)
data.head()

Unnamed: 0,index,url,label
0,1,https://stackoverflow.com/questions/66341659/e...,Productive
1,2,https://www.anandabazar.com/west-bengal/state-...,Non-productive
2,3,https://www.barandbench.com/news/sim-swap-scam...,Non-productive
3,4,https://www.youtube.com/watch?v=D_qLCwDiRs4\u0...,Non-productive
4,5,https://www.youtube.com/watch?v=UdcPhnNjSEw,Productive


In [11]:
# Inspect the column names of the loaded dataset.
data.columns

Index(['index', 'url', 'label'], dtype='object')

In [12]:
# Summarise row count, dtypes and non-null counts per column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   index   5 non-null      int64 
 1   url     5 non-null      object
 2   label   5 non-null      object
dtypes: int64(1), object(2)
memory usage: 248.0+ bytes


In [13]:
# Keep a reference to the column index as it is before any column is dropped.
cols = data.columns
cols

Index(['index', 'url', 'label'], dtype='object')

In [14]:
# Drop the CSV's own 'index' column. Rebinding instead of inplace=True, plus
# errors='ignore', makes the cell safe to re-run: a second run no longer
# raises KeyError because the column is already gone.
data = data.drop(columns=['index'], errors='ignore')

In [15]:
# Display the frame after dropping the 'index' column.
data

Unnamed: 0,url,label
0,https://stackoverflow.com/questions/66341659/e...,Productive
1,https://www.anandabazar.com/west-bengal/state-...,Non-productive
2,https://www.barandbench.com/news/sim-swap-scam...,Non-productive
3,https://www.youtube.com/watch?v=D_qLCwDiRs4\u0...,Non-productive
4,https://www.youtube.com/watch?v=UdcPhnNjSEw,Productive


In [16]:
# List the distinct class labels present in the dataset.
data['label'].unique()

array(['Productive', 'Non-productive'], dtype=object)

In [17]:
# Preview up to ten rows labelled Non-productive.
data.loc[data['label'] == 'Non-productive'].head(10)

Unnamed: 0,url,label
1,https://www.anandabazar.com/west-bengal/state-...,Non-productive
2,https://www.barandbench.com/news/sim-swap-scam...,Non-productive
3,https://www.youtube.com/watch?v=D_qLCwDiRs4\u0...,Non-productive


In [18]:
# Preview up to ten rows labelled Productive.
data.loc[data['label'] == 'Productive'].head(10)

Unnamed: 0,url,label
0,https://stackoverflow.com/questions/66341659/e...,Productive
4,https://www.youtube.com/watch?v=UdcPhnNjSEw,Productive


In [19]:
# Scrape Content and Metadata
data['metadata'] = data['url'].apply(get_metadata_from_url)
# Apply Preprocessing to Metadata
data['clean_content'] = data['metadata'].apply(preprocess_metadata)
data['domain'] = data['url'].apply(extract_domain)

In [20]:
# Check the scraped metadata, cleaned content and domain columns.
data.head()

Unnamed: 0,url,label,metadata,clean_content,domain
0,https://stackoverflow.com/questions/66341659/e...,Productive,{'title': 'uncaught exception - Electron ipcMa...,uncaught exception electron ipcmain gracefully...,stackoverflow.com
1,https://www.anandabazar.com/west-bengal/state-...,Non-productive,"{'title': 'Error', 'description': '403 Client ...",error 403 client error forbidden url httpswwwa...,www.anandabazar.com
2,https://www.barandbench.com/news/sim-swap-scam...,Non-productive,{'title': 'SIM swap scam: NCDRC restores order...,sim swap scam ncdrc restores order directing a...,www.barandbench.com
3,https://www.youtube.com/watch?v=D_qLCwDiRs4\u0...,Non-productive,{'title': 'East Bengal FC 0 - 1 Mohun Bagan SG...,east bengal fc 0 1 mohun bagan sg final highli...,www.youtube.com
4,https://www.youtube.com/watch?v=UdcPhnNjSEw,Productive,{'title': 'Collections in Python - Advanced Py...,collections python advanced python 06 programm...,www.youtube.com


In [22]:
# Per-row keyword counts: each value depends only on its own row, so computing
# them before the split cannot leak information between train and test.
data['productive_keyword_count'] = data['clean_content'].apply(lambda x: count_keywords(x, productive_keywords))
data['non_productive_keyword_count'] = data['clean_content'].apply(lambda x: count_keywords(x, non_productive_keywords))

# Train/Test Split
# Split the rows BEFORE building TF-IDF features. Fitting the vectorizer on the
# full dataset would let the test rows shape the vocabulary and IDF weights
# (data leakage) and inflate the reported scores.
train_rows, test_rows = train_test_split(data, test_size=0.4, random_state=42, shuffle=True)

# create_feature_matrix fits the shared `vectorizer`, so call it on the
# training rows only.
X_train = create_feature_matrix(train_rows)
y_train = train_rows['label']

# Held-out rows are transformed with the already-fitted vectorizer (never
# refitted), then joined with their keyword counts in the same column order
# used for training.
X_test = np.hstack((
    vectorizer.transform(test_rows['clean_content']).toarray(),
    test_rows[['productive_keyword_count', 'non_productive_keyword_count']].values,
))
y_test = test_rows['label']

# Train Model
model = MultinomialNB()
model.fit(X_train, y_train)

# Evaluate Model
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 1.0
                precision    recall  f1-score   support

Non-productive       1.00      1.00      1.00         1
    Productive       1.00      1.00      1.00         1

      accuracy                           1.00         2
     macro avg       1.00      1.00      1.00         2
  weighted avg       1.00      1.00      1.00         2



In [23]:
def classify_url(url):
    """Scrape ``url`` and return the label predicted by the trained ``model``.

    The feature vector has the same layout as the training matrix: TF-IDF
    columns from the fitted ``vectorizer`` followed by the productive and
    non-productive keyword counts. If the page cannot be fetched,
    ``get_metadata_from_url`` returns an error record, and the prediction is
    then made from that record's text.
    """
    clean_content = preprocess_metadata(get_metadata_from_url(url))
    tfidf_part = vectorizer.transform([clean_content]).toarray()
    keyword_part = np.array([[
        count_keywords(clean_content, keyword_list)
        for keyword_list in (productive_keywords, non_productive_keywords)
    ]])
    features = np.hstack((tfidf_part, keyword_part))
    predictions = model.predict(features)
    return predictions[0]


In [24]:
# Interactive check: read a URL from the user, then scrape and classify it.
# The result depends on what is typed, so this cell is not reproducible from code alone.
new_url = input("Enter url to check for Productive or Non-productive: ")
print(f"The URL {new_url} is classified as {classify_url(new_url)}")

Enter url to check for Productive or Non-productive: https://www.youtube.com/watch?v=th5_9woFJmk
The URL https://www.youtube.com/watch?v=th5_9woFJmk is classified as Productive


In [25]:
# Interactive check: read a URL from the user, then scrape and classify it.
# The result depends on what is typed, so this cell is not reproducible from code alone.
new_url = input("Enter url to check for Productive or Non-productive: ")
print(f"The URL {new_url} is classified as {classify_url(new_url)}")

Enter url to check for Productive or Non-productive: https://www.crazygames.com/
The URL https://www.crazygames.com/ is classified as Non-productive


In [26]:
# Interactive check: read a URL from the user, then scrape and classify it.
# The result depends on what is typed, so this cell is not reproducible from code alone.
new_url = input("Enter url to check for Productive or Non-productive: ")
print(f"The URL {new_url} is classified as {classify_url(new_url)}")

Enter url to check for Productive or Non-productive: https://bengali.indianexpress.com/west-bengal/more-than-5-18-lakh-android-and-8200-ios-users-downloades-metro-ride-kolkata-app-779881/
The URL https://bengali.indianexpress.com/west-bengal/more-than-5-18-lakh-android-and-8200-ios-users-downloades-metro-ride-kolkata-app-779881/ is classified as Non-productive
