In [2]:
#version 1.0
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from urllib.parse import urlparse, urljoin

def get_text_from_website(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    text = soup.get_text(separator=' ')
    text = re.sub(r'\W+', ' ', text)
    text = ' '.join(text.split())
    return text

def get_css_from_website(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    relevant_css_properties = []

    # Extract CSS from <style> tags
    for style in soup.find_all('style'):
        css_text = style.get_text()
        relevant_css_properties.extend(css_text.split('\n'))

    # Extract CSS from external CSS files linked using <link> tag
    for link in soup.find_all('link', rel='stylesheet'):
        css_url = link.get('href')
        if css_url:
            # Handle relative URLs
            if not bool(urlparse(css_url).netloc):
                css_url = urljoin(url, css_url)
            css_response = requests.get(css_url)
            css_text = css_response.text
            relevant_css_properties.extend(css_text.split('\n'))

    return relevant_css_properties

def calculate_cosine_similarity(tokens1, tokens2):
    vectorizer = CountVectorizer().fit_transform([tokens1, tokens2])
    cosine_sim = cosine_similarity(vectorizer)
    return cosine_sim[0, 1]

def calculate_css_similarity(css1, css2):
    relevant_attributes = ['background-color', 'color', 'font-family', 'font-size','position']
    css_tokens1 = []
    css_tokens2 = []

    # Extract relevant CSS attributes
    for css in css1:
        for attr in relevant_attributes:
            match = re.search(f'{attr}\s*:\s*([^;]+);', css)
            if match:
                css_tokens1.append(match.group(1))

    for css in css2:
        for attr in relevant_attributes:
            match = re.search(f'{attr}\s*:\s*([^;]+);', css)
            if match:
                css_tokens2.append(match.group(1))

    # Combine tokens into strings for vectorization
    css_str1 = ' '.join(css_tokens1)
    css_str2 = ' '.join(css_tokens2)

    # Calculate cosine similarity
    css_similarity = calculate_cosine_similarity(css_str1, css_str2)
    return css_similarity

website1_url = 'https://9anime.com.pl/'
website2_url = 'https://9anime.se/home'

text1 = get_text_from_website(website1_url)
text2 = get_text_from_website(website2_url)

css1 = get_css_from_website(website1_url)
css2 = get_css_from_website(website2_url)

text_similarity = calculate_cosine_similarity(text1, text2)
css_similarity = calculate_css_similarity(css1, css2)

overall_similarity = (text_similarity + css_similarity) / 2 * 100

print("Text Similarity Score:", text_similarity)
print("CSS Similarity Score:", css_similarity)
print("\nOverall Similarity Percentage:", overall_similarity, "%")
# print("\nCSS extracted from website 1:\n", '\n'.join(css1))
# print("*********************************************************")
# print("\nCSS extracted from website 2:\n", '\n'.join(css2))


Text Similarity Score: 0.45233775380448504
CSS Similarity Score: 0.9980797237875587

Overall Similarity Percentage: 72.52087387960219 %


In [1]:
#version 2.0
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from urllib.parse import urlparse, urljoin

def get_text_from_website(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    text = soup.get_text(separator=' ')
    text = re.sub(r'\W+', ' ', text)
    text = ' '.join(text.split())
    return text

def get_css_from_website(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    relevant_css_properties = []

    # Extract CSS from <style> tags
    for style in soup.find_all('style'):
        css_text = style.get_text()
        relevant_css_properties.extend(css_text.split('\n'))

    # Extract CSS from external CSS files linked using <link> tag
    for link in soup.find_all('link', rel='stylesheet'):
        css_url = link.get('href')
        if css_url:
            # Handle relative URLs
            if not bool(urlparse(css_url).netloc):
                css_url = urljoin(url, css_url)
            css_response = requests.get(css_url)
            css_text = css_response.text
            relevant_css_properties.extend(css_text.split('\n'))

    return relevant_css_properties

def calculate_cosine_similarity(tokens1, tokens2):
    vectorizer = TfidfVectorizer().fit([tokens1, tokens2])
    tfidf_matrix = vectorizer.transform([tokens1, tokens2])
    cosine_sim = cosine_similarity(tfidf_matrix)
    return cosine_sim[0, 1]

def calculate_css_similarity(css1, css2):
    css_str1 = ' '.join(css1)
    css_str2 = ' '.join(css2)

    # Calculate cosine similarity
    css_similarity = calculate_cosine_similarity(css_str1, css_str2)
    return css_similarity


website1_url = 'http://127.0.0.1:5500/84_Spotify_Clone/index.html'
website2_url = 'https://open.spotify.com'

text1 = get_text_from_website(website1_url)
text2 = get_text_from_website(website2_url)

css1 = get_css_from_website(website1_url)
css2 = get_css_from_website(website2_url)

text_similarity = calculate_cosine_similarity(text1, text2)
css_similarity = calculate_css_similarity(css1, css2)

overall_similarity = (text_similarity + css_similarity) / 2 * 100

print("Text Similarity Score:", text_similarity)
print("CSS Similarity Score:", css_similarity)
print("\nOverall Similarity Percentage:", overall_similarity, "%")
# print("\nCSS extracted from website 1:\n", '\n'.join(css1))
# print("*********************************************************")
# print("\nCSS extracted from website 2:\n", '\n'.join(css2))


Text Similarity Score: 0.028049975708856
CSS Similarity Score: 0.07189307184903505

Overall Similarity Percentage: 4.997152377894552 %
