In [1]:
from flask import Flask, request, jsonify, render_template
import requests
from bs4 import BeautifulSoup
from gensim import corpora, models, similarities
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_numeric, strip_short

app = Flask(__name__)

@app.route('/')
def index():
    return render_template('index.html')

@app.route('/check', methods=['POST'])
def check_plagiarism():
    user_input = request.json.get('text')
    if not user_input:
        return jsonify({"error": "No text provided"}), 400
    
    try:
        result = check_text_for_plagiarism(user_input)
        return jsonify(result)
    except Exception as e:
        return jsonify({"error": str(e)}), 500

def scrape_web(query):
    search_url = f"https://www.google.com/search?q={query}"
    headers = {
        "User-Agent": "Mozilla/5.0"
    }
    response = requests.get(search_url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    results = []
    for g in soup.find_all('a'):
        link = g.get('href')
        if link and 'url?q=' in link:
            results.append(link.split('url?q=')[1].split('&')[0])
    
    return results

def fetch_page_content(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        paragraphs = soup.find_all('p')
        page_content = ' '.join([para.text for para in paragraphs])
        return page_content
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return ""

def preprocess(text):
    custom_filters = [strip_punctuation, strip_numeric, strip_short]
    return preprocess_string(text, custom_filters)

def compare_texts(text1, text2):
    texts = [preprocess(text1), preprocess(text2)]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
    index = similarities.MatrixSimilarity(lsi[corpus])
    
    vec_bow = dictionary.doc2bow(preprocess(text2))
    vec_lsi = lsi[vec_bow]
    
    sims = index[vec_lsi]
    return float(sims[0])

def check_text_for_plagiarism(text):
    search_results = scrape_web(text[:50])
    results = []
    for url in search_results:
        page_content = fetch_page_content(url)
        if page_content:
            similarity_score = compare_texts(text, page_content)
            if similarity_score > 0:
                results.append({
                    "url": url,
                    "similarity": similarity_score,
                    "content": page_content[:500]  # Limiting to the first 500 characters for brevity
                })
    
    if results:
        best_match = max(results, key=lambda x: x['similarity'])
        best_match['similarity'] = round(best_match['similarity'], 4)
    else:
        best_match = {"url": "", "similarity": 0.0, "content": ""}

    return best_match

if __name__ == '__main__':
    app.run(debug=True, use_reloader=False)

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [21/May/2024 23:30:18] "GET / HTTP/1.1" 200 -


Error fetching https://maps.google.com/maps%3Fq%3Dapplication%2Bof%2Bbleach%26um%3D1%26ie%3DUTF-8%26ved%3D1t:200713%26ictx%3D111: 404 Client Error: Not Found for url: https://maps.google.com/maps%3Fq%3Dapplication%2Bof%2Bbleach%26um%3D1%26ie%3DUTF-8%26ved%3D1t:200713%26ictx%3D111
Error fetching https://dengarden.com/cleaning/22-Great-Uses-for-Household-Bleach: 403 Client Error: Forbidden for url: https://dengarden.com/cleaning/22-Great-Uses-for-Household-Bleach
Error fetching https://www.thespruce.com/is-bleach-a-great-choice-as-a-cleaner-1900778: 406 Client Error: Not Acceptable for url: https://www.thespruce.com/is-bleach-a-great-choice-as-a-cleaner-1900778
Error fetching https://www.rd.com/article/12-smart-ways-to-use-bleach/: 403 Client Error: Forbidden for url: https://www.rd.com/article/12-smart-ways-to-use-bleach/
Error fetching https://www.readersdigest.ca/home-garden/tips/5-things-do-bleach/: 403 Client Error: Forbidden for url: https://www.readersdigest.ca/home-garden/tips/5-

127.0.0.1 - - [21/May/2024 23:31:40] "POST /check HTTP/1.1" 200 -


Error fetching https://accounts.google.com/ServiceLogin%3Fcontinue%3Dhttps://www.google.com/search%253Fq%253Dapplication%252Bof%252Bbleach%26hl%3Den: 404 Client Error: Not Found for url: https://accounts.google.com/ServiceLogin%3Fcontinue%3Dhttps://www.google.com/search%253Fq%253Dapplication%252Bof%252Bbleach%26hl%3Den


127.0.0.1 - - [21/May/2024 23:52:25] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [21/May/2024 23:52:25] "GET /lib/owlcarousel/assets/owl.carousel.min.css HTTP/1.1" 404 -
127.0.0.1 - - [21/May/2024 23:52:26] "GET /img/overlay-bottom.png HTTP/1.1" 404 -
127.0.0.1 - - [21/May/2024 23:52:26] "GET /header.jpg HTTP/1.1" 404 -
127.0.0.1 - - [21/May/2024 23:52:26] "GET /img/bg-image.jpg HTTP/1.1" 404 -
127.0.0.1 - - [21/May/2024 23:52:26] "GET /img/overlay-top.png HTTP/1.1" 404 -
127.0.0.1 - - [21/May/2024 23:52:27] "GET /img/favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [21/May/2024 23:52:27] "GET /img/favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [21/May/2024 23:59:20] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [21/May/2024 23:59:20] "GET /lib/owlcarousel/assets/owl.carousel.min.css HTTP/1.1" 404 -
127.0.0.1 - - [21/May/2024 23:59:21] "GET /header.jpg HTTP/1.1" 404 -
127.0.0.1 - - [21/May/2024 23:59:21] "GET /img/overlay-bottom.png HTTP/1.1" 404 -
127.0.0.1 - - [21/May/2024 23:59:21] "GET /img/bg-image.jpg HTTP/

Error fetching https://maps.google.com/maps%3Fq%3DOur%2Badvanced%2Balgorithm%2Bchecks%2Bfor%2Bduplicates%2Band%2Bs%26um%3D1%26ie%3DUTF-8%26ved%3D1t:200713%26ictx%3D111: 404 Client Error: Not Found for url: https://maps.google.com/maps%3Fq%3DOur%2Badvanced%2Balgorithm%2Bchecks%2Bfor%2Bduplicates%2Band%2Bs%26um%3D1%26ie%3DUTF-8%26ved%3D1t:200713%26ictx%3D111
Error fetching https://www.avrfreaks.net/s/topic/a5C3l000000UboXEAS/t159149: HTTPSConnectionPool(host='www.avrfreaks.net', port=443): Max retries exceeded with url: /s/topic/a5C3l000000UboXEAS/t159149 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1006)')))
Error fetching https://www.researchgate.net/publication/300432909_Advanced_Algorithms_for_Efficient_Approximate_Duplicate_Detection_in_Data_Streams_Using_Bloom_Filters: 403 Client Error: Forbidden for url: https://www.researchgate.net/publication/300432909_Advanced_Algorith

127.0.0.1 - - [22/May/2024 00:00:33] "POST /check HTTP/1.1" 200 -


Error fetching https://accounts.google.com/ServiceLogin%3Fcontinue%3Dhttps://www.google.com/search%253Fq%253DOur%252Badvanced%252Balgorithm%252Bchecks%252Bfor%252Bduplicates%252Band%252Bs%26hl%3Den: 404 Client Error: Not Found for url: https://accounts.google.com/ServiceLogin%3Fcontinue%3Dhttps://www.google.com/search%253Fq%253DOur%252Badvanced%252Balgorithm%252Bchecks%252Bfor%252Bduplicates%252Band%252Bs%26hl%3Den


127.0.0.1 - - [22/May/2024 00:15:09] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [22/May/2024 00:15:10] "GET /lib/owlcarousel/assets/owl.carousel.min.css HTTP/1.1" 404 -
127.0.0.1 - - [22/May/2024 00:15:10] "GET /static/feature.jpg HTTP/1.1" 200 -
127.0.0.1 - - [22/May/2024 00:15:11] "GET /img/overlay-bottom.png HTTP/1.1" 404 -
127.0.0.1 - - [22/May/2024 00:15:11] "GET /header.jpg HTTP/1.1" 404 -
127.0.0.1 - - [22/May/2024 00:15:11] "GET /img/bg-image.jpg HTTP/1.1" 404 -
127.0.0.1 - - [22/May/2024 00:15:11] "GET /img/overlay-top.png HTTP/1.1" 404 -
127.0.0.1 - - [22/May/2024 00:15:11] "GET /img/favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [22/May/2024 00:15:29] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [22/May/2024 00:15:29] "GET /lib/owlcarousel/assets/owl.carousel.min.css HTTP/1.1" 404 -
127.0.0.1 - - [22/May/2024 00:15:29] "GET /static/feature.jpg HTTP/1.1" 304 -
127.0.0.1 - - [22/May/2024 00:15:30] "GET /header.jpg HTTP/1.1" 404 -
127.0.0.1 - - [22/May/2024 00:15:30] "GET /img/overlay-bottom.png 

Error fetching https://maps.google.com/maps%3Fq%3DEnter%2Btext%2Bto%2Bcheck%26um%3D1%26ie%3DUTF-8%26ved%3D1t:200713%26ictx%3D111: 404 Client Error: Not Found for url: https://maps.google.com/maps%3Fq%3DEnter%2Btext%2Bto%2Bcheck%26um%3D1%26ie%3DUTF-8%26ved%3D1t:200713%26ictx%3D111
Error fetching https://quillbot.com/grammar-check: 403 Client Error: Forbidden for url: https://quillbot.com/grammar-check
Error fetching https://www.oxfordlearnersdictionaries.com/text-checker/: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error fetching https://quillbot.com/punctuation-checker: 403 Client Error: Forbidden for url: https://quillbot.com/punctuation-checker
Error fetching https://www.reverso.net/spell-checker/english-spelling-grammar/: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error fetching https://www.reverso.net/spell-checker/french-spelling-grammar/: ('Connection aborted.', RemoteDisconnected(

127.0.0.1 - - [22/May/2024 00:54:08] "POST /check HTTP/1.1" 200 -


Error fetching https://accounts.google.com/ServiceLogin%3Fcontinue%3Dhttps://www.google.com/search%253Fq%253DEnter%252Btext%252Bto%252Bcheck%26hl%3Den: 404 Client Error: Not Found for url: https://accounts.google.com/ServiceLogin%3Fcontinue%3Dhttps://www.google.com/search%253Fq%253DEnter%252Btext%252Bto%252Bcheck%26hl%3Den


127.0.0.1 - - [22/May/2024 00:56:05] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [22/May/2024 00:56:05] "GET /lib/owlcarousel/assets/owl.carousel.min.css HTTP/1.1" 404 -
127.0.0.1 - - [22/May/2024 00:56:05] "GET /static/feature.jpg HTTP/1.1" 304 -
127.0.0.1 - - [22/May/2024 00:56:06] "GET /static/bg.jpg HTTP/1.1" 304 -
127.0.0.1 - - [22/May/2024 00:56:06] "GET /img/bg-image.jpg HTTP/1.1" 404 -
127.0.0.1 - - [22/May/2024 00:56:06] "GET /img/overlay-top.png HTTP/1.1" 404 -
127.0.0.1 - - [22/May/2024 00:56:06] "GET /img/overlay-bottom.png HTTP/1.1" 404 -
127.0.0.1 - - [22/May/2024 00:56:07] "GET /img/favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [22/May/2024 00:57:31] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [22/May/2024 00:57:31] "GET /lib/owlcarousel/assets/owl.carousel.min.css HTTP/1.1" 404 -
127.0.0.1 - - [22/May/2024 00:57:31] "GET /static/feature.jpg HTTP/1.1" 304 -
127.0.0.1 - - [22/May/2024 00:57:31] "GET /img/overlay-bottom.png HTTP/1.1" 404 -
127.0.0.1 - - [22/May/2024 00:57:31] "GET /img/bg-i