In [1]:
from flask import Flask, request, jsonify, render_template
import requests
from bs4 import BeautifulSoup
from gensim import corpora, models, similarities
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_numeric, strip_short
import difflib

app = Flask(__name__)

@app.route('/')
def index():
    return render_template('index.html')

@app.route('/check', methods=['POST'])
def check_plagiarism():
    user_input = request.json.get('text')
    if not user_input:
        return jsonify({"error": "No text provided"}), 400
    
    try:
        result = check_text_for_plagiarism(user_input)
        return jsonify(result)
    except Exception as e:
        return jsonify({"error": str(e)}), 500

def scrape_web(query):
    search_url = f"https://www.google.com/search?q={query}"
    headers = {
        "User-Agent": "Mozilla/5.0"
    }
    response = requests.get(search_url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    results = []
    for g in soup.find_all('a'):
        link = g.get('href')
        if link and 'url?q=' in link:
            results.append(link.split('url?q=')[1].split('&')[0])
    
    return results

def fetch_page_content(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        paragraphs = soup.find_all('p')
        page_content = ' '.join([para.text for para in paragraphs])
        return page_content
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return ""

def preprocess(text):
    custom_filters = [strip_punctuation, strip_numeric, strip_short]
    return preprocess_string(text, custom_filters)

def compare_texts(text1, text2):
    texts = [preprocess(text1), preprocess(text2)]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
    index = similarities.MatrixSimilarity(lsi[corpus])
    
    vec_bow = dictionary.doc2bow(preprocess(text2))
    vec_lsi = lsi[vec_bow]
    
    sims = index[vec_lsi]
    return float(sims[0])

def highlight_text(original, plagiarized):
    original_words = original.split()
    plagiarized_words = plagiarized.split()
    diff = difflib.ndiff(original_words, plagiarized_words)
    highlighted = []
    
    for word in diff:
        if word.startswith('- '):
            highlighted.append(f"<span style='color: red;'>{word[2:]}</span>")
        else:
            highlighted.append(word[2:])
    
    return ' '.join(highlighted)

def check_text_for_plagiarism(text):
    search_results = scrape_web(text[:50])
    results = []
    for url in search_results:
        page_content = fetch_page_content(url)
        if page_content:
            similarity_score = compare_texts(text, page_content)
            if similarity_score > 0:
                results.append({
                    "url": url,
                    "similarity": similarity_score,
                    "content": page_content
                })
    
    if results:
        best_match = max(results, key=lambda x: x['similarity'])
        best_match['similarity'] = round(best_match['similarity'] * 100, 2)  # Convert to percentage
        best_match['highlighted_content'] = highlight_text(text, best_match['content'][:500])
    else:
        best_match = {"url": "", "similarity": 0.0, "highlighted_content": "", "content": ""}

    return best_match

if __name__ == '__main__':
    app.run(debug=True, use_reloader=False)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [29/May/2024 04:52:20] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [29/May/2024 04:52:20] "GET /lib/owlcarousel/assets/owl.carousel.min.css HTTP/1.1" 404 -
127.0.0.1 - - [29/May/2024 04:52:21] "GET /static/feature.jpg HTTP/1.1" 304 -
127.0.0.1 - - [29/May/2024 04:52:22] "GET /static/bg.jpg HTTP/1.1" 200 -
127.0.0.1 - - [29/May/2024 04:52:22] "GET /img/overlay-bottom.png HTTP/1.1" 404 -
127.0.0.1 - - [29/May/2024 04:52:26] "GET /img/overlay-top.png HTTP/1.1" 404 -
127.0.0.1 - - [29/May/2024 04:52:26] "GET /img/favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [29/May/2024 04:52:26] "GET /img/bg-image.jpg HTTP/1.1" 404 -
127.0.0.1 - - [29/May/2024 04:53:26] "GET /img/favicon.ico HTTP/1.1" 404 -


Error fetching https://maps.google.com/maps%3Fq%3DThe%2Bsimilarity%2Bscore%2Bis%2Bnow%2Bmultiplied%2Bby%2B100%2Band%26um%3D1%26ie%3DUTF-8%26ved%3D1t:200713%26ictx%3D111: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error fetching https://www.scribbr.com/frequently-asked-questions/check-document-multiple-times/: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


127.0.0.1 - - [29/May/2024 04:56:51] "POST /check HTTP/1.1" 200 -


Error fetching https://accounts.google.com/ServiceLogin%3Fcontinue%3Dhttps://www.google.com/search%253Fq%253DThe%252Bsimilarity%252Bscore%252Bis%252Bnow%252Bmultiplied%252Bby%252B100%252Band%252B%26hl%3Den: 404 Client Error: Not Found for url: https://accounts.google.com/ServiceLogin%3Fcontinue%3Dhttps://www.google.com/search%253Fq%253DThe%252Bsimilarity%252Bscore%252Bis%252Bnow%252Bmultiplied%252Bby%252B100%252Band%252B%26hl%3Den
