-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
120 lines (95 loc) · 3.79 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import difflib
import os
import re
from collections import Counter

import pytesseract as tess
import streamlit as st
from PIL import Image
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from textblob import TextBlob
# Point pytesseract at the Tesseract executable.
# Resolution order:
#   1. the TESSERACT_CMD environment variable, when set;
#   2. the conventional Windows install location, when that file exists;
#   3. otherwise leave pytesseract's own default untouched.
# Assigning the Windows path unconditionally would break OCR on any machine
# where Tesseract is installed elsewhere.
_DEFAULT_WINDOWS_TESSERACT = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
_tesseract_cmd = os.environ.get("TESSERACT_CMD")
if not _tesseract_cmd and os.path.isfile(_DEFAULT_WINDOWS_TESSERACT):
    _tesseract_cmd = _DEFAULT_WINDOWS_TESSERACT
if _tesseract_cmd:
    tess.pytesseract.tesseract_cmd = _tesseract_cmd
def analyze_sentiment(text):
    """Label *text* by the sign of its TextBlob polarity score.

    Returns "Positive" for a score above zero, "Negative" for a score below
    zero, and "Neutral" otherwise.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return "Positive"
    if polarity < 0:
        return "Negative"
    return "Neutral"
def extract_text(image):
    """Run Tesseract OCR on *image* and return the recognised text.

    Args:
        image: The image to pass to ``pytesseract.image_to_string``.

    Returns:
        The recognised text with leading/trailing whitespace removed, or
        ``None`` if OCR failed. Failures are reported in the Streamlit UI
        rather than raised.
    """
    try:
        text = tess.image_to_string(image)
    except tess.TesseractNotFoundError:
        # The most common setup problem deserves an actionable message.
        st.error("An error occurred: the Tesseract executable could not be found.")
        st.warning("Please install Tesseract OCR or check the configured path to its executable.")
        return None
    except Exception as e:
        # UI boundary: surface any other OCR failure to the user instead of
        # crashing the app. This block makes no network request, so the
        # advice no longer mentions the internet connection.
        st.error(f"An error occurred: {str(e)}")
        st.warning("Please try uploading another image.")
        return None
    return text.strip()
def spell_check(text):
    """Return *text* with likely misspellings corrected.

    The correction is delegated to ``TextBlob.correct()``. Matching each word
    against the stop-word list instead would rewrite ordinary content words
    into whichever stop word happens to look similar, corrupting the text
    that the sentiment and keyword steps consume.

    Args:
        text: The text to correct.

    Returns:
        The corrected text as a plain ``str``.
    """
    if not text:
        return ""
    return str(TextBlob(text).correct())
def extract_keywords(text):
    """Pick the most frequent keywords out of *text*.

    The text is lower-cased and split into word tokens. A token counts as a
    keyword when it is not an English stop word, is longer than three
    characters, and is not made up solely of digits.

    Returns:
        A ``(top_keywords, word_freq)`` tuple: the up-to-30 most common
        keywords in descending frequency order, and the ``Counter`` holding
        the frequency of every keyword.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())
    frequencies = Counter(
        token
        for token in tokens
        if token not in ENGLISH_STOP_WORDS
        and len(token) > 3
        and not token.isdigit()
    )
    ranked = [token for token, _count in frequencies.most_common(30)]
    return ranked, frequencies
def main():
    """Run the Streamlit page: upload an image, OCR it, and offer downloads.

    After the user uploads an image and presses "Extract Text", the text is
    extracted, spell-checked, sentiment-labelled and mined for keywords.
    The results are kept in ``st.session_state`` so that they, and the
    download buttons, remain on the page when a later interaction (such as
    clicking one of the download buttons) makes Streamlit rerun the script;
    ``st.button`` is only true on the run that immediately follows its click.
    """
    state_key = "image2text_analysis"

    st.title("Image2Text Analyzer")

    # Upload Image
    st.header("Choose an image...")
    uploaded_file = st.file_uploader("", type=["jpg", "jpeg", "png"])
    if uploaded_file is None:
        return

    # Open the uploaded image
    image = Image.open(uploaded_file)

    # Tag results with the upload they came from, so that results of a
    # previous image are never shown for a newly uploaded one.
    source_id = (uploaded_file.name, uploaded_file.size)

    # Button to trigger text extraction
    if st.button("Extract Text"):
        extracted_text = extract_text(image)
        if extracted_text:
            corrected_text = spell_check(extracted_text)
            keywords, _word_freq = extract_keywords(corrected_text)
            st.session_state[state_key] = {
                "source": source_id,
                "extracted_text": extracted_text,
                "corrected_text": corrected_text,
                "sentiment": analyze_sentiment(corrected_text),
                "keywords_text": "\n".join(keywords),
            }
        elif state_key in st.session_state:
            # Extraction failed or found nothing: drop any earlier results.
            del st.session_state[state_key]

    if state_key not in st.session_state:
        return
    analysis = st.session_state[state_key]
    if analysis["source"] != source_id:
        return

    st.subheader(f"Sentiment: {analysis['sentiment']}")

    # Provide download buttons for the extracted text, corrected text and keywords
    col1, col2, col3 = st.columns(3)
    with col1:
        st.download_button(
            label="Download Extracted Text",
            data=analysis["extracted_text"],
            file_name="extracted_text.txt",
            mime="text/plain"
        )
    with col2:
        st.download_button(
            label="Download Corrected Text",
            data=analysis["corrected_text"],
            file_name="corrected_text.txt",
            mime="text/plain"
        )
    with col3:
        st.download_button(
            label="Download Keywords",
            data=analysis["keywords_text"],
            file_name="extracted_keywords.txt",
            mime="text/plain"
        )
# Start the app only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    main()