Skip to content

Commit

Permalink
Improved link scraping
Browse files: browse the repository at this point in the history
  • Loading branch information
sakethpathike committed Jun 2, 2024
1 parent 276ce7b commit 0668f82
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 21 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ import com.sakethh.linkora.ui.viewmodels.SettingsScreenVM.Settings.isSendCrashRe
import com.sakethh.linkora.ui.viewmodels.localDB.UpdateVM
import com.sakethh.linkora.utils.ExportImpl
import com.sakethh.linkora.utils.ImportImpl
import com.sakethh.linkora.utils.isNetworkAvailable
import com.sakethh.linkora.utils.linkDataExtractor
import io.ktor.client.HttpClient
import io.ktor.client.request.get
Expand Down Expand Up @@ -482,6 +483,11 @@ class SettingsScreenVM(
isSwitchEnabled = mutableStateOf(false),
onSwitchStateChange = {
dataRefreshState.intValue = 0
if (!isNetworkAvailable(context)) {
// Toast is not supposed to be implemented in the VM and will be removed during the rewrite of this app
Toast.makeText(context, "Network not found", Toast.LENGTH_SHORT).show()
return@SettingsUIElement
}
viewModelScope.launch {
awaitAll(async {
LocalDataBase.localDB.readDao().getAllFromLinksTable().toList()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,8 @@ class CreateVM : ViewModel() {
LocalDataBase.localDB.createDao().addANewLinkToImpLinks(
importantLinks = ImportantLinks(
title = if (SettingsScreenVM.Settings.isAutoDetectTitleForLinksEnabled.value || autoDetectTitle) linkDataExtractor.title else title,
webURL = webURL,
webURL = "http" + webURL.substringAfter("http").substringBefore("?")
.trim(),
baseURL = webURL,
imgURL = linkDataExtractor.imgURL,
infoForSaving = noteForSaving
Expand Down Expand Up @@ -157,7 +158,7 @@ class CreateVM : ViewModel() {
}
val linkData = LinksTable(
title = if (SettingsScreenVM.Settings.isAutoDetectTitleForLinksEnabled.value || autoDetectTitle) _linkDataExtractor.title else title,
webURL = webURL,
webURL = "http" + webURL.substringAfter("http").substringBefore("?").trim(),
baseURL = _linkDataExtractor.baseURL,
imgURL = _linkDataExtractor.imgURL,
infoForSaving = noteForSaving,
Expand Down Expand Up @@ -236,7 +237,7 @@ class CreateVM : ViewModel() {
}
val linkData = LinksTable(
title = if (SettingsScreenVM.Settings.isAutoDetectTitleForLinksEnabled.value || autoDetectTitle) linkDataExtractor.title else title,
webURL = webURL,
webURL = "http" + webURL.substringAfter("http").substringBefore("?").trim(),
baseURL = linkDataExtractor.baseURL,
imgURL = linkDataExtractor.imgURL,
infoForSaving = noteForSaving,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,8 @@ class UpdateVM : ViewModel() {
linkDataExtractor(importantLinks.webURL)
val linksData = ImportantLinks(
title = if (SettingsScreenVM.Settings.isAutoDetectTitleForLinksEnabled.value || autoDetectTitle) linkDataExtractor.title else importantLinks.title,
webURL = importantLinks.webURL,
webURL = "http" + importantLinks.webURL.substringAfter("http")
.substringBefore("?").trim(),
baseURL = linkDataExtractor.baseURL,
imgURL = linkDataExtractor.imgURL,
infoForSaving = importantLinks.infoForSaving
Expand Down
60 changes: 43 additions & 17 deletions app/src/main/java/com/sakethh/linkora/utils/LinkDataExtractor.kt
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@ import android.content.Context
import android.net.ConnectivityManager
import android.net.NetworkCapabilities
import android.os.Build
import com.sakethh.linkora.ui.viewmodels.SettingsScreenVM
import io.ktor.client.request.get
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.withContext
import org.jsoup.Jsoup
Expand All @@ -29,28 +27,56 @@ suspend fun linkDataExtractor(webURL: String): LinkDataExtractor {
}
return withContext(Dispatchers.IO) {
val rawHTML = if (!errorInGivenURL) {
Jsoup.connect(webURL)
.userAgent("Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:125.0) Gecko/20100101 Firefox/125.0")
.referrer("http://www.google.com")
.followRedirects(true)
.header("Accept", "text/html")
.header("Accept-Encoding", "gzip,deflate")
.header(
"Accept-Language",
"it-IT,en;q=0.8,en-US;q=0.6,de;q=0.4,it;q=0.2,es;q=0.2"
)
.header("Connection", "keep-alive")
.ignoreContentType(true).maxBodySize(0).ignoreHttpErrors(true).get().toString()
try {
Jsoup.connect("http" + webURL.substringAfter("http").substringBefore("?").trim())
.userAgent("Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:125.0) Gecko/20100101 Firefox/125.0")
.referrer("http://www.google.com")
.followRedirects(true)
.header("Accept", "text/html")
.header("Accept-Encoding", "gzip,deflate")
.header(
"Accept-Language",
"it-IT,en;q=0.8,en-US;q=0.6,de;q=0.4,it;q=0.2,es;q=0.2"
)
.header("Connection", "keep-alive")
.ignoreContentType(true).maxBodySize(0).ignoreHttpErrors(true).get().toString()
} catch (e: Exception) {
e.printStackTrace()
""
}
} else {
""
}
val imgURL = rawHTML.substringAfter("og:image").substringAfter("content=\"")
.substringBefore("\">").trim().let {
if (SettingsScreenVM.Settings.ktorClient.get(it).status.value == 200) {
val imgURL = rawHTML.split("\n").firstOrNull() {
it.contains("og:image")
}.let {
"http" + it?.substringAfter("http")?.substringBefore("\"")
}.trim().let {
try {
val statusValue = withContext(Dispatchers.IO) {
Jsoup.connect(it)
.userAgent("Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:125.0) Gecko/20100101 Firefox/125.0")
.referrer("http://www.google.com")
.followRedirects(true)
.header("Accept", "text/html")
.header("Accept-Encoding", "gzip,deflate")
.header(
"Accept-Language",
"it-IT,en;q=0.8,en-US;q=0.6,de;q=0.4,it;q=0.2,es;q=0.2"
)
.header("Connection", "keep-alive")
.ignoreContentType(true).maxBodySize(0).ignoreHttpErrors(true).execute()
.statusCode()
}
if (statusValue == 200) {
it
} else {
""
}
} catch (e: Exception) {
e.printStackTrace()
""
}
}
val title =
rawHTML.substringAfter("<title").substringAfter(">").substringBefore("</title>").trim()
Expand Down

0 comments on commit 0668f82

Please sign in to comment.