Merge pull request #69 from shaikhsajid1111/browser-profile-remove
Removed browser profile feature
shaikhsajid1111 committed Jun 4, 2023
2 parents c526887 + 63808d8 commit 9ce2641
Showing 5 changed files with 11 additions and 42 deletions.
12 changes: 0 additions & 12 deletions README.md
@@ -444,11 +444,6 @@ Output:
<td>Boolean</td>
<td>Whether to run the crawler headlessly. Default is <code>True</code></td>
</tr>
- <tr>
- <td>browser_profile</td>
- <td>String</td>
- <td>Path to the browser profile where cookies are stored and can be used for scraping data in an authenticated way.</td>
- </tr>
</tbody>
</table>

@@ -830,11 +825,6 @@ Output:
<td>Boolean</td>
<td>Whether to run the crawler headlessly. Default is <code>True</code></td>
</tr>
- <tr>
- <td>browser_profile</td>
- <td>String</td>
- <td>Path to the browser profile where cookies are stored and can be used for scraping data in an authenticated way.</td>
- </tr>
</tbody>
</table>
</div>
@@ -1049,8 +1039,6 @@ data = scrape_topic(filename="steamdeck", url='https://twitter.com/i/topics/1415
| tweets_count | int | Number of posts to scrape. Default 10. |
| output_format | str | The output format, whether JSON or CSV. Default json. |
| directory | str | Directory to save output file. Default current working directory. |
- | browser_profile | str | Path to the browser profile where cookies are stored and can be used for scraping data in an authenticated way. |
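With the browser_profile row gone, a call to this function relies only on the remaining parameters. A minimal sketch (the topic URL is a placeholder, and it assumes the package exposes scrape_topic at the top level):

```python
from twitter_scraper_selenium import scrape_topic

# Placeholder topic URL, shown only to illustrate the updated signature;
# browser_profile is no longer an accepted argument.
data = scrape_topic(
    url="https://twitter.com/i/topics/<topic-id>",  # placeholder
    filename="steamdeck",
    tweets_count=10,        # number of posts to scrape, default 10
    output_format="json",   # "json" or "csv"
)
```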

<br>
<hr>
<div id="to-scrape-user-tweets-with-api">
12 changes: 1 addition & 11 deletions twitter_scraper_selenium/driver_initialization.py
@@ -23,34 +23,24 @@


class Initializer:
-    def __init__(self, browser_name: str, headless: bool, proxy: Union[str, None] = None, profile: Union[str, None] = None):
+    def __init__(self, browser_name: str, headless: bool, proxy: Union[str, None] = None):
        """Initialize Browser
        Args:
            browser_name (str): Browser Name
            headless (bool): Whether to run the browser in headless mode
            proxy (Union[str, None], optional): Optional parameter, if the user wants to use a proxy for scraping. If the proxy is an authenticated proxy, the format is username:password@host:port. Defaults to None.
-            profile (Union[str, None], optional): Path of Browser Profile where cookies might be located to scrap data in authenticated way. Defaults to None.
        """
        self.browser_name = browser_name
        self.proxy = proxy
        self.headless = headless
-        self.profile = profile

    def set_properties(self, browser_option):
        """adds capabilities to the driver"""
        header = Headers().generate()['User-Agent']
        if self.headless:
            # runs browser in headless mode
            browser_option.add_argument("--headless")
-        if self.profile and self.browser_name.lower() == "chrome":
-            browser_option.add_argument(
-                "user-data-dir={}".format(self.profile))
-        if self.profile and self.browser_name.lower() == "firefox":
-            logger.setLevel(logging.INFO)
-            logger.info("Loading Profile from {}".format(self.profile))
-            browser_option.add_argument("-profile")
-            browser_option.add_argument(self.profile)
        browser_option.add_argument('--no-sandbox')
        browser_option.add_argument("--disable-dev-shm-usage")
        browser_option.add_argument('--ignore-certificate-errors')
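With this block gone, the library no longer loads a browser profile for authenticated scraping. Users who relied on profile cookies would have to manage the driver themselves; a minimal sketch of the same pattern in plain Selenium (assuming Selenium 4 with Chrome installed; the profile path is a placeholder):

```python
from selenium import webdriver

# Recreate the removed behaviour outside the library: point Chrome at an
# existing user-data directory so its stored cookies are available.
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("user-data-dir=/path/to/chrome/profile")  # placeholder path
driver = webdriver.Chrome(options=options)
driver.get("https://twitter.com")
```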
12 changes: 4 additions & 8 deletions twitter_scraper_selenium/keyword.py
@@ -23,7 +23,7 @@ class Keyword:
    """This class needs to be instantiated in order to find something
    on twitter related to keywords"""

-    def __init__(self, keyword: str, browser: str, proxy: Union[str, None], tweets_count: int, url: Union[str, None], headless: bool, browser_profile: Union[str, None]):
+    def __init__(self, keyword: str, browser: str, proxy: Union[str, None], tweets_count: int, url: Union[str, None], headless: bool):
        """Scrape Tweet using keyword.
        Args:
@@ -33,7 +33,6 @@ def __init__(self, keyword: str, browser: str, proxy: Union[str, None], tweets_c
            tweets_count (int): Number of tweets to scrape
            url (Union[str, None]): URL of the webpage.
            headless (bool): Whether to run the browser in headless mode.
-            browser_profile (Union[str, None]): Path of Browser Profile where cookies might be located to scrap data in authenticated way.
        """
        self.keyword = keyword
        self.URL = url
@@ -44,12 +43,11 @@ def __init__(self, keyword: str, browser: str, proxy: Union[str, None], tweets_c
        self.posts_data = {}
        self.retry = 10
        self.headless = headless
-        self.browser_profile = browser_profile

    def start_driver(self):
        """changes the class member driver value to driver on call"""
        self.driver = Initializer(
-            self.browser, self.headless, self.proxy, self.browser_profile).init()
+            self.browser, self.headless, self.proxy).init()

    def close_driver(self):
        self.driver.close()
@@ -189,8 +187,7 @@ def scrape_keyword(keyword: str, browser: str = "firefox", until: Union[str, Non
                   since: Union[int, None] = None, since_id: Union[int, None] = None, max_id: Union[int, None] = None,
                   within_time: Union[str, None] = None,
                   proxy: Union[str, None] = None, tweets_count: int = 10, output_format: str = "json",
-                   filename: str = "", directory: str = os.getcwd(), headless: bool = True,
-                   browser_profile: Union[str, None] = None):
+                   filename: str = "", directory: str = os.getcwd(), headless: bool = True):
    """Scrape tweets using keywords.
    Args:
@@ -207,15 +204,14 @@ def scrape_keyword(keyword: str, browser: str = "firefox", until: Union[str, Non
        filename (str, optional): If output parameter is set to CSV, then it is necessary for the filename parameter to be passed. If not passed, the filename will be the same as the keyword passed. Defaults to "".
        directory (str, optional): If output parameter is set to CSV, then the directory parameter may also be passed. If not passed, the CSV file will be saved in the current working directory. Defaults to the current working directory.
        headless (bool, optional): Whether to run the browser in headless mode. Defaults to True.
-        browser_profile (str, optional): Path of Browser Profile where cookies might be located to scrap data in authenticated way. Defaults to None.
    Returns:
        str: tweets data in CSV or JSON
    """
    URL = Scraping_utilities.url_generator(keyword, since=since, until=until,
                                           since_id=since_id, max_id=max_id, within_time=within_time)
    keyword_bot = Keyword(keyword, browser=browser, url=URL,
-                          proxy=proxy, tweets_count=tweets_count, headless=headless, browser_profile=browser_profile)
+                          proxy=proxy, tweets_count=tweets_count, headless=headless)
    data = keyword_bot.scrap()
    if output_format.lower() == "json":
        if filename == '':
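Per the updated signature above, a call to scrape_keyword no longer takes browser_profile. A sketch with illustrative values (assumes the package exposes scrape_keyword at the top level):

```python
from twitter_scraper_selenium import scrape_keyword

# Illustrative keyword and settings; browser_profile has been removed.
data = scrape_keyword(
    keyword="steamdeck",   # illustrative search term
    browser="firefox",
    tweets_count=10,
    output_format="json",
    headless=True,
)
```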
10 changes: 4 additions & 6 deletions twitter_scraper_selenium/profile.py
@@ -22,7 +22,7 @@ class Profile:
    """this class needs to be instantiated in order to scrape posts of some
    twitter profile"""

-    def __init__(self, twitter_username, browser, proxy, tweets_count, headless, browser_profile):
+    def __init__(self, twitter_username, browser, proxy, tweets_count, headless):
        self.twitter_username = twitter_username
        self.URL = "https://twitter.com/{}".format(twitter_username.lower())
        self.__driver = ""
@@ -32,12 +32,11 @@ def __init__(self, twitter_username, browser, proxy, tweets_count, headless, bro
        self.posts_data = {}
        self.retry = 10
        self.headless = headless
-        self.browser_profile = browser_profile

    def __start_driver(self):
        """changes the class member __driver value to driver on call"""
        self.__driver = Initializer(
-            self.browser, self.headless, self.proxy, self.browser_profile).init()
+            self.browser, self.headless, self.proxy).init()

    def __close_driver(self):
        self.__driver.close()
@@ -175,7 +174,7 @@ def json_to_csv(filename, json_data, directory):

def scrape_profile(twitter_username: str, browser: str = "firefox", proxy: Union[str, None] = None,
                   tweets_count: int = 10, output_format: str = "json", filename: str = "", directory: str = os.getcwd(),
-                   headless: bool = True, browser_profile: Union[str, None] = None):
+                   headless: bool = True):
    """Scrape tweets of a twitter profile using its username.
    Args:
@@ -187,13 +186,12 @@ def scrape_profile(twitter_username: str, browser: str = "firefox", proxy: Union
        filename (str, optional): If output_format parameter is set to CSV, then it is necessary for the filename parameter to be passed. If not passed, the filename will be the same as the twitter_username passed. Defaults to "".
        directory (str, optional): If output_format parameter is set to CSV, then the directory parameter may also be passed. If not passed, the CSV file will be saved in the current working directory. Defaults to os.getcwd().
        headless (bool, optional): Whether to run the browser in headless mode. Defaults to True.
-        browser_profile (Union[str, None], optional): Path of Browser Profile where cookies might be located to scrap data in authenticated way. Defaults to None.
    Returns:
        str: tweets data in CSV or JSON
    """
    profile_bot = Profile(twitter_username, browser,
-                          proxy, tweets_count, headless, browser_profile)
+                          proxy, tweets_count, headless)
    data = profile_bot.scrap()
    if output_format.lower() == "json":
        if filename == '':
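Likewise for profiles: after this change a scrape_profile call would look like the following sketch (the username is a placeholder; assumes a top-level import):

```python
from twitter_scraper_selenium import scrape_profile

# Placeholder handle; the call no longer accepts browser_profile.
data = scrape_profile(
    twitter_username="some_user",  # placeholder
    browser="firefox",
    tweets_count=10,
    output_format="csv",
    filename="tweets",             # used when output_format is "csv"
)
```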
7 changes: 2 additions & 5 deletions twitter_scraper_selenium/topic.py
@@ -22,8 +22,7 @@ def scrape_topic(
    tweets_count: int = 10,
    output_format: str = "json",
    directory: str = None,
-    headless: bool = True,
-    browser_profile=None
+    headless: bool = True
):
    """
    Returns tweets data in CSV or JSON.
@@ -36,7 +35,6 @@
        tweets_count: Number of posts to scrape. Default is 10.
        output_format: The output format, whether JSON or CSV. Default is JSON.
        directory: If output parameter is set to CSV, then the directory parameter may be passed. If not passed, the CSV file will be saved in the current working directory.
-        browser_profile: Path of Browser Profile where cookies might be located to scrap data in authenticated way.
    """
    if directory is None:
@@ -47,8 +45,7 @@
        url=url,
        headless=headless,
        proxy=proxy,
-        tweets_count=tweets_count,
-        browser_profile=browser_profile
+        tweets_count=tweets_count
    )
    data = keyword_bot.scrap()
    if output_format.lower() == "json":
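For callers, the practical effect of this PR is simply that any browser_profile keyword argument must be dropped; with the new signatures, passing it raises a TypeError (standard Python behaviour for an unexpected keyword argument). A before/after sketch with placeholder values:

```python
from twitter_scraper_selenium import scrape_topic

# Before this change (old API, no longer valid):
# data = scrape_topic(url="<topic-url>", browser_profile="/path/to/profile")

# After this change — the same call without the removed argument:
data = scrape_topic(url="<topic-url>")  # placeholder URL
```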
