Merge pull request #69 from shaikhsajid1111/browser-profile-remove
Removed browser profile feature
shaikhsajid1111 committed Jun 4, 2023
2 parents c526887 + 63808d8 commit 9ce2641
Showing 5 changed files with 11 additions and 42 deletions.
12 changes: 0 additions & 12 deletions README.md
@@ -444,11 +444,6 @@ Output:
<td>Boolean</td>
<td>Whether to run the crawler headlessly. Default is <code>True</code></td>
</tr>
- <tr>
- <td>browser_profile</td>
- <td>String</td>
- <td>Path to the browser profile where cookies are stored and can be used for scraping data in an authenticated way.</td>
- </tr>
</tbody>
</table>

@@ -830,11 +825,6 @@ Output:
<td>Boolean</td>
<td>Whether to run the crawler headlessly. Default is <code>True</code></td>
</tr>
- <tr>
- <td>browser_profile</td>
- <td>String</td>
- <td>Path to the browser profile where cookies are stored and can be used for scraping data in an authenticated way.</td>
- </tr>
</tbody>
</table>
</div>
@@ -1049,8 +1039,6 @@ data = scrape_topic(filename="steamdeck", url='https://twitter.com/i/topics/1415
| tweets_count | int | Number of posts to scrape. Default 10. |
| output_format | str | The output format, whether JSON or CSV. Default json. |
| directory | str | Directory to save output file. Default current working directory. |
- | browser_profile | str | Path to the browser profile where cookies are stored and can be used for scraping data in an authenticated way. |
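With the browser_profile row gone, a call to this function relies only on the remaining parameters. A minimal sketch (the topic URL is a placeholder, and it assumes the package exposes scrape_topic at the top level):

```python
from twitter_scraper_selenium import scrape_topic

# Placeholder topic URL, shown only to illustrate the updated signature;
# browser_profile is no longer an accepted argument.
data = scrape_topic(
    url="https://twitter.com/i/topics/<topic-id>",  # placeholder
    filename="steamdeck",
    tweets_count=10,        # number of posts to scrape, default 10
    output_format="json",   # "json" or "csv"
)
```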

<br>
<hr>
<div id="to-scrape-user-tweets-with-api">
12 changes: 1 addition & 11 deletions twitter_scraper_selenium/driver_initialization.py
@@ -23,34 +23,24 @@


class Initializer:
-    def __init__(self, browser_name: str, headless: bool, proxy: Union[str, None] = None, profile: Union[str, None] = None):
+    def __init__(self, browser_name: str, headless: bool, proxy: Union[str, None] = None):
        """Initialize Browser
        Args:
            browser_name (str): Browser Name
            headless (bool): Whether to run the browser in headless mode
            proxy (Union[str, None], optional): Optional parameter, if the user wants to use a proxy for scraping. If the proxy is an authenticated proxy, the format is username:password@host:port. Defaults to None.
-            profile (Union[str, None], optional): Path of Browser Profile where cookies might be located to scrap data in authenticated way. Defaults to None.
        """
        self.browser_name = browser_name
        self.proxy = proxy
        self.headless = headless
-        self.profile = profile

    def set_properties(self, browser_option):
        """adds capabilities to the driver"""
        header = Headers().generate()['User-Agent']
        if self.headless:
            # runs browser in headless mode
            browser_option.add_argument("--headless")
-        if self.profile and self.browser_name.lower() == "chrome":
-            browser_option.add_argument(
-                "user-data-dir={}".format(self.profile))
-        if self.profile and self.browser_name.lower() == "firefox":
-            logger.setLevel(logging.INFO)
-            logger.info("Loading Profile from {}".format(self.profile))
-            browser_option.add_argument("-profile")
-            browser_option.add_argument(self.profile)
        browser_option.add_argument('--no-sandbox')
        browser_option.add_argument("--disable-dev-shm-usage")
        browser_option.add_argument('--ignore-certificate-errors')
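With this block gone, the library no longer loads a browser profile for authenticated scraping. Users who relied on profile cookies would have to manage the driver themselves; a minimal sketch of the same pattern in plain Selenium (assuming Selenium 4 with Chrome installed; the profile path is a placeholder):

```python
from selenium import webdriver

# Recreate the removed behaviour outside the library: point Chrome at an
# existing user-data directory so its stored cookies are available.
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("user-data-dir=/path/to/chrome/profile")  # placeholder path
driver = webdriver.Chrome(options=options)
driver.get("https://twitter.com")
```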
12 changes: 4 additions & 8 deletions twitter_scraper_selenium/keyword.py
@@ -23,7 +23,7 @@ class Keyword:
    """This class needs to be instantiated in order to find something
    on twitter related to keywords"""

-    def __init__(self, keyword: str, browser: str, proxy: Union[str, None], tweets_count: int, url: Union[str, None], headless: bool, browser_profile: Union[str, None]):
+    def __init__(self, keyword: str, browser: str, proxy: Union[str, None], tweets_count: int, url: Union[str, None], headless: bool):
        """Scrape Tweet using keyword.
        Args:
@@ -33,7 +33,6 @@ def __init__(self, keyword: str, browser: str, proxy: Union[str, None], tweets_c
            tweets_count (int): Number of tweets to scrape
            url (Union[str, None]): URL of the webpage.
            headless (bool): Whether to run the browser in headless mode.
-            browser_profile (Union[str, None]): Path of Browser Profile where cookies might be located to scrap data in authenticated way.
        """
        self.keyword = keyword
        self.URL = url
@@ -44,12 +43,11 @@ def __init__(self, keyword: str, browser: str, proxy: Union[str, None], tweets_c
        self.posts_data = {}
        self.retry = 10
        self.headless = headless
-        self.browser_profile = browser_profile

    def start_driver(self):
        """changes the class member driver value to driver on call"""
        self.driver = Initializer(
-            self.browser, self.headless, self.proxy, self.browser_profile).init()
+            self.browser, self.headless, self.proxy).init()

    def close_driver(self):
        self.driver.close()
@@ -189,8 +187,7 @@ def scrape_keyword(keyword: str, browser: str = "firefox", until: Union[str, Non
                   since: Union[int, None] = None, since_id: Union[int, None] = None, max_id: Union[int, None] = None,
                   within_time: Union[str, None] = None,
                   proxy: Union[str, None] = None, tweets_count: int = 10, output_format: str = "json",
-                   filename: str = "", directory: str = os.getcwd(), headless: bool = True,
-                   browser_profile: Union[str, None] = None):
+                   filename: str = "", directory: str = os.getcwd(), headless: bool = True):
    """Scrape tweets using keywords.
    Args:
@@ -207,15 +204,14 @@ def scrape_keyword(keyword: str, browser: str = "firefox", until: Union[str, Non
        filename (str, optional): If output parameter is set to CSV, then it is necessary for the filename parameter to be passed. If not passed, the filename will be the same as the keyword passed. Defaults to "".
        directory (str, optional): If output parameter is set to CSV, then the directory parameter may also be passed. If not passed, the CSV file will be saved in the current working directory. Defaults to the current working directory.
        headless (bool, optional): Whether to run the browser in headless mode. Defaults to True.
-        browser_profile (str, optional): Path of Browser Profile where cookies might be located to scrap data in authenticated way. Defaults to None.
    Returns:
        str: tweets data in CSV or JSON
    """
    URL = Scraping_utilities.url_generator(keyword, since=since, until=until,
                                           since_id=since_id, max_id=max_id, within_time=within_time)
    keyword_bot = Keyword(keyword, browser=browser, url=URL,
-                          proxy=proxy, tweets_count=tweets_count, headless=headless, browser_profile=browser_profile)
+                          proxy=proxy, tweets_count=tweets_count, headless=headless)
    data = keyword_bot.scrap()
    if output_format.lower() == "json":
        if filename == '':
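Per the updated signature above, a call to scrape_keyword no longer takes browser_profile. A sketch with illustrative values (assumes the package exposes scrape_keyword at the top level):

```python
from twitter_scraper_selenium import scrape_keyword

# Illustrative keyword and settings; browser_profile has been removed.
data = scrape_keyword(
    keyword="steamdeck",   # illustrative search term
    browser="firefox",
    tweets_count=10,
    output_format="json",
    headless=True,
)
```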
10 changes: 4 additions & 6 deletions twitter_scraper_selenium/profile.py
@@ -22,7 +22,7 @@ class Profile:
    """this class needs to be instantiated in order to scrape posts of some
    twitter profile"""

-    def __init__(self, twitter_username, browser, proxy, tweets_count, headless, browser_profile):
+    def __init__(self, twitter_username, browser, proxy, tweets_count, headless):
        self.twitter_username = twitter_username
        self.URL = "https://twitter.com/{}".format(twitter_username.lower())
        self.__driver = ""
@@ -32,12 +32,11 @@ def __init__(self, twitter_username, browser, proxy, tweets_count, headless, bro
        self.posts_data = {}
        self.retry = 10
        self.headless = headless
-        self.browser_profile = browser_profile

    def __start_driver(self):
        """changes the class member __driver value to driver on call"""
        self.__driver = Initializer(
-            self.browser, self.headless, self.proxy, self.browser_profile).init()
+            self.browser, self.headless, self.proxy).init()

    def __close_driver(self):
        self.__driver.close()
@@ -175,7 +174,7 @@ def json_to_csv(filename, json_data, directory):

def scrape_profile(twitter_username: str, browser: str = "firefox", proxy: Union[str, None] = None,
                   tweets_count: int = 10, output_format: str = "json", filename: str = "", directory: str = os.getcwd(),
-                   headless: bool = True, browser_profile: Union[str, None] = None):
+                   headless: bool = True):
    """Scrape tweets of a twitter profile using its username.
    Args:
@@ -187,13 +186,12 @@ def scrape_profile(twitter_username: str, browser: str = "firefox", proxy: Union
        filename (str, optional): If output_format parameter is set to CSV, then it is necessary for the filename parameter to be passed. If not passed, the filename will be the same as the twitter_username passed. Defaults to "".
        directory (str, optional): If output_format parameter is set to CSV, then the directory parameter may also be passed. If not passed, the CSV file will be saved in the current working directory. Defaults to os.getcwd().
        headless (bool, optional): Whether to run the browser in headless mode. Defaults to True.
-        browser_profile (Union[str, None], optional): Path of Browser Profile where cookies might be located to scrap data in authenticated way. Defaults to None.
    Returns:
        str: tweets data in CSV or JSON
    """
    profile_bot = Profile(twitter_username, browser,
-                          proxy, tweets_count, headless, browser_profile)
+                          proxy, tweets_count, headless)
    data = profile_bot.scrap()
    if output_format.lower() == "json":
        if filename == '':
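Likewise for profiles: after this change a scrape_profile call would look like the following sketch (the username is a placeholder; assumes a top-level import):

```python
from twitter_scraper_selenium import scrape_profile

# Placeholder handle; the call no longer accepts browser_profile.
data = scrape_profile(
    twitter_username="some_user",  # placeholder
    browser="firefox",
    tweets_count=10,
    output_format="csv",
    filename="tweets",             # used when output_format is "csv"
)
```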
7 changes: 2 additions & 5 deletions twitter_scraper_selenium/topic.py
@@ -22,8 +22,7 @@ def scrape_topic(
    tweets_count: int = 10,
    output_format: str = "json",
    directory: str = None,
-    headless: bool = True,
-    browser_profile=None
+    headless: bool = True
):
    """
    Returns tweets data in CSV or JSON.
@@ -36,7 +35,6 @@
        tweets_count: Number of posts to scrape. Default is 10.
        output_format: The output format, whether JSON or CSV. Default is JSON.
        directory: If output parameter is set to CSV, then the directory parameter may be passed. If not passed, the CSV file will be saved in the current working directory.
-        browser_profile: Path of Browser Profile where cookies might be located to scrap data in authenticated way.
    """
    if directory is None:
@@ -47,8 +45,7 @@
        url=url,
        headless=headless,
        proxy=proxy,
-        tweets_count=tweets_count,
-        browser_profile=browser_profile
+        tweets_count=tweets_count
    )
    data = keyword_bot.scrap()
    if output_format.lower() == "json":
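For callers, the practical effect of this PR is simply that any browser_profile keyword argument must be dropped; with the new signatures, passing it raises a TypeError (standard Python behaviour for an unexpected keyword argument). A before/after sketch with placeholder values:

```python
from twitter_scraper_selenium import scrape_topic

# Before this change (old API, no longer valid):
# data = scrape_topic(url="<topic-url>", browser_profile="/path/to/profile")

# After this change — the same call without the removed argument:
data = scrape_topic(url="<topic-url>")  # placeholder URL
```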
