[Issue #87]: update-graph-data.py script | column to track updates
raj454raj committed Jul 26, 2017
1 parent 01fa1e1 commit 56299ed
Showing 4 changed files with 338 additions and 1 deletion.
1 change: 1 addition & 0 deletions .gitignore
@@ -4,6 +4,7 @@
/sessions/*
/uploads/*
/databases/*
/graph_data/*.pickle
.~lock.*.csv#
*.log
*.pyc
4 changes: 4 additions & 0 deletions graph_data/README.md
@@ -0,0 +1,4 @@
All the graph data will be stored in this directory. All files will be standard pickle files.

<user_id>.pickle -> Pickle file for User(<user_id>)
<user_id>_custom.pickle -> Pickle file for Custom user(<user_id>)
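
For example, reading a stored file back (a minimal sketch; the user id 42 is
hypothetical):

    import pickle

    graph_data = pickle.load(open("graph_data/42.pickle", "rb"))
    # graph_data maps "<site>_data" keys to lists of
    # {"title": <graph title>, "data": {<timestamp>: <contest dict>}} entries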
10 changes: 9 additions & 1 deletion models/db.py
@@ -150,6 +150,10 @@
readable=False,
writable=False),
Field("authentic", "boolean",
default=False,
readable=False,
writable=False),
Field("graph_data_retrieved", "boolean",
default=False,
readable=False,
writable=False)]
@@ -482,7 +486,11 @@ def register_callback(form):
default="0.0",
writable=False),
Field("duplicate_cu", "reference custom_friend",
default=None)]
default=None),
Field("graph_data_retrieved", "boolean",
default=False,
readable=False,
writable=False)]

custom_friend_fields += site_handles
custom_friend_fields += all_last_retrieved
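
The new graph_data_retrieved column is what the script's new_user mode (below)
filters on. For reference, a query mirroring it (a minimal sketch using
web2py's DAL):

    pending = db(db.auth_user.graph_data_retrieved != True).select()
    # Users whose contest-graph data has not been retrieved yet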
324 changes: 324 additions & 0 deletions private/scripts/update-graph-data.py
@@ -0,0 +1,324 @@
"""
Copyright (c) 2015-2017 Raj Patel (raj454raj@gmail.com), StopStalk
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""

# Usage
# -----
# First argument - Comma-separated lower-cased sites
# Second argument - {batch, specific_user, new_user}
#   * batch - Update graph data for users for whom
#             (user_id % <fourth_argument> == <third_argument>)
#   * specific_user - Third argument - normal/custom
#                     Fourth argument - <user_id>
#   * new_user - Retrieve contest data for all the users whose
#                graph_data_retrieved is not True
# Example:
# python web2py.py -S stopstalk -M -R applications/stopstalk/private/scripts/update-graph-data.py -A codechef,codeforces,hackerrank batch 5 100
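# Example invocations for the other modes (illustrative; the user ids 42 and
# 57 are hypothetical):
# python web2py.py -S stopstalk -M -R applications/stopstalk/private/scripts/update-graph-data.py -A codeforces specific_user normal 42
# python web2py.py -S stopstalk -M -R applications/stopstalk/private/scripts/update-graph-data.py -A codechef,hackerrank specific_user custom 57
# python web2py.py -S stopstalk -M -R applications/stopstalk/private/scripts/update-graph-data.py -A codechef,codeforces,hackerrank new_user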

import requests, re, os, sys, json, gevent, pickle
from gevent import monkey
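# Monkey-patch blocking network I/O so the per-site scrapers spawned in
# User.update_graph_data() run concurrently as gevent greenlets
# (thread=False leaves the threading module unpatched)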
gevent.monkey.patch_all(thread=False)

from bs4 import BeautifulSoup
from datetime import datetime, timedelta

# Constants to be used in case of request failures
SERVER_FAILURE = "SERVER_FAILURE"
NOT_FOUND = "NOT_FOUND"
OTHER_FAILURE = "OTHER_FAILURE"
REQUEST_FAILURES = (SERVER_FAILURE, NOT_FOUND, OTHER_FAILURE)
INVALID_HANDLES = set([(row.handle, row.site) for row in db(db.invalid_handle).select()])

# -----------------------------------------------------------------------------
def get_request(url, headers={}, timeout=current.TIMEOUT, params={}):
"""
Make a HTTP GET request to a url
@param url (String): URL to make get request to
@param headers (Dict): Headers to be passed along
with the request headers
@return: Response object or -1 or {}
"""

i = 0
while i < current.MAX_TRIES_ALLOWED:
try:
response = requests.get(url,
headers=headers,
params=params,
proxies=current.PROXY,
timeout=timeout)
except Exception as e:
print e, url
return SERVER_FAILURE

if response.status_code == 200:
return response
elif response.status_code == 404 or response.status_code == 400:
# User not found
# 400 for CodeForces users
return NOT_FOUND
i += 1

# Request unsuccessful even after MAX_TRIES_ALLOWED
return OTHER_FAILURE

class User:

def __init__(self, user_id, handles, user_record, custom=False):
self.handles = handles
if custom:
self.pickle_file_path = "./applications/stopstalk/graph_data/" + \
str(user_id) + "_custom.pickle"
else:
self.pickle_file_path = "./applications/stopstalk/graph_data/" + \
str(user_id) + ".pickle"
self.contest_mapping = {}
self.previous_graph_data = None
self.graph_data = dict([(x.lower() + "_data", []) for x in current.SITES])
self.user_record = user_record

if os.path.exists(self.pickle_file_path):
self.previous_graph_data = pickle.load(open(self.pickle_file_path, "rb"))
self.graph_data = dict(self.previous_graph_data)
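            # Map each graph title to its previously stored contest data so
            # that write_to_filesystem() can fall back to it if a later fetch
            # turns out to be partial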
for site_data in self.previous_graph_data:
for contest_data in self.previous_graph_data[site_data]:
self.contest_mapping[contest_data["title"]] = contest_data["data"]

def codechef_data(self):
handle = self.handles["codechef_handle"]
url = "https://www.codechef.com/users/" + handle
response = get_request(url)
if response in REQUEST_FAILURES:
print "Request ERROR: CodeChef " + url + " " + response
return

def zero_pad(string):
return "0" + string if len(string) == 1 else string

try:
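            # CodeChef embeds the rating history in the page as a JavaScript
            # array literal; the slice strips the "var all_rating = " prefix
            # (17 characters) and the trailing ";" before evaluating the list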
ratings = eval(re.search("var all_rating = .*?;", response.text).group()[17:-1])
except Exception as e:
print e
return

long_contest_data = {}
cookoff_contest_data = {}
ltime_contest_data = {}

for contest in ratings:
this_obj = None
if contest["code"].__contains__("COOK"):
# Cook off contest
this_obj = cookoff_contest_data
elif contest["code"].__contains__("LTIME"):
# Lunchtime contest
this_obj = ltime_contest_data
else:
# Long contest
this_obj = long_contest_data
            # The contest object's getdate/getmonth/getyear fields only encode
            # the end date, so parse end_date directly
time_stamp = str(datetime.strptime(contest["end_date"], "%Y-%m-%d %H:%M:%S"))
this_obj[time_stamp] = {"name": contest["name"],
"url": "https://www.codechef.com/" + contest["code"],
"rating": str(contest["rating"]),
"rank": contest["rank"]}

self.graph_data["codechef_data"] = [{"title": "CodeChef Long",
"data": long_contest_data},
{"title": "CodeChef Cook-off",
"data": cookoff_contest_data},
{"title": "CodeChef Lunchtime",
"data": ltime_contest_data}]

def codeforces_data(self):
handle = self.handles["codeforces_handle"]
website = "http://codeforces.com/"
url = "%sapi/contest.list" % website
response = get_request(url)
if response in REQUEST_FAILURES:
print "Request ERROR: Codeforces " + url + " " + response
return

contest_list = response.json()["result"]
all_contests = {}

for contest in contest_list:
all_contests[contest["id"]] = contest

url = "%scontests/with/%s" % (website, handle)

response = get_request(url)
if response in REQUEST_FAILURES:
print "Request ERROR: Codeforces " + url + " " + response
return

soup = BeautifulSoup(response.text, "lxml")
try:
tbody = soup.find("table", class_="tablesorter").find("tbody")
except AttributeError:
print "Cannot find CodeForces user " + handle
return

contest_data = {}
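        # Each row of the standings table holds: row number, contest link,
        # rank, solved count, rating change and new rating (in that td order)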
for tr in tbody.find_all("tr"):
all_tds = tr.find_all("td")
contest_id = int(all_tds[1].find("a")["href"].split("/")[-1])
rank = int(all_tds[2].find("a").contents[0].strip())
solved_count = int(all_tds[3].find("a").contents[0].strip())
rating_change = int(all_tds[4].find("span").contents[0].strip())
new_rating = int(all_tds[5].contents[0].strip())
contest = all_contests[contest_id]
time_stamp = str(datetime.fromtimestamp(contest["startTimeSeconds"]))
contest_data[time_stamp] = {"name": contest["name"],
"url": "%scontest/%d" % (website,
contest_id),
"rating": str(new_rating),
"ratingChange": rating_change,
"rank": rank,
"solvedCount": solved_count}

self.graph_data["codeforces_data"] = [{"title": "Codeforces",
"data": contest_data}]

def hackerrank_data(self):
handle = self.handles["hackerrank_handle"]
website = "https://www.hackerrank.com/"
url = "%srest/hackers/%s/rating_histories_elo" % (website, handle)
response = get_request(url)
if response in REQUEST_FAILURES:
print "Request ERROR: HackerRank " + url + " " + response
return
response = response.json()["models"]

hackerrank_graphs = []
for contest_class in response:
final_json = {}
for contest in contest_class["events"]:
time_stamp = contest["date"][:-5].split("T")
time_stamp = datetime.strptime(time_stamp[0] + " " + time_stamp[1],
"%Y-%m-%d %H:%M:%S")
# Convert UTC to IST
time_stamp += timedelta(hours=5, minutes=30)
time_stamp = str(time_stamp)
final_json[time_stamp] = {"name": contest["contest_name"],
"url": website + contest["contest_slug"],
"rating": str(contest["rating"]),
"rank": contest["rank"]}

graph_name = "HackerRank - %s" % contest_class["category"]
hackerrank_graphs.append({"title": graph_name,
"data": final_json})

self.graph_data["hackerrank_data"] = hackerrank_graphs

def spoj_data(self):
pass

def hackerearth_data(self):
pass

def uva_data(self):
pass

def write_to_filesystem(self):
if self.previous_graph_data == self.graph_data:
print "No updates in the graph data"
return

if self.previous_graph_data is not None:
# Pickle file already exists for the user
for site_data in self.graph_data:
for contest_data in self.graph_data[site_data]:
try:
previous_value = self.contest_mapping[contest_data["title"]]
except KeyError:
continue
if len(contest_data["data"]) < len(previous_value):
contest_data = previous_value
pickle.dump(self.graph_data, open(self.pickle_file_path, "wb"))
print "Writing to filesystem done"

def update_graph_data(self, sites):
threads = []
for site in sites:
            if self.handles.get(site + "_handle", "") != "":
                threads.append(gevent.spawn(getattr(self, site + "_data")))

gevent.joinall(threads)
self.write_to_filesystem()
self.user_record.update_record(graph_data_retrieved=True)

def get_user_objects(aquery=None, cquery=None, sites=None):
user_objects = []
users = []
if aquery:
users += db(aquery).select().records
if cquery:
users += db(cquery).select().records
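    # .records yields uncompacted rows keyed by table name (e.g.
    # {"auth_user": {...}}), which the membership test below relies on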

for user in users:
if "custom_friend" in user:
custom = True
user = user["custom_friend"]
else:
custom = False
user = user["auth_user"]

user_dict = {}
for site in sites:
site_handle = site + "_handle"
if user[site_handle] != "" and \
(user[site_handle], site) not in INVALID_HANDLES:
user_dict[site_handle] = user[site_handle]
print user
user_objects.append(User(user.id, user_dict, user, custom))

return user_objects

if __name__ == "__main__":
sites = sys.argv[1].strip().split(",")
atable = db.auth_user
cftable = db.custom_friend
user_objects = []

if sys.argv[2] == "batch":
index = int(sys.argv[3])
N = int(sys.argv[4])
user_objects = get_user_objects((atable.id % N == index),
(cftable.id % N == index),
sites)
elif sys.argv[2] == "specific_user":
if sys.argv[3] == "normal":
user_objects = get_user_objects(aquery=(atable.id == int(sys.argv[4])),
sites=sites)
else:
user_objects = get_user_objects(cquery=(cftable.id == int(sys.argv[4])),
sites=sites)
elif sys.argv[2] == "new_user":
user_objects = get_user_objects((atable.graph_data_retrieved != True),
(cftable.graph_data_retrieved != True),
sites=sites)
else:
print "Invalid Arguments"

for user_object in user_objects:
user_object.update_graph_data(sites)
