In [None]:
import pandas as pd
import ast
import math
from collections import Counter
import requests

In [None]:
def all_same(items):
    """Return True when every element of ``items`` is the same value.

    NaN never compares equal to itself, so a sequence consisting only of
    float NaNs is explicitly treated as uniform.

    Args:
        items: A sequence of values (e.g. the result of ``Series.tolist()``).

    Returns:
        bool: True if ``items`` is empty, is all-NaN, or every element
        equals the first one; False otherwise.
    """
    if len(items) == 0:
        # Vacuously uniform; also avoids an IndexError on items[0].
        return True

    def _is_nan(value):
        # Check the type first: math.isnan raises TypeError on str and other
        # non-numeric values, which occur when NaN and text answers are mixed.
        return isinstance(value, float) and math.isnan(value)

    if all(_is_nan(x) for x in items):
        return True

    first = items[0]
    return all(x == first for x in items)

In [None]:
# Load the source spreadsheet; the following cells group its rows by 'Input.name'.
turk_df = pd.read_excel('NESCent_publications/NESCent_CV.xlsx')

In [None]:
def _collapse_group_values(values):
    """Reduce one grouped column to a scalar when all entries agree, else keep the full list."""
    as_list = values.tolist()
    if all_same(as_list):
        return as_list[0]
    return as_list


# One row per 'Input.name'; each column is either the shared value or the list of differing values.
turk_grouped_df = turk_df.groupby('Input.name').agg(_collapse_group_values)

In [None]:
# URLs that are dropped from the answers (compared by exact string match).
stop_urls = [
    'http://notfound.com',
    'http://none',
    'http://NA',
    'http://na.com',
    'http://NA@NA.com',
    'http://none.com',
    'http://www.NA@NA.com',
    'https://NA',
    'https://none.com',
    'https://notfound.com',
    'http://notexactlyfound.com',
    'http://notexplicityfound.com',
    'http://www.thereisnocv.com',
    'file:///C:/Users/user/Downloads/Documents/Ove_Nilsson.pdf',
    'httpys://ocs.yale.edu/sites/default/files/files/CV%20to%20ResumeWorkshopfinal.pdf',
]

In [None]:
def _split_urls(answer):
    """Split one pipe-delimited answer into its non-empty, whitespace-trimmed URLs.

    Non-string answers (e.g. a float NaN for a missing answer) yield an empty
    list instead of raising on indexing.
    """
    if not isinstance(answer, str):
        return []
    return [part.strip() for part in answer.split('|') if part.strip()]


# Set membership is an exact-match test, same as the list, but O(1) per lookup.
stop_url_set = set(stop_urls)

# One list of usable URLs per grouped row, in row order.
all_clean_urls = []

for _, row in turk_grouped_df.iterrows():
    answers = row['Answer.web_url']

    # The column holds a single value when all answers agreed and a list
    # otherwise; normalise to a list so both cases share one code path.
    if not isinstance(answers, list):
        answers = [answers]

    # Duplicates are kept on purpose: later cells count how often each URL occurs.
    clean_urls = [
        url
        for answer in answers
        for url in _split_urls(answer)
        if url not in stop_url_set
    ]

    all_clean_urls.append(clean_urls)

In [None]:
# Tally how often each URL occurs within every row's cleaned URL list.
url_counters = list(map(Counter, all_clean_urls))

In [None]:
# top_5[r] collects, for every row, the URL ranked r-th by frequency
# (or -1 when the row has fewer than r+1 distinct URLs).
top_5 = [[], [], [], [], []]
slots = len(top_5)

for counter in url_counters:
    # Cap at the number of slots: an unbounded most_common() would index past
    # top_5 for any row with more than five distinct URLs.
    ranked = [url for url, _count in counter.most_common(slots)]

    # Pad so every slot list grows by exactly one entry per row.
    ranked.extend([-1] * (slots - len(ranked)))

    for bucket, url in zip(top_5, ranked):
        bucket.append(url)
        
    

In [None]:
# Attach each rank's URL list as a column named top_1_url ... top_5_url.
for rank, url_group in enumerate(top_5, start=1):
    column_name = 'top_%d_url' % rank
    turk_grouped_df[column_name] = url_group

In [None]:
# Move the 'Input.name' group key from the index back into a regular column.
# Not idempotent: re-running this cell adds an extra 'index' column.
turk_grouped_df = turk_grouped_df.reset_index()

In [None]:
# Drop columns that later cells do not read.
# Not idempotent: re-running raises KeyError once the columns are gone.
turk_grouped_df = turk_grouped_df.drop(['Title', 'Description', 'Input.searchlink', 'Answer.web_url'], axis=1)

In [None]:
# Suppress urllib3's InsecureRequestWarning.
# NOTE(review): presumably needed because the scraper requests pages without TLS verification — confirm.
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

In [None]:
from text_scraper import get_text_from_url

In [None]:
from tqdm import tqdm_notebook as tqdm

# raw[j] collects, for every row, the page text of the (j+1)-th ranked URL,
# or -1 when that slot has no URL.
raw = [[], [], [], [], []]

# Size the progress bar from the data instead of a hard-coded row count.
with tqdm(total=len(turk_grouped_df)) as pbar:
    for _, row in turk_grouped_df.iterrows():
        for rank, bucket in enumerate(raw, start=1):
            url = row['top_%d_url' % rank]

            if url == -1:
                # Placeholder written when the row had fewer distinct URLs.
                bucket.append(-1)
            else:
                # Flatten newlines so each page is stored as a single line of text.
                bucket.append(get_text_from_url(url, False).replace('\n', " "))

        # Tick after the row's URLs are processed so the bar shows completed rows.
        pbar.update(1)
         

In [None]:
# Attach each rank's scraped text as a column named top_1_raw ... top_5_raw.
for rank, raw_group in enumerate(raw, start=1):
    column_name = 'top_%d_raw' % rank
    turk_grouped_df[column_name] = raw_group

In [None]:
# Save the grouped frame to CSV.
# NOTE(review): a later cell writes "turk_grouped.csv" (different spelling) — confirm both files are intended.
turk_grouped_df.to_csv('turked_grouped.csv')

In [None]:
import re

In [None]:
# Matches tokens that START with "(dddd)" or "dddd." — e.g. "(2015)", "(2013).", "2013."
# Raw string: in a plain literal "\(" and "\d" are invalid escape sequences.
_CITATION_YEAR_RE = re.compile(r'\(\d{4}\)|\d{4}\.')


def isCitationYear(test_string):
    """Return True when ``test_string`` begins with a citation-style year.

    Two forms are recognised at the start of the string: a four-digit number
    in parentheses ("(2015)") or a four-digit number followed by a period
    ("2013."). Anything after that prefix is ignored.

    Args:
        test_string: The token to test.

    Returns:
        bool: True if the token starts with one of the two forms.
    """
    # match() anchors at the start of the string, like the original "^" patterns.
    return _CITATION_YEAR_RE.match(test_string) is not None

In [None]:
# Spot check: trailing punctuation after a parenthesised year does not prevent the prefix match.
isCitationYear('(2015):')

In [None]:
# Maximum number of words scanned after a surname match while looking for a
# citation year. Measured from the match, not from the start of the page.
tolerance = 30


def _find_citation_year(words, start, max_words):
    """Return the index of the first citation-year token in
    ``words[start:start + max_words]``, or None if there is none."""
    for idx in range(start, min(len(words), start + max_words)):
        if isCitationYear(words[idx]):
            return idx
    return None


def _read_title(words, start):
    """Join the words from ``start`` through the first token containing '.' or '?'.

    Returns None when the text ends before such a token, i.e. the candidate
    title is incomplete.
    """
    for end in range(start, len(words)):
        if '.' in words[end] or '?' in words[end]:
            # Each word is prefixed with a space, so titles start with one.
            return ' ' + ' '.join(words[start:end + 1])
    return None


# One set of candidate titles per row, in row order.
all_publications = []

for _, row in turk_grouped_df.iterrows():
    publications = set()

    # Use the final name token as the surname; this also copes with one-word
    # names and names that include a middle name.
    name_parts = str(row['Input.name']).split()
    last_name = name_parts[-1] if name_parts else ''

    for rank in range(1, 6):
        # Local name chosen so the global `raw` list of scraped text is not overwritten.
        raw_text = row['top_%d_raw' % rank]

        # -1 marks a slot without a URL. An empty surname would match every
        # word ('' in word is always True), so skip in that case too.
        if not last_name or not isinstance(raw_text, str):
            continue

        words = raw_text.split()

        for k, word in enumerate(words):
            if last_name not in word:
                continue

            # Pattern looked for: "<surname> ... <year> <title words ending in . or ?>"
            year_idx = _find_citation_year(words, k + 1, tolerance)
            if year_idx is None:
                continue

            title = _read_title(words, year_idx + 1)
            if title is not None:
                publications.add(title)

    all_publications.append(publications)

In [None]:
# Flatten each row's set of titles into one comma-separated string.
# An empty set already joins to '', so no special case is needed. The old
# `x == 'set()'` test compared a set with a string and was never True.
# Titles are sorted so the output is the same on every run.
all_publications_clean = [
    ', '.join(sorted(str(title) for title in titles))
    for titles in all_publications
]

In [None]:
# Display the frame for inspection.
turk_grouped_df

In [None]:
# Attach the extracted titles as a new column. The list is assigned by position,
# so it must be in the same row order as turk_grouped_df.
turk_grouped_df['unconfirmed_publications'] = all_publications_clean

In [None]:
# Export the frame as UTF-8 encoded CSV.
turk_grouped_df.to_csv('turk_grouped_publications.csv', encoding='utf-8')

In [None]:
# Export the same frame as an Excel workbook.
turk_grouped_df.to_excel('turk_grouped_publications.xlsx')

In [None]:
# Load a workbook that carries a "Google Scholar Middle Initial" column (read in the next cell).
# NOTE(review): presumably a hand-annotated copy of the workbook exported above — confirm its provenance.
middle_initials_df = pd.read_excel('turk_grouped_publications_with_middle_initial.xlsx')

In [None]:
# Copy the column across. pandas aligns Series assignment on index labels,
# so both frames must share the same index for values to land on the right rows.
turk_grouped_df["Google Scholar Middle Initial"] = middle_initials_df["Google Scholar Middle Initial"]

In [None]:
# Keep only the rows that have a value in "Google Scholar Middle Initial" and export them.
turked_grouped_with_middle_initial = turk_grouped_df[turk_grouped_df["Google Scholar Middle Initial"].notnull() ]
turked_grouped_with_middle_initial.to_csv('turk_grouped_with_middle_initial_only.csv')

In [None]:
# Rows with no value in "Google Scholar Middle Initial".
turked_grouped_wo_middle_initial = turk_grouped_df[turk_grouped_df["Google Scholar Middle Initial"].isnull() ]

In [None]:
# Display the rows that lack a middle-initial value.
turked_grouped_wo_middle_initial

In [None]:
# Write the full frame, including the middle-initial column, to CSV.
turk_grouped_df.to_csv("turk_grouped.csv")

In [None]:
import pandas as pd
# Reload the middle-initial subset from disk, replacing the in-memory frame.
# NOTE(review): this reads from "Stage_2/", while the export cell wrote the file
# to the working directory — confirm the file was moved there.
turk_grouped_df = pd.read_csv("Stage_2/turk_grouped_with_middle_initial_only.csv")

In [None]:
# Imported in this cell because it runs before the notebook's later
# `import scholarly` cell; without it a fresh kernel raises NameError here.
import scholarly

# Trial run on the first row only: start a publication search using that row's
# "Google Scholar Middle Initial" value as the query string.
for index, row in turk_grouped_df[0:1].iterrows():
    test = row['Google Scholar Middle Initial']
    search_query = scholarly.search_pubs_query(test)

In [None]:
import requests
import os

In [None]:
# Set the proxy variables in THIS process's environment. `os.system('export ...')`
# only changes a short-lived subshell, so requests never saw the proxy settings.
# NOTE(review): the https proxy URL uses the "https://" scheme for a localhost
# proxy — confirm the proxy actually speaks TLS on port 8123.
os.environ['http_proxy'] = 'http://localhost:8123'
os.environ['https_proxy'] = 'https://localhost:8123'

# True when the Tor check page reports that the request arrived via Tor.
'Congratulations' in requests.get('http://check.torproject.org/').text

In [None]:
# Take the first item from the search iterator.
result1 = next(search_query)

In [None]:
# Inspect the result object's attributes.
vars(result1)

In [None]:
# NOTE(review): fill() presumably fetches the complete record for this result — confirm against the scholarly docs.
result1_fill = result1.fill()

In [None]:
# Inspect the attributes of the object returned by fill().
vars(result1_fill)

In [None]:
import scholarly
import json

In [None]:
# Start a publication search for a fixed query string.
# Note: this rebinds `test`, which an earlier cell used for a query string.
test = scholarly.search_pubs_query("AG Grant")

In [None]:
# Take the first item from the search iterator.
a = next(test)

In [None]:
# Serialise the result's attributes to a JSON string; json.dumps raises TypeError
# if any attribute value is not JSON-serialisable.
json.dumps(vars(a))

In [None]:
import six

In [None]:
#!/usr/bin/env python
import random
import requests

In [None]:
import os

# Proxy credentials are secrets, so they are read from the environment rather
# than stored in the notebook. The fallbacks are placeholder names only and
# will not authenticate; set LUMINATI_USERNAME / LUMINATI_PASSWORD before use.
username = os.environ.get('LUMINATI_USERNAME', 'lum-customer-CUSTOMER-zone-YOURZONE')
password = os.environ.get('LUMINATI_PASSWORD', 'YOURPASS')
port = 22225

# A random session id is embedded in the proxy user name on each run of this cell.
session_id = random.random()

super_proxy_url = ('http://%s-country-us-session-%s:%s@zproxy.lum-superproxy.io:%d' %
        (username, session_id, password, port))

# Mapping in the shape expected by the `proxies=` argument of requests.
proxyDict = {
    "http": super_proxy_url,
    "https": super_proxy_url,
}

In [None]:
# Smoke test: issue one GET routed through the proxies in proxyDict.
url = "https://youtube.com"
requests.get(url, proxies=proxyDict)

In [None]:
#!/usr/bin/env python
# Vendor connectivity sample: fetch the caller's IP through the proxy.
import os
import sys

print('If you get error "ImportError: No module named \'six\'" install six:\n'+
    '$ sudo pip install six')
print('To enable your free eval account and get CUSTOMER, YOURZONE and ' +
    'YOURPASS, please contact sales@luminati.io')

# Credentials are read from the environment instead of being embedded in the
# URL. The fallbacks are placeholder names only and will not authenticate.
proxy_url = 'http://%s:%s@zproxy.lum-superproxy.io:22225' % (
    os.environ.get('LUMINATI_USERNAME', 'lum-customer-CUSTOMER-zone-YOURZONE'),
    os.environ.get('LUMINATI_PASSWORD', 'YOURPASS'))

# Pick the urllib flavour for the running interpreter; the request logic is shared.
if sys.version_info[0]==2:
    import six
    from six.moves.urllib import request
else:
    import urllib.request as request

opener = request.build_opener(request.ProxyHandler({'http': proxy_url}))
print(opener.open('http://lumtest.com/myip.json').read())

In [None]:
#!/usr/bin/env python
# Vendor connectivity sample: fetch the caller's IP through the proxy.
import os
import sys

print('If you get error "ImportError: No module named \'six\'" install six:\n'+
    '$ sudo pip install six')
print('To enable your free eval account and get CUSTOMER, YOURZONE and ' +
    'YOURPASS, please contact sales@luminati.io')

# Credentials are read from the environment instead of being embedded in the
# URL. The fallbacks are placeholder names only and will not authenticate.
proxy_url = 'http://%s:%s@zproxy.lum-superproxy.io:22225' % (
    os.environ.get('LUMINATI_USERNAME', 'lum-customer-CUSTOMER-zone-YOURZONE'),
    os.environ.get('LUMINATI_PASSWORD', 'YOURPASS'))

# Pick the urllib flavour for the running interpreter; the request logic is shared.
if sys.version_info[0]==2:
    import six
    from six.moves.urllib import request
else:
    import urllib.request as request

opener = request.build_opener(request.ProxyHandler({'http': proxy_url}))
print(opener.open('http://lumtest.com/myip.json').read())

In [None]:
from lxml.html import fromstring
from itertools import cycle
def get_proxies(limit=10, timeout=10):
    """Scrape free-proxy-list.net and return proxies as "ip:port" strings.

    Only table rows whose 7th cell contains "yes" are kept
    (NOTE(review): presumably the site's HTTPS-support column — confirm).

    Args:
        limit: Number of leading table rows to inspect (default 10).
        timeout: Seconds to wait for the listing page before giving up.

    Returns:
        set: "ip:port" strings; empty when no inspected row qualifies.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched
            or the server answers with an HTTP error status.
    """
    url = 'https://free-proxy-list.net/'
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error page instead of parsing it into an empty result.
    response.raise_for_status()
    parser = fromstring(response.text)
    proxies = set()
    for table_row in parser.xpath('//tbody/tr')[:limit]:
        if not table_row.xpath('.//td[7][contains(text(),"yes")]'):
            continue
        # Grab the IP and its port; skip rows where either cell has no text.
        ip = table_row.xpath('.//td[1]/text()')
        port = table_row.xpath('.//td[2]/text()')
        if ip and port:
            proxies.add(":".join([ip[0], port[0]]))
    return proxies


# Fetch the current proxy set and wrap it in an endless round-robin iterator.
# If the set is empty, the first next(proxy_pool) raises StopIteration.
proxies = get_proxies()
proxy_pool = cycle(proxies)

In [None]:
# Show the next proxy in the rotation.
next(proxy_pool)

In [None]:
from lxml.html import fromstring
import requests
from itertools import cycle
import traceback
 
def get_proxies(limit=10, timeout=10):
    """Scrape free-proxy-list.net and return proxies as "ip:port" strings.

    Only table rows whose 7th cell contains "yes" are kept
    (NOTE(review): presumably the site's HTTPS-support column — confirm).

    Args:
        limit: Number of leading table rows to inspect (default 10).
        timeout: Seconds to wait for the listing page before giving up.

    Returns:
        set: "ip:port" strings; empty when no inspected row qualifies.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched
            or the server answers with an HTTP error status.
    """
    url = 'https://free-proxy-list.net/'
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error page instead of parsing it into an empty result.
    response.raise_for_status()
    parser = fromstring(response.text)
    proxies = set()
    for table_row in parser.xpath('//tbody/tr')[:limit]:
        if not table_row.xpath('.//td[7][contains(text(),"yes")]'):
            continue
        # Grab the IP and its port; skip rows where either cell has no text.
        ip = table_row.xpath('.//td[1]/text()')
        port = table_row.xpath('.//td[2]/text()')
        if ip and port:
            proxies.add(":".join([ip[0], port[0]]))
    return proxies
 
 
# If you are copy-pasting proxy IPs, put them in the list below.
#proxies = ['121.129.127.209:80', '124.41.215.238:45169', '185.93.3.123:8080', '194.182.64.67:3128', '106.0.38.174:8080', '163.172.175.210:3128', '13.92.196.150:8080']
proxies = get_proxies()
proxy_pool = cycle(proxies)

url = 'https://httpbin.org/ip'
if not proxies:
    # cycle() over an empty set makes next() raise StopIteration, so bail out early.
    print("No proxies available; skipping test requests")
else:
    for i in range(1,11):
        # Get a proxy from the pool
        proxy = next(proxy_pool)
        print("Request #%d"%i)
        try:
            # Timeout so a dead proxy cannot hang the cell indefinitely.
            response = requests.get(url,proxies={"http": proxy, "https": proxy}, timeout=10)
            print(response.json())
        except (requests.exceptions.RequestException, ValueError):
            # Free proxies often fail to connect (RequestException) or return a
            # non-JSON body (ValueError). A real client would retry the request
            # with another proxy; here the attempt is simply skipped.
            print("Skipping. Connnection error")