### Finding the most disussed person under each sport category in New York Times between 2010 and 2016.

In [191]:
import requests, configparser, os, sys, json, glob
from nltk.tag import pos_tag



In [228]:
# Collecting data

config = configparser.ConfigParser()
config.readfp(open(r'config.txt'))

i = 100
try:
    while i < 120:
        print('page '+str(i))
        uri_path_list = []
        sports_list = []
        category_list = []

        for link in config['uri']:
            uri_path_list.append(config.get('uri',link))
            break;

        directory = config.get('directory_path','directory')
        sub_directory = config.get('directory_path','sub_directory')    

        for key in config['api_key']:
            uri_path_list.append('?api_key='+config.get('api_key',key))
            break;                

        for param in config['parameters']:
            #print(param)
            uri_path_list.append('&'+param+'='+config.get('parameters',param))
        uri_path_list.append('&page=' + str(i) )

        uri_path = ''.join(uri_path_list);
        resp = requests.get(uri_path)
        request_data = (resp).json();  
        if 'errors' in request_data.keys():
            raise Exception(request_data['errors'])
        elif 'message' in request_data.keys():
            raise Exception(request_data['message'])

        #print(directory)
        if not os.path.exists(directory):
            os.makedirs(directory)

        if not os.path.exists(directory+'\\'+sub_directory):
            os.makedirs(directory+'\\'+sub_directory)

        #print('Starting to store json files for sports.')    

        try:    
            if( 'response' in request_data.keys() ):
                response = request_data['response']
                if( 'docs' in response.keys() ):
                    docs = response['docs']        
                    for doc in docs:
                        if( 'subsection_name' in doc.keys() ):
                            sport = doc.get('subsection_name')
                            if(sport != None):
                                sport_path = directory+'\\'+sub_directory+'\\'+sport;
                                if not os.path.exists(sport_path):
                                    sports_list.append(sport)
                                    os.makedirs(sport_path)

                                if( 'document_type' in doc.keys() ):
                                    category = doc.get('document_type')
                                    if(category != None):
                                        category_path = sport_path + '\\' + category;
                                        if not os.path.exists(category_path):
                                            category_list.append(category)
                                            os.makedirs(category_path)

                                        # put doc data in to correspoding json file inside sport
                                        file_path = category_path+'\\'+doc.get('_id')+".json"
                                        if not os.path.exists(file_path):
                                            print(file_path+' writing')
                                            with open(file_path, "w") as outfile:
                                                json.dump(doc, outfile, indent=4)
                                        else: 
                                            print(file_path+' exists')
        except ValueError:
            print("error :", sys.exc_info()[0])
        i = i + 1
    print('Storing json files for sports completed.')
except Exception as error:
    print('Caught this error: ' + repr(error))



page 100
nytimes_data\Sports\Pro Football\article\585721ba95d0e03926076af2.json exists
nytimes_data\Sports\Soccer\article\585721ba95d0e03926076af1.json exists
nytimes_data\Sports\College Basketball\article\5857206095d0e03926076aed.json exists
nytimes_data\Sports\College Basketball\article\58571ef695d0e03926076aeb.json exists
nytimes_data\Sports\Pro Basketball\article\58571d8c95d0e03926076aea.json exists
nytimes_data\Sports\Baseball\article\58571d6e95d0e03926076ae7.json exists
nytimes_data\Sports\College Basketball\article\58571cd695d0e03926076ae6.json exists
nytimes_data\Sports\College Basketball\article\58571b7095d0e03926076ae3.json exists
nytimes_data\Sports\Pro Basketball\article\58571ac495d0e03926076ae2.json exists
nytimes_data\Sports\Pro Football\article\5857168195d0e03926076ada.json exists
page 101
nytimes_data\Sports\Hockey\article\5857151995d0e03926076ad8.json exists
nytimes_data\Sports\College Basketball\article\58570b4595d0e03926076ac9.json exists
nytimes_data\Sports\College 

### Analyzing the data...

In [230]:
#import operator

search_path = directory+'/'+sub_directory+'/'
sports_results = dict()
MAX_RESULTS = 10
directory = config.get('directory_path','directory')
sub_directory = config.get('directory_path','sub_directory')    

for x in os.walk(directory+'/'+sub_directory):
    sports_list = x[1]
    break;

for sport in sports_list:
    sport_path = search_path+sport+'/*/*.json'
    top_word = dict()
    for file in glob.glob(sport_path):
        with open(file, 'r') as f:
            #print(file)
            data = json.load(f)
            paragraph = data['lead_paragraph']            
            tagged_sent = pos_tag((paragraph.replace('.','').replace(',','')).split())
            
            tri = [chunk for chunk in ne_chunk(tagged_sent) if isinstance(chunk, Tree)]
            named_entities = []
            for t in tri:
                if t.label() == 'PERSON':
                    for c in t:
                        named_entities.append(c[0])
            
            #propernouns = [word for word,pos in tagged_sent if pos == 'NNP']
            for noun in named_entities:
                if noun not in top_word:
                    top_word[noun] = 1
                else:
                    top_word[noun] = top_word[noun] + 1
            #print(propernouns)
    #sorted_top_words = sorted(top_word.items(), key=operator.itemgetter(1),reverse=True)
    sorted_top_words = [(k, top_word[k]) for k in sorted(top_word, key=top_word.get, reverse=True)]
    sports_results[sport] = sorted_top_words[:MAX_RESULTS]
    
#print(sports_results)
print("Most discussed people in each sports in New York Times")
for sport,people in sports_results.items():
    print("\n"+sport+":")
    for person in people:
        print(str(person[1])+" - "+person[0] )

Most discussed people in each sports in New York Times

Auto Racing:
1 - Antonio
1 - Giovinazzi
1 - Jost
1 - Capito
1 - Ron
1 - Dennis
1 - Lewis
1 - Hamilton

Baseball:
2 - Edwin
2 - Encarnacion
2 - Outfielder
1 - Shortstop
1 - Jimmy
1 - Rollins
1 - Daniel
1 - Hudson
1 - Pitcher
1 - Nick

College Basketball:
13 - State
5 - Texas
4 - Kentucky
4 - Brown
4 - Kelsey
4 - Florida
4 - Saint
4 - Josh
3 - Bob
3 - Huggins

College Football:
3 - Mike
3 - Ohio
3 - State
3 - Clemson
3 - Dabo
3 - Swinney
3 - Williams
2 - Leonard
2 - Fournette
2 - Bob

Cricket:
8 - Sri
8 - Lanka
6 - Steve
3 - Smith
3 - Stephen
2 - Karun
2 - Nair
2 - Virat
2 - Kohli
2 - Cook

Cycling:
3 - Team
3 - Bradley
3 - Wiggins
2 - Sky
1 - Dave
1 - Brailsford
1 - Cycling
1 - Stephen
1 - Park
1 - GB

Golf:
2 - Tiger
2 - Woods
1 - Solheim
1 - Cup
1 - Helen
1 - Alfredsson
1 - Jack
1 - Nicklaus
1 - Notah
1 - Begay

Hockey:
9 - Jaromir
7 - Jagr
4 - John
3 - Tortorella
3 - Mike
2 - Henrik
2 - Coach
2 - Ryan
2 - Bill
2 - Aleksander

Ol