In [294]:
%matplotlib inline
import numpy as np 
import pandas as pd 
import nltk
import matplotlib.pyplot as plt

In [295]:
# read the csv file
df = pd.read_csv('location_trends_2.csv')
df['tags'].describe()

count      500000
unique     305000
top       android
freq         2598
Name: tags, dtype: object

In [296]:
# perform the data cleaning and split the tags column into a list
df = df.fillna("")
df['tags'] = df['tags'].apply(lambda x: x.split('|'))
df

Unnamed: 0,id,location,tags
0,162338,"San Jose, CA, USA","[machine-learning, accelerometer]"
1,162338,"San Jose, CA, USA","[python, robotframework]"
2,162338,"San Jose, CA, USA",[data-structures]
3,162338,"San Jose, CA, USA","[video, image-processing]"
4,162338,"San Jose, CA, USA","[python, antlr4]"
...,...,...,...
499995,5031965,"Vevey, Switzerland","[python, python-3.x]"
499996,5031965,"Vevey, Switzerland","[python, python-3.x]"
499997,5031965,"Vevey, Switzerland","[python, python-3.x]"
499998,5031965,"Vevey, Switzerland","[python, django, django-views]"


In [297]:
# get the list of all tags
all_tags = [item for sublist in df['tags'].values for item in sublist]

In [298]:
# find the frequency distribution of those tags and fetch top 10 tags
keywords = nltk.FreqDist(all_tags)
frequency_dist=keywords.most_common(10)[0:]
top_10_tags=[freq[0] for freq in frequency_dist]
del keywords['']

#top_tags = ','.join(top_100_tags)
print(top_10_tags)

['javascript', 'java', 'c#', 'php', 'android', 'python', 'jquery', 'html', 'ios', 'c++']


In [300]:
# filter those records which have the top tags
def check_list(list):
    popular_count = 0
    for i in list:
        if i in top_100_tags:
            popular_count += 1
    return popular_count

df['popular_count'] = df['tags'].apply(lambda x: check_list(x))

# filter those records with top tags count greater than 4
df = df[df['popular_count'] > 4]

In [301]:
df['popular_count'].values

array([5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,

In [302]:
# group by location and tags
df = df.groupby('location')['tags'].apply(list).reset_index(name='tag_list')
df = df['location']
df.to_csv('location_list.csv', index=False)

In [256]:
# compute the latitude and longitude of the location using the maps API
import requests

def geocodeAddress(streetaddress):
    
    # replace with your api key
    apikey = '<your-api-key>'
    streetaddress.replace(' ', '+')
    url = 'https://maps.googleapis.com/maps/api/geocode/json?address='+streetaddress+'&key='+apikey
    
    req = requests.get(url)
    response = req.json()
    
    if response['results'] != []: 
        result = response['results'][0]
        geodata = str(result['geometry']['location']['lat']) + ',' + str(result['geometry']['location']['lng'])
    else:
        geodata = ''
    return geodata

In [257]:
df = df.head(10)
df['location'] = df['location'].apply(lambda x: geocodeAddress(x))
df.to_csv('geocodelocations.csv')
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,location,tag_list
0,"-34.8488008,138.5151734","[[javascript, php, jquery, html, css]]"
1,"57.0488195,9.921747","[[php, javascript, jquery, html, ajax]]"
2,"5.6037168,-0.1869644","[[javascript, php, jquery, ajax, laravel]]"
3,"23.022505,72.5713621","[[php, javascript, jquery, ajax, arrays]]"
4,"23.022505,72.5713621","[[javascript, php, jquery, ajax, json], [javas..."
5,"23.022505,72.5713621","[[javascript, jquery, html, css, ajax]]"
6,"43.529742,5.447426999999999","[[java, php, android, mysql, json]]"
7,"41.153332,20.168331","[[javascript, php, html, css, ajax]]"
8,"35.0843859,-106.650422","[[javascript, php, jquery, json, ajax]]"
9,,"[[javascript, mysql, node.js, mongodb, angular..."
