In [7]:
import pandas as pd
import pandas as pd
import numpy as np
import folium
import json
import requests
import os
from shapely.geometry import Point, Polygon

In [8]:
# geojson files exported from .shp with QGIS (EPSG:4326 WGS 84)
warsaw_geofile = 'geodata/Warsaw_districts/warsaw_districts.geojson'
cracow_geofile = 'geodata/Cracow_districts/cracow_districts.geojson'

In [9]:
with open(warsaw_geofile) as w:
    warsaw_geojson = json.load(w)

In [10]:
with open(cracow_geofile) as c:
    cracow_geojson = json.load(c)

In [12]:
warsaw_districts_names = []
for x in warsaw_geojson['features']:
    warsaw_districts_names.append(x['properties']['nazwa_dzie'])

In [14]:
test_density = list(range(len(warsaw_districts_names)))

In [15]:
df = pd.DataFrame([warsaw_districts_names, test_density]).T
df.columns = ['nazwa_dzie', 'Density']

In [16]:
warsaw_coordinates = [52.2297700, 21.0117800]
warsaw_map = folium.Map(location=warsaw_coordinates, zoom_start=11)
warsaw_map.choropleth(
    geo_data=warsaw_geojson,
    name='choropleth',
    data=df,
    columns=['nazwa_dzie', 'Density'],
    key_on='feature.properties.nazwa_dzie',
    fill_color='YlGn',
    fill_opacity=0.7,
    line_opacity=0.9,
    legend_name='Density'
)
#warsaw_map

In [10]:
# cracow_coordinates = [50.06143, 19.93658]
# cracow_map = folium.Map(location=cracow_coordinates, zoom_start=11)

# folium.GeoJson(
#     cracow_geojson,
#     name='geojson'
# ).add_to(cracow_map)
# cracow_map

In [17]:
CLIENT_ID = os.environ.get('FOURSQUAREID')
CLIENT_SECRET = os.environ.get('FOURSQUARESECRET')
VERSION = '20200605'
LIMIT = 100
OFFSET=LIMIT

In [18]:
def get_district_centers(city_geojson):
    district_centers = {}
    for district in city_geojson['features']:
        district_geometry = pd.DataFrame(
            district['geometry']['coordinates'][0][0],
            columns=['longitude', 'latitude']
        )
        district_center = [[district_geometry['latitude'].mean(), district_geometry['longitude'].mean()]]
        district_centers[district['properties']['nazwa_dzie']]=district_center
    
    district_centers = pd.DataFrame.from_dict(district_centers,
                                              orient='index',
                                              columns=['District_center'])
    
    return district_centers

In [19]:
warsaw_district_centers = get_district_centers(warsaw_geojson)

In [None]:
for center in warsaw_district_centers.index:
    folium.Marker(warsaw_district_centers.loc[center, 'District_center']).add_to(warsaw_map)
    #print(center)

In [20]:
def get_venues(district_center, radius):
    RADIUS = radius
    lat, lng = district_center
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            RADIUS, 
            LIMIT)
    result = requests.get(url).json()
    
    venues = result['response']['groups'][0]['items']
    
    total_results = result['response']['totalResults']
    print('\tTotal results: ', total_results, '\n')
    
    #checking if there is more results -if true, next request with offset is send
    requests_to_perform = total_results//100
    
    for _ in range(requests_to_perform):
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}&offset={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            RADIUS, 
            LIMIT,
            OFFSET
        )
        result = requests.get(url).json()
        venues.extend(result['response']['groups'][0]['items'])
        
    return venues

In [21]:
def calculate_radius(district_name):
    center_coords = warsaw_district_centers.loc[district_name, 'District_center']
    center_point = Point(center_coords[1], center_coords[0])
    
    polygon_points = []
    for point_coords in warsaw_districts_polygons [district_name]:
        polygon_point = Point(point_coords[0], point_coords[1])
        polygon_points.append(polygon_point)
    
    distances = []
    for point in polygon_points:
        distance_to_center = center_point.distance(point)
        distances.append(distance_to_center)
        
    max_distance = max(distances)
    factor = 0.6157*111.3*1000
    max_distance_meters = int(max_distance*factor)
    
    return max_distance_meters

In [23]:
def create_districts_polygons(geojson):
    polygons = {}
    for district in geojson['features']:
        polygons[district['properties']['nazwa_dzie']]=district['geometry']['coordinates'][0][0]
    return polygons

In [24]:
warsaw_districts_polygons = create_districts_polygons(warsaw_geojson)

In [25]:
warsaw_districts_radiuses = {}
for district in warsaw_district_centers.index:
    warsaw_districts_radiuses[district] = calculate_radius(district)

In [26]:
warsaw_districts_venues_foursquare = {}
for district in warsaw_district_centers.index:
    print(district)
    warsaw_districts_venues_foursquare[district] = get_venues(
        warsaw_district_centers.loc[district, 'District_center'],
        warsaw_districts_radiuses[district])

Żoliborz
	Total results:  154 

Praga-Południe
	Total results:  238 

Mokotów
	Total results:  234 

Wola
	Total results:  209 

Wilanów
	Total results:  156 

Wesoła
	Total results:  17 

Wawer
	Total results:  99 

Włochy
	Total results:  225 

Ursynów
	Total results:  143 

Śródmieście
	Total results:  234 

Praga-Północ
	Total results:  155 

Ursus
	Total results:  48 

Targówek
	Total results:  146 

Rembertów
	Total results:  37 

Ochota
	Total results:  167 

Bielany
	Total results:  100 

Białołęka
	Total results:  127 

Bemowo
	Total results:  60 



In [28]:
def check_if_inside_district(venue_coords, district_shape):
    p = Point(venue_coords)
    poly = Polygon(district_shape)
    return p.within(poly)

In [31]:
def extract_data(district,district_venues_foursquare):
    district_venues=[]
    for item in district_venues_foursquare:
        x={}
        x['District']=district
        x['Name']=item['venue']['name']
        x['Category']=item['venue']['categories'][0]['name']
        x['Lat'] = item['venue']['location']['lat']
        x['Lon'] = item['venue']['location']['lng']
        x['VenueId'] = item['venue']['id']
        district_venues.append(x)
    district_df = pd.DataFrame(district_venues)
    return district_df

In [32]:
def get_districts_venues(city_districts_venues):
    all_city_venues= pd.DataFrame()
    for district in city_districts_venues.keys():
        district_df = extract_data(district, city_districts_venues[district])
        district_df['District'] = district
        all_city_venues = all_city_venues.append(district_df)
    return all_city_venues

In [33]:
warsaw_districts_venues = get_districts_venues(warsaw_districts_venues_foursquare)

In [61]:
for key in warsaw_districts_venues.keys():    
    for item in warsaw_districts_venues[key]:
        #print(item['venue']['name'], ' ', item['venue']['categories'][0]['name'])
        folium.Marker([item['venue']['location']['lat'], item['venue']['location']['lng']]).add_to(warsaw_map)

In [6]:
%history -g

 2/1:
def read_landsat_images (folder_name):
    file_list = os.listdir(folder_name)
    chanel_list = []
    for f in fiel_list:
        if (f.startswith('LC') and f.endswith('.tif')):
            if  'band' in f:
                chanel_list.append(folder_name +f)
    channel_list.sort()
    channel_numbers = np.arrange(1,8)
    bands_dictionary = dict(zip(channel_numbers, channel_list))
    return bands_directory

#test
satelite_images = read_landsat_images('LC081900232018080301T1-SC20181218151707')
for band in satelite_images:
    print(band, satelite_images[band])
 2/2:
import os
import numpy as np
import rasterio as rio
import rasterio.mask as rmask
import fiona as fio
import matplotlib as plt
 2/3:
# Kod wpisujemy normalnie w komórkach. Shift + Enter realizuje kod w danej komórce.
# Komenda poniżej obowiązkowa w celu poprawnego wyświetlania obrazów.
%matplotlib notebook
 2/4:
def read_landsat_images (folder_name):
    file_list = os.listdir(folder_name)
    chanel_list = []
    f

21/160: ufo_df.drop(ufo_df[ufo_df.Date_occured_len !=8].index, inplace=True)
21/161: pd.to_datetime(ufo_df.loc[0]['Date_occured'],format='%Y%m%d')
21/162: ufo_df['zerodate'] = ufo_df['Date_occured'].astype(str).str.match(r'....00')
21/163: ufo_df.loc[ufo_df['zerodate'] == True]
21/164: ufo_df.drop(ufo_df.loc[ufo_df['zerodate'] == True].index, inplace=True)
21/165: ufo_df.sort_values(by=['Date_occured']).head()
21/166: ufo_df.drop(40901, inplace=True)
21/167: ufo_df['Date_occured']=pd.to_datetime(ufo_df['Date_occured'], format='%Y%m%d')
21/168: ufo_df['Date_reported']=pd.to_datetime(ufo_df['Date_reported'], format='%Y%m%d')
21/169: ufo_df.head()
21/170: mask = ufo_df.Date_occured>('1999-01-01')
21/171: since_1999 = ufo_df[mask]
21/172: since_1999.head()
21/173: ufo_df.describe(include=['datetime64'])
21/174: ufo_df.dtypes
21/175: new=ufo_df['Location'].str.split(',')
21/176: new.head()
21/177: new.head(20)
21/178: new['len'] = new.str.len()
21/179: new.head()
21/180: newest = new.str.le

22/330: akdf.describe()
22/331:
all_dfs=[]
for state in states_abbreviations:
    df = df2[df2.State==state]
    all_dfs.append(df)
22/332: all_dfs
22/333: all_dfs[0]
22/334: all_dfs[49]
22/335: len(all_dfs)
22/336: len(states_abbreviations)
22/337: len(states_abbreviations)
22/338: len(states_abbreviations)
22/339: all_dfs_2 = [df2.State==state for state in states_abbreviations]
22/340: all_dfs_2[0]
22/341: all_dfs_2 = [df2[df2.State==state] for state in states_abbreviations]
22/342: all_dfs_2[0]
22/343: all_dfs = [df2[df2.State==state] for state in states_abbreviations]
22/344:
for df in all_dfs:
    plt.plot(df.YearMonth, df.Observations)
22/345:
for df in all_dfs:
    plt.figure()
    plt.plot(df.YearMonth, df.Observations)
23/1:
for df in all_dfs[:2]:
    plt.figure()
    plt.plot(df.YearMonth, df.Observations)
23/2: import pandas as pd
23/3: import pandas as pd
23/4: ufo_path='/users/wioletanytko/documents/workspace/r/umapro/01-Introduction/data/ufo/ufo_awesome.tsv'
23/5:
ufo_df 

    return all_email_messages
32/68: all_spam = get_messages(spam_path)
32/69: all_spam
32/70: all_spam[0]
32/71: print(all_spam[0])
32/72: print(all_spam[99])
32/73: print(all_spam[199])
33/1:
def get_messages(path_to_directory):
    file_names = os.listdir(path_to_directory)
    path_to_all_files = [path_to_directory + file_name for file_name in file_names if file_name != 'cmds']
    all_email_messages = []
    
    for file in path_to_all_files:
        email = open(file, encoding='latin1').read()
        header_end_index = email.find('\n\n')
        email_message = email[header_end_index:]
        all_email_messages.append(email_message)
        
    return all_email_messages
33/2:
import pandas as pd
import matplotlib.pyplot as plt
import os
33/3:
spam_path = 'Spam/data/spam/'
spam2_path = 'Spam/data/spam2/'
easyham_path = 'Spam/data/easy_ham/'
easyham2_path = 'Spam/data/easy_ham2/'
hardham_path = 'Spam/data/hard_ham/'
hardham2_path = 'Spam/data/har_ham_2/'
33/4:
def get_messages(

40/96: matrix_spam.sort_values(by=['sum'],ascending=False).head()
40/97: all_ham = get_messages(spam_path)[:500]
40/98: ham_dicts = [count_words(message) for message in all_ham]
40/99:
matrix_ham = pd.DataFrame(ham_dicts)
matrix_ham = matrix_ham.transpose()
40/100:
matrix_ham['sum']=matrix_ham.sum(axis=1)
matrix_ham=matrix_ham.loc[matrix_ham['sum']>1 ]
40/101: matrix_ham['number_of_documents']=(matrix_ham.count(axis=1)-1)
40/102: matrix_ham['documents_percentage']=matrix_ham['number_of_documents']/len(all_ham)
40/103: matrix_ham['word_density'] = matrix_ham['sum']/matrix_ham['sum'].sum()
40/104: matrix_ham.sort_values(by=['sum'],ascending=False).head()
40/105:
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
from collections import Counter
40/106:
spam_path = 'Spam/data/spam/'
spam2_path = 'Spam/data/spam2/'
easyham_path = 'Spam/data/easy_ham/'
easyham2_path = 'Spam/data/easy_ham2/'
hardham_path = 'Spam/data/hard_ham/'
hardham2_path = 'Spam/data/har_ham_2/'
40/10

47/32: fdist = FreqDist(word.lower() for word in word_tokenize(all_spam[0]))
47/33: fdist
47/34: fdist['home']
47/35: fdist.keys()
47/36: print(len(fdist.keys()))
47/37: len(word_tokenize(all_spam[0]))
47/38: fdist.plot()
47/39: help(nltk.tokenize)
47/40: len(fdist.keys())
47/41: word_tokenize(all_spam[0])
47/42: len(word_tokenize(all_spam[0]))
47/43: fdist = FreqDist(word for word in word_tokenize(all_spam[0]))
47/44: fdist.plot()
47/45: fdist['home']
47/46: fdist.keys()
47/47: len(fdist.keys())
47/48: len(word_tokenize(all_spam[0]))
47/49: len(count_words(all_spam[0]).keys())
47/50: help(nltk.tokenize)
47/51: len(word_tokenize(all_spam[0]))
47/52:
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.probability import FreqDist
47/53: wordpunt_tokenize(all_spam[0])
47/54: wordpunct_tokenize(all_spam[0])
47/55: len(wordpunct_tokenize(all_spam[0]))
48/1: stowp_words_pl = stopwords.words('polish')
48/2:
import pandas as pd
import matplotlib.pyplot as plt
import os
import

51/97: xxx.head()
51/98: xxx.head(10)
51/99: xxx = create_matrix([all_spam[0]])
51/100: xxx = create_matrix([all_spam[1]])
51/101: xxx.head(10)
51/102: xxx = create_matrix([all_spam[1]])
51/103: xxx.head(10)
51/104: xxx
51/105:
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
from collections import Counter
import numpy as np
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
51/106:
#import nltk
#nltk.download()
51/107:
spam_path = 'Spam/data/spam/'
spam2_path = 'Spam/data/spam2/'
easyham_path = 'Spam/data/easy_ham/'
easyham2_path = 'Spam/data/easy_ham2/'
hardham_path = 'Spam/data/hard_ham/'
hardham2_path = 'Spam/data/har_ham_2/'
51/108: ps = PorterStemmer()
51/109: stop_words = stopwords.words('english')
51/110: stop_words_pl = stopwords.words('polish')
51/111:
def get_messages(path_to_directory):
    file_names = os.listdir(path_to_directory)
    path_t

    print(x)
66/16:
for x in dates_lines[100:120]:
    print(x)
66/17:
for x in dates_lines[900:1200]:
    print(x)
66/18:
for x in dates_lines[:5]:
    print(x)
66/19: dates_lines = [datestr[7:] for datestr in date_lines]
66/20: dates_lines = [datestr[7:] for datestr in dates_lines]
66/21: dates_lines
66/22: dates_lines = [datestr[6:] for datestr in dates_lines]
66/23: dates_lines
66/24:
dates_lines = []
for header_lines in headers_lines:
    for line in header_lines:
        if line.startswith('Date:'):
            dates_lines.append(line)
66/25:
for x in dates_lines[:5]:
    print(x)
66/26: dates_lines = [datestr[:] for datestr in dates_lines]
66/27: dates_lines
66/28: dates_lines = [datestr[6:] for datestr in dates_lines]
66/29: dates_lines
66/30: t = pd.to_datetime(dates_lines[0])
66/31: t = pd.to_datetime(dates_lines[1])
66/32: t
66/33: t = pd.to_datetime(dates_lines[], error='coerce')
66/34: t = pd.to_datetime(dates_lines, error='coerce')
66/35: t = pd.to_datetime(dates_lines, e

emails_threads = all_emails_training[all_emails_training.is_thread==True]
emails_threads.drop(['Message', 'is_thread'], axis=1, inplace=True)
77/88: emails_threads.head()
77/89: emails_threads_senders = emails_threads.groupby(by=['From'])
77/90: emails_threads_senders.head()
77/91: emails_threads_senders = emails_threads.groupby(by=['From'].count())
77/92: emails_threads_senders = emails_threads.groupby(by=['From']).count()
77/93: emails_threads_senders.head()
77/94: emails_threads_senders = emails_threads.groupby(by=['From']).count().sort_values(by=['Subject'])
77/95: emails_threads_senders.head()
77/96: emails_threads_senders.tail()
77/97:
emails_threads_senders['Weight'] = np.log(emails_threads_senders['Subject']+1)
emails_threads_senders = emails_threads_senders.Weight
77/98: emails_threads_senders.head()
77/99: emails_threads_senders.tail()
78/1: all_emails_training.groupby(by=['thread_subject'])
78/2:
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import 

80/203: all_emails_training.at[1,'thread_subject']
81/1: import pandas as pd
82/1: %run Priority_box
82/2: %run Priority_box.py
85/1:
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import numpy as np
import datetime

from email.utils import parseaddr
85/2:
easyham_path = 'Spam/data/easy_ham/'
easyham2_path = 'Spam/data/easy_ham2/'
85/3:
def get_emails(path_to_directory):
    file_names = os.listdir(path_to_directory)
    path_to_all_files = [path_to_directory + file_name for file_name in file_names if file_name != 'cmds']
    all_email_messages = []
    all_email_headers = []
    
    for file in path_to_all_files:
        email = open(file, encoding='latin1').read()
        header_end_index = email.find('\n\n')
        
        email_header = email[:header_end_index]
        all_email_headers.append(email_header)
        
        email_message = email[header_end_index+2:].lower()
        all_email_messages.append(email_message)
        
    return all_email_he

from_lines = []
for header_lines in headers_lines:
    for line in header_lines:
        if line.startswith('From:'):
            from_lines.append(line)
86/89:
from_addresses = []
for email in from_lines:
    parser_results = parseaddr(email)
    from_addresses.append(parser_results[1])
86/90:
subject_lines = []
for header_lines in headers_lines:
    for line in header_lines:
        if line.startswith('Subject:'):
            subject_lines.append(line)
86/91:
subjects=[]
for subject_line in subject_lines:
    subject = re.findall(r'Subject: (.+)', subject_line)
    subjects.append(subject[0].lower())
86/92:
dates_lines = []
for header_lines in headers_lines:
    for line in header_lines:
        if line.startswith('Date:'):
            dates_lines.append(line)
86/93: dates_lines = [datestr[6:] for datestr in dates_lines]
86/94: send_datetime = pd.to_datetime(dates_lines, errors='coerce')
86/95: emails_data = [send_datetime, from_addresses, subjects, messages]
86/96:
all_emails_data =

91/11: dates_lines = [datestr[6:] for datestr in dates_lines]
91/12: send_datetime = pd.to_datetime(dates_lines, errors='coerce')
91/13: emails_data = [send_datetime, from_addresses, subjects, messages]
91/14:
all_emails_data = pd.DataFrame(emails_data)
all_emails_data = all_emails_data.transpose()
all_emails_data.columns = ['Send_datetime', 'From', 'Subject', 'Message']
all_emails_data = all_emails_data.sort_values(by=['Send_datetime'])
all_emails_data.reset_index(inplace=True)
91/15: all_emails_training = all_emails_data.loc[:1250]
91/16: len(all_emails_training)
91/17: grouped_from = all_emails_training.groupby(by=['From']).count().sort_values(by=['Message'])
91/18:
grouped_from['Weight'] = np.log(grouped_from['Message']+1)
grouped_from = grouped_from.Weight
91/19:
#plt.plot(grouped_from.index, grouped_from.Weight, 'or')
#plt.plot(grouped_from.index, grouped_from.Message, 'ob')
#plt.rcParams['figure.figsize']=(24,16)
91/20: all_emails_training['is_thread'] = all_emails_training.Subj

emails_tdm = make_tdm(all_emails_training.Message)
emails_tdm.loc['weight'] = emails_tdm.loc['weight']/2
emails_tdm.tail()
96/54:
#grouped_from.tail(20 -social activity measure
#emails_threads_senders.tail() -thread senders activity measure
#emails_threads.thread_density -thread activity measure
#thread_tdm.weight -words in active threads mesure
#emails_tdm.weight -words in all messages measure
96/55:
def get_from_measure(senders):
    x = [grouped_from.get(sender,1) for sender in senders]
    result = pd.Series(x)
    print('get_from_measure: \n', result.sort_values())
    return result
96/56:
def get_from_thread_measure(senders):
    x = [emails_threads_senders.get(sender,1) for sender in senders]
    result = pd.Series(x)
    print('get_from_thread_measure: \n', result.sort_values())
    return result
96/57:
def get_thread_activity_measure(emails):    
    emails['is_thread'] = emails.Subject.str.contains('re\[{0,1}[1-9]{0,1}[1-9]{0,1}\]{0,1}:')
    
    threads_subjects=[]
    for 

def calculate_weight(words_set):
    result = emails_tdm[list(set(emails_tdm.columns).intersection(words_set))].mean(axis=1)['weight']
    return result
100/190:
def get_words_in_thread_measure(emails):
    messages = emails.Message.copy()
    messages_words = [count_words_in_thread_weight(message) for message in messages]
    z=pd.Series(messages_words)
    
    x = pd.DataFrame(z)
    x['weight'] = x[0].apply(calculate_weight)
    
    return x
100/191: x = get_words_in_thread_measure(test_emails_data)
100/192: x
100/193:
def rank_emails(emails):
    from_measure = get_from_measure(emails.From)
    from_thread_measure = get_from_thread_measure(emails.From)
    thread_activity_measure = get_thread_activity_measure(emails)
    words_in_thread_measure = get_words_in_thread_measure(emails)
    words_in_all_messages_measure = 1#get_words_in_all_messages_measure(emails)
    
    rank = from_measure*from_thread_measure*thread_activity_measure*words_in_thread_measure*words_in_all_messages_me

100/457:
def rank_emails(emails):
    from_measure = get_from_measure(emails.From)
    from_thread_measure = get_from_thread_measure(emails.From)
    thread_activity_measure = get_thread_activity_measure(emails)
    words_in_thread_measure = get_words_in_thread_measure(emails)
    words_in_all_messages_measure = get_words_in_all_messages_measure(emails)
    
    test_rank = pd.DataFrame([from_measure,
                              from_thread_measure,
                              thread_activity_measure,
                              words_in_thread_measure,
                              words_in_all_messages_measure]).T
    
    test_rank['final_rank']=test_rank.product(axis=1)
    
    test_rank.columns = ['from_measure', 'from_thread_measure','thread_activity_measure',
                         'words_in_thread_measure', 'words_in_all_messages_measure', 'final_rank']
    
    print('Test rank: \n', test_rank['final_rank'])
    
    #rank = from_measure*from_thread_measure*thread_act

grid=True
108/20: help(plt.grid)
108/21:
plt.scatter(top_sites.UniqueVisitors, top_sites.PageViews )
grid(True)
108/22:
plt.scatter(top_sites.UniqueVisitors, top_sites.PageViews )
plt.grid
108/23:
plt.scatter(top_sites.UniqueVisitors, top_sites.PageViews )
plt.grid = True
108/24:
plt.scatter(top_sites.UniqueVisitors, top_sites.PageViews )
plt.grid = on
108/25:
plt.scatter(top_sites.UniqueVisitors, top_sites.PageViews )
plt.grid = True
108/26:
plt.scatter(top_sites.UniqueVisitors, top_sites.PageViews )
plt.grid()
110/1:
import pandas as pd
import matplotlib.pyplot as plt
110/2: top_sites = pd.read_csv('Regression/top_1000_sites.tsv', delimiter='\t' )
110/3: top_sites.head()
110/4: help(plt.grid)
110/5:
plt.scatter(top_sites.UniqueVisitors, top_sites.PageViews )
plt.grid()
110/6: help(plt.xlabel)
110/7:
plt.scatter(top_sites.UniqueVisitors, top_sites.PageViews )
plt.grid()
plt(xlabel='Unique_visitors')
110/8:
plt.scatter(top_sites.UniqueVisitors, top_sites.PageViews )
plt.grid()
plt.xlab

120/14: y_model - y
120/15:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
120/16: (y_model - y).mean()
120/17: (y_model - y).max()
120/18: df = pd.DataFrame(x_squared, y)
120/19: df = pd.DataFrame(x_squared, y, columns=['x_squared', 'y'])
120/20: df = pd.DataFrame([x_squared, y], columns=['x_squared', 'y'])
120/21: df
120/22: df = pd.DataFrame(x_squared, columns=['x_squared'])
120/23: df
120/24: df = pd.DataFrame(x_squared).join(y, inplace=True)
120/25: df = pd.DataFrame(x_squared).join(y)
120/26: y
120/27: df = pd.DataFrame({'x_squared': x_squared, 'y': y})
120/28: df
120/29:
model = smf.ols(formula='y ~ x_squared', model=df)
model.summary()
120/30:
model = smf.ols(formula='y ~ x_squared', data=df)
model.summary()
120/31:
model = smf.ols(formula='y ~ x_squared', data=df).fit()
model.summary()
120/32: x_squared.corr(y)
120/33: x_squared.corr(y)**2
120/34: x_squared.corr(y)
120/35:
#r2
x_squared.

124/31: (sk_result.coef_)**2
124/32: sk_result.coef_
124/33: l2_model_complexity = (sk_result.coef_)[0]**2.sum()
124/34: sk_result.coef_[0]
124/35: sk_result.coef_[0].sum()
124/36: sk_result.coef_[0]**2
124/37: sk_result.coef_[0]**2.sum()
124/38: (sk_result.coef_[0]**2).sum()
124/39: l2_model_complexity = (sk_result.coef_[0]**2).sum()
124/40: l1_model_complexity = (abs(sk_result.coef_)).sum()
124/41: l1_model_complexity, l2_model_complxity
124/42: l1_model_complexity, l2_model_complexity
124/43:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from skelearn.preprocessing import PolynomialFeatures

%matplotlib inline
124/44:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
from skl

135/20:
# Since https://stats.nba.com does lot allow api calls from Cloud IPs and Skills Network Labs uses a Cloud IP.
# The following code is comment out, you can run it on jupyter labs on your own computer.
gamefinder.get_json()
135/21:
# Since https://stats.nba.com does lot allow api calls from Cloud IPs and Skills Network Labs uses a Cloud IP.
# The following code is comment out, you can run it on jupyter labs on your own computer.
games = gamefinder.get_data_frames()[0]
games.head()
135/22:
games_home=games [games ['MATCHUP']=='GSW vs. TOR']
games_away=games [games ['MATCHUP']=='GSW @ TOR']
135/23: games_home.mean()['PLUS_MINUS']
135/24: games_away.mean()['PLUS_MINUS']
135/25:
fig, ax = plt.subplots()

games_away.plot(x='GAME_DATE',y='PLUS_MINUS', ax=ax)
games_home.plot(x='GAME_DATE',y='PLUS_MINUS', ax=ax)
ax.legend(["away", "home"])
plt.show()
136/1:
import pandas as pd
from bokeh.plotting import figure, output_file, show, output_notebook
output_notebook()
136/2:
def make_dashboa

141/296:
import pandas as pd
import numpy as np
import geocoder
import folium
import os
141/297:
CLIENT_ID = os.environ.get('FOURSQAUREID')
CLIENT_SECRET = os.environ.get('FOURSQARESECRET')
VERSION = '20180605'
LIMIT = 50
141/298:
def get_nearby_venues(neighborhood, latitudes, longitudes, radius=500):
    venues = []
    
    for name, lat, lng in zip(names, latitudes, longitudes):
    print(name)

    # create the API request URL
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID, 
        CLIENT_SECRET, 
        VERSION, 
        lat, 
        lng, 
        radius, 
        LIMIT)
    
    results = requests.get(url).json()["response"]['groups'][0]['items']

    venues_list.append([(
        name, 
        lat, 
        lng, 
        v['venue']['name'], 
        v['venue']['location']['lat'], 
        v['venue']['location']['lng'],  
        v['venue']['categories'][0]['name']) for v in re

145/4: poznan_old_market = folium.Map(location=poznan_coordinates)
145/5: poznan_old_market
145/6:
poznan_coordinates = [52.4069200, 16.9299300]
# zerokość: 52.4069200° 
# Długość: 16.9299300°
145/7: poznan_old_market = folium.Map(location=poznan_coordinates)
145/8: poznan_old_market
145/9: poznan_old_market = folium.Map(location=poznan_coordinates, zoom=6)
145/10: poznan_old_market
145/11: poznan_old_market = folium.Map(location=poznan_coordinates, zoom_start=6)
145/12: poznan_old_market
145/13: poznan_old_market = folium.Map(location=poznan_coordinates, zoom_start=12)
145/14: poznan_old_market
145/15: poznan_old_market = folium.Map(location=poznan_coordinates, zoom_start=13)
145/16: poznan_old_market
145/17: poznan_old_market = folium.Map(location=poznan_coordinates, zoom_start=16)
145/18: poznan_old_market
145/19: warsaw_geofile = 'geodata/warsaw_districts/dzielnice_Warszawy.shp'
145/20: w = open(warsaw_geofile)
145/21:
for x in range(10):
    w.read_line()
145/22: w.read()
145/23: 

145/349: lat, lng = warsaw_coordinates
145/350:
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            RADIUS, 
            LIMIT)
145/351: result = requests.get(url).json()
145/352: #r = requests.get(url)
145/353:
# number of items depends on LIMIT parameter from the URL
len(result['response']['groups'][0]['items'])
145/354: result
145/355:
CLIENT_ID = os.environ.get('FOURSQUAREID')
CLIENT_SECRET = os.environ.get('FOURSQUARESECRET')
VERSION = '20200605'
LIMIT = 100
RADIUS=150
145/356:
#folium.Marker(warsaw_coordinates).add_to(warsaw_map)
#warsaw_map
145/357: lat, lng = warsaw_coordinates
145/358:
CLIENT_ID = os.environ.get('FOURSQUAREID')
CLIENT_SECRET = os.environ.get('FOURSQUARESECRET')
VERSION = '20200605'
LIMIT = 100
RADIUS=15000
145/359:
CLIENT_ID = os.environ.get('FOURSQUAREID')
CLIENT_SECR

150/297:
def extract_data(district_venues_foursquare):
    district_venues=[]
    for item in district_venues_foursquare:
        x={}
        #x['District']=district
        x['Name']=item['venue']['name']
        x['Category']=item['venue']['categories'][0]['name']
        x['Lat'] = item['venue']['location']['lat']
        x['Lon'] = item['venue']['location']['lng']
        x['VenueId'] = item['venue']['id']
        district_venues.append(x)
    district_df = pd.DataFrame(district_venues)
    return district_df
150/298: z = extract_data(x)
150/299: z
150/300: x = get_venues(warsaw_district_centers.loc['Wesoła', 'District_center'])
150/301: z = extract_data(x)
150/302: z
150/303:
def pin_venues(venues_df,city_map):
    for i in venues_df.index:
        folium.Marker([venues_df.loc[i,'Lat'], venues_df.loc[i,'Lon']]).add_to(city_map)
150/304: pin_venues(z, warsaw_map)
150/305: warsaw_map
150/306: x = get_venues(warsaw_district_centers.loc['Białołęka', 'District_center'])
150/307: z = e

In [None]:
def pin_venues(venues_df,city_map):
    for i in venues_df.index:
        folium.Marker([venues_df.loc[i,'Lat'], venues_df.loc[i,'Lon']]).add_to(city_map)

In [34]:
warsaw_districts_venues.reset_index(inplace=True)

In [35]:
for i in warsaw_districts_venues.index:
    warsaw_districts_venues.loc[i, 'Inside'] = check_if_inside_district(
                                        [warsaw_districts_venues.loc[i,'Lon'], warsaw_districts_venues.loc[i,'Lat']],
                                        warsaw_districts_polygons[warsaw_districts_venues.loc[i,'District']])

In [36]:
warsaw_districts_venues = warsaw_districts_venues.loc[warsaw_districts_venues['Inside']==True]

In [37]:
warsaw_top5 = warsaw_districts_venues.groupby('Category').count().sort_values(by='VenueId', ascending=False).head()

In [38]:
warsaw_top5

Unnamed: 0_level_0,index,District,Name,Lat,Lon,VenueId,Inside
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Café,66,66,66,66,66,66,66
Park,56,56,56,56,56,56,56
Italian Restaurant,49,49,49,49,49,49,49
Coffee Shop,38,38,38,38,38,38,38
Supermarket,36,36,36,36,36,36,36


In [39]:
cracow_districts_names = []
for x in cracow_geojson['features']:
    cracow_districts_names.append(x['properties']['nazwa'])

In [41]:
cracow_test_density = list(range(len(cracow_districts_names)))
df = pd.DataFrame([cracow_districts_names, test_density]).T
df.columns = ['nazwa', 'Density']

In [43]:
def create_cracow_map():    
    cracow_coordinates = [50.06143, 19.93658]
    cracow_map = folium.Map(location=cracow_coordinates, zoom_start=11)
    cracow_map.choropleth(
        geo_data=cracow_geojson,
        name='choropleth',
        data=df,
        columns=['nazwa', 'Density'],
        key_on='feature.properties.nazwa',
        fill_color='YlGn',
        fill_opacity=0.7,
        line_opacity=0.9,
        legend_name='Density'
    )
    return cracow_map

cracow_map = create_cracow_map()

In [44]:
def create_cracow_polygons(geojson):
    polygons = {}
    for district in geojson['features']:
        polygons[district['properties']['nazwa']]=district['geometry']['coordinates'][0][0]
    return polygons

In [45]:
def get_cracow_centers(city_geojson):
    district_centers = {}
    for district in city_geojson['features']:
        district_geometry = pd.DataFrame(
            district['geometry']['coordinates'][0][0],
            columns=['longitude', 'latitude']
        )
        district_center = [[district_geometry['latitude'].mean(), district_geometry['longitude'].mean()]]
        district_centers[district['properties']['nazwa']]=district_center
    
    district_centers = pd.DataFrame.from_dict(district_centers,
                                              orient='index',
                                              columns=['District_center'])
    
    return district_centers
cracow_districts_centers = get_cracow_centers(cracow_geojson)

In [46]:
cracow_districts_polygons = create_cracow_polygons(cracow_geojson)

In [47]:
def calculate_cracow_radius(district_name):
    center_coords = cracow_districts_centers.loc[district_name, 'District_center']
    center_point = Point(center_coords[1], center_coords[0])
    
    polygon_points = []
    for point_coords in cracow_districts_polygons [district_name]:
        polygon_point = Point(point_coords[0], point_coords[1])
        polygon_points.append(polygon_point)
    
    distances = []
    for point in polygon_points:
        distance_to_center = center_point.distance(point)
        distances.append(distance_to_center)
        
    max_distance = max(distances)
    factor = 0.6157*111.3*1000
    max_distance_meters = int(max_distance*factor)
    
    return max_distance_meters

In [48]:
cracow_districts_radiuses = {}
for district in cracow_districts_centers.index:
    cracow_districts_radiuses[district] = calculate_cracow_radius(district)

In [49]:
cracow_districts_venues_foursquare = {}
for district in cracow_districts_centers.index:
    print(district)
    cracow_districts_venues_foursquare[district] = get_venues(
        cracow_districts_centers.loc[district, 'District_center'],
        cracow_districts_radiuses[district])

Stare Miasto
	Total results:  215 

Grzegórzki
	Total results:  155 

Prądnik Czerwony
	Total results:  63 

Prądnik Biały
	Total results:  118 

Krowodrza
	Total results:  204 

Bronowice
	Total results:  91 

Zwierzyniec
	Total results:  133 

Dębniki
	Total results:  51 

Łagiewniki-Borek Fałęcki
	Total results:  37 

Swoszowice
	Total results:  50 

Podgórze Duchackie
	Total results:  56 

Bieżanów-Prokocim
	Total results:  28 

Podgórze
	Total results:  209 

Czyżyny
	Total results:  84 

Mistrzejowice
	Total results:  28 

Bieńczyce
	Total results:  41 

Wzgórza Krzesławickie
	Total results:  17 

Nowa Huta
	Total results:  31 



In [50]:
cracow_districts_venues = get_districts_venues(cracow_districts_venues_foursquare)
cracow_districts_venues.reset_index(inplace=True)

In [51]:
for i in cracow_districts_venues.index:
    cracow_districts_venues.loc[i, 'Inside'] = check_if_inside_district(
                                        [cracow_districts_venues.iloc[i]['Lon'], cracow_districts_venues.loc[i]['Lat']],
                                        cracow_districts_polygons[cracow_districts_venues.iloc[i]['District']])

In [52]:
cracow_districts_venues = cracow_districts_venues.loc[cracow_districts_venues['Inside']==True]
cracow_top5 = cracow_districts_venues.groupby('Category').count().sort_values(by='VenueId', ascending=False).head()

In [53]:
cracow_map = create_cracow_map()
for i in cracow_districts_venues.index:    
    folium.Marker([cracow_districts_venues.loc[i,'Lat'], cracow_districts_venues.loc[i,'Lon']]).add_to(cracow_map)

In [54]:
top_categories = warsaw_top5.add(cracow_top5, fill_value=0)

In [55]:
top_5_categories = top_categories.head().index
top_5_categories

Index(['Café', 'Coffee Shop', 'Hotel', 'Italian Restaurant', 'Park'], dtype='object', name='Category')

In [56]:
warsaw_districts_wiki = 'https://pl.wikipedia.org/wiki/Podział_administracyjny_Warszawy'
cracow_districts_wiki = 'https://pl.wikipedia.org/wiki/Podział_administracyjny_Krakowa'

In [57]:
warsaw_districts_df = pd.read_html(warsaw_districts_wiki)

ImportError: BeautifulSoup4 (bs4) not found, please install it