In [1]:
import pandas as pd
import pandas as pd
import numpy as np
import folium
import json
import requests
import os
from shapely.geometry import Point, Polygon

In [2]:
# geojson files exported from .shp with QGIS (EPSG:4326 WGS 84)
warsaw_geofile = 'geodata/Warsaw_districts/warsaw_districts.geojson'
cracow_geofile = 'geodata/Cracow_districts/cracow_districts.geojson'

In [3]:
with open(warsaw_geofile) as w:
    warsaw_geojson = json.load(w)

In [4]:
with open(cracow_geofile) as c:
    cracow_geojson = json.load(c)

In [5]:
districts_names = []
for x in warsaw_geojson['features']:
    districts_names.append(x['properties']['nazwa_dzie'])

In [7]:
test_density = list(range(len(districts_names)))

In [8]:
df = pd.DataFrame([districts_names, test_density]).T
df.columns = ['nazwa_dzie', 'Density']

In [59]:
warsaw_coordinates = [52.2297700, 21.0117800]
warsaw_map = folium.Map(location=warsaw_coordinates, zoom_start=11)
warsaw_map.choropleth(
    geo_data=warsaw_geojson,
    name='choropleth',
    data=df,
    columns=['nazwa_dzie', 'Density'],
    key_on='feature.properties.nazwa_dzie',
    fill_color='YlGn',
    fill_opacity=0.7,
    line_opacity=0.9,
    legend_name='Density'
)
#warsaw_map

In [10]:
# cracow_coordinates = [50.06143, 19.93658]
# cracow_map = folium.Map(location=cracow_coordinates, zoom_start=11)

# folium.GeoJson(
#     cracow_geojson,
#     name='geojson'
# ).add_to(cracow_map)
# cracow_map

In [57]:
CLIENT_ID = os.environ.get('FOURSQUAREID')
CLIENT_SECRET = os.environ.get('FOURSQUARESECRET')
VERSION = '20200605'
LIMIT = 100
RADIUS=5000
OFFSET=LIMIT

In [68]:
def get_district_centers(city_geojson):
    district_centers = {}
    for district in city_geojson['features']:
        district_geometry = pd.DataFrame(
            district['geometry']['coordinates'][0][0],
            columns=['longitude', 'latitude']
        )
        district_center = [[district_geometry['latitude'].mean(), district_geometry['longitude'].mean()]]
        district_centers[district['properties']['nazwa_dzie']]=district_center
    
    district_centers = pd.DataFrame.from_dict(district_centers,
                                              orient='index',
                                              columns=['District_center'])
    
    return district_centers

In [13]:
def get_venues(district_center):
    lat, lng = district_center
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            RADIUS, 
            LIMIT)
    result = requests.get(url).json()
    
    venues = result['response']['groups'][0]['items']
    
    total_results = result['response']['totalResults']
    print('Total results: ', total_results)
    
    requests_to_perform = total_results//100
    
    for _ in range(requests_to_perform):
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}&offset={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            RADIUS, 
            LIMIT,
            OFFSET
        )
        result = requests.get(url).json()
        venues.extend(result['response']['groups'][0]['items'])
        
    return venues

In [69]:
warsaw_district_centers = get_district_centers(warsaw_geojson)

In [58]:
warsaw_districts_venues = {}
for district in warsaw_district_centers.index:
    warsaw_districts_venues[district] = get_venues(warsaw_district_centers.loc[district, 'District_center'])

Total results:  239
Total results:  140
Total results:  230
Total results:  232
Total results:  153
Total results:  42
Total results:  56
Total results:  201
Total results:  142
Total results:  228
Total results:  236
Total results:  68
Total results:  137
Total results:  44
Total results:  233
Total results:  121
Total results:  73
Total results:  146


In [44]:
def check_if_inside_district(venue_coords, district_shape):
    p = Point(venue_coords)
    poly = Polygon(district_shape)
    return p.within(poly)

In [45]:
warsaw_district_centers.head()

Unnamed: 0,District_center
Żoliborz,"[52.2688536216614, 20.985135391429377]"
Praga-Południe,"[52.235168690871, 21.071138918572952]"
Mokotów,"[52.18866196405862, 21.052814696946914]"
Wola,"[52.22969762335711, 20.94634273758449]"
Wilanów,"[52.15030830502675, 21.091139416474352]"


In [46]:
for center in warsaw_district_centers.index:
    folium.Marker(warsaw_district_centers.loc[center, 'District_center']).add_to(warsaw_map)
    #print(center)

In [61]:
for key in warsaw_districts_venues.keys():    
    for item in warsaw_districts_venues[key]:
        #print(item['venue']['name'], ' ', item['venue']['categories'][0]['name'])
        folium.Marker([item['venue']['location']['lat'], item['venue']['location']['lng']]).add_to(warsaw_map)

In [62]:
warsaw_map

In [73]:
warsaw_districts_venues['Żoliborz'][0]

{'reasons': {'count': 0,
  'items': [{'summary': 'This spot is popular',
    'type': 'general',
    'reasonName': 'globalInteractionReason'}]},
 'venue': {'id': '4baf7aa5f964a52031033ce3',
  'name': 'Park Żeromskiego',
  'location': {'address': 'Mickiewicza',
   'lat': 52.26837710113502,
   'lng': 20.988746715108583,
   'labeledLatLngs': [{'label': 'display',
     'lat': 52.26837710113502,
     'lng': 20.988746715108583}],
   'distance': 251,
   'cc': 'PL',
   'city': 'Warszawa',
   'state': 'Województwo mazowieckie',
   'country': 'Polska',
   'formattedAddress': ['Mickiewicza', 'Warszawa', 'Polska']},
  'categories': [{'id': '4bf58dd8d48988d163941735',
    'name': 'Park',
    'pluralName': 'Parks',
    'shortName': 'Park',
    'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/parks_outdoors/park_',
     'suffix': '.png'},
    'primary': True}],
  'photos': {'count': 0, 'groups': []}},
 'referralId': 'e-0-4baf7aa5f964a52031033ce3-0'}

In [74]:
def get_districts_venues(city_districts_venues):
    all_city_venues= pd.DataFrame()
    for district in city_districts_venues.keys():
        print(district)
        district_venues=[]
        for item in city_districts_venues[district]:
            x={}
            x['District']=district
            x['Name']=item['venue']['name']
            x['Category']=item['venue']['categories'][0]['name']
            x['Lat'] = item['venue']['location']['lat']
            x['Lon'] = item['venue']['location']['lng']
            x['VenueId'] = item['venue']['id']
            district_venues.append(x)
        district_df = pd.DataFrame(district_venues)
        all_city_venues = all_city_venues.append(district_df)
    return all_city_venues

In [75]:
h = get_districts_venues(warsaw_districts_venues)

Żoliborz
Praga-Południe
Mokotów
Wola
Wilanów
Wesoła
Wawer
Włochy
Ursynów
Śródmieście
Praga-Północ
Ursus
Targówek
Rembertów
Ochota
Bielany
Białołęka
Bemowo


In [76]:
h

Unnamed: 0,District,Name,Category,Lat,Lon,VenueId
0,Żoliborz,Park Żeromskiego,Park,52.268377,20.988747,4baf7aa5f964a52031033ce3
1,Żoliborz,Galeria Wypieków,Bakery,52.268523,20.986111,55508b67498e2dcf9038f190
2,Żoliborz,Plac Wilsona,Plaza,52.268914,20.985587,4bb771276edc76b0a92e321c
3,Żoliborz,Prochownia Żoliborz,Café,52.268676,20.989228,50b68fd0e4b0132956a40f65
4,Żoliborz,Plac zabaw w Parku Żeromskiego,Playground,52.267248,20.988827,4db2f35a4b226b343d6d0581
...,...,...,...,...,...,...
141,Bemowo,husarska yoo,Trail,52.200740,20.920875,50962488e4b076702d276139
142,Bemowo,Carrefour Express,Grocery Store,52.198947,20.908864,4f4cfde76b7428f5d989fd1c
143,Bemowo,Bar Miss Sajgon,Vietnamese Restaurant,52.198636,20.913068,5087db74e4b02c9e3d8be924
144,Bemowo,Hotel Stawisko Klaudyn,Hotel,52.277663,20.864516,4e70da0eb0fba1302a5fabe7


In [77]:
len(h.VenueId.unique())

1183

In [80]:
h.shape

(1183, 6)

In [79]:
h.drop_duplicates(subset='VenueId',inplace=True)

In [88]:
def create_districts_polygons(geojson):
    polygons = {}
    for district in geojson['features']:
        polygons[district['properties']['nazwa_dzie']]=district['geometry']['coordinates'][0][0]
    return polygons

In [89]:
create_districts_polygons(warsaw_geojson)

{'Żoliborz': [[20.957550244360345, 52.266927972075955],
  [20.957595033280743, 52.26712068328835],
  [20.957577637116344, 52.26723607840504],
  [20.957537312295642, 52.26727740707875],
  [20.957761257322645, 52.267475395339645],
  [20.957815144778245, 52.267498512850246],
  [20.95787439818555, 52.26757897342375],
  [20.95791009640075, 52.26762769918745],
  [20.957907153084747, 52.26764189861664],
  [20.95793322389554, 52.26764721061744],
  [20.957923228478144, 52.26768297788135],
  [20.957920876497745, 52.26769151530553],
  [20.95793214246805, 52.26770634893295],
  [20.958270099249148, 52.26802283392484],
  [20.958467694144947, 52.26820795917454],
  [20.95868797775235, 52.26841439274213],
  [20.958872967069347, 52.268452385171436],
  [20.95894444363255, 52.26846705990273],
  [20.95915945950805, 52.268511173916025],
  [20.961263616641954, 52.27015734038829],
  [20.96171409893366, 52.27050980280468],
  [20.96176841477846, 52.27055251169258],
  [20.961853359918262, 52.27058354652189],
  [

In [81]:
warsaw_geojson

{'type': 'FeatureCollection',
 'name': 'warsaw_districts',
 'crs': {'type': 'name',
  'properties': {'name': 'urn:ogc:def:crs:OGC:1.3:CRS84'}},
 'features': [{'type': 'Feature',
   'properties': {'nazwa_dzie': 'Żoliborz',
    'style': {'weight': 1,
     'opacity': 0.9,
     'color': 'black',
     'fillOpacity': 0.7,
     'fillColor': '#d9f0a3'},
    'highlight': {}},
   'geometry': {'type': 'MultiPolygon',
    'coordinates': [[[[20.957550244360345, 52.266927972075955],
       [20.957595033280743, 52.26712068328835],
       [20.957577637116344, 52.26723607840504],
       [20.957537312295642, 52.26727740707875],
       [20.957761257322645, 52.267475395339645],
       [20.957815144778245, 52.267498512850246],
       [20.95787439818555, 52.26757897342375],
       [20.95791009640075, 52.26762769918745],
       [20.957907153084747, 52.26764189861664],
       [20.95793322389554, 52.26764721061744],
       [20.957923228478144, 52.26768297788135],
       [20.957920876497745, 52.26769151530553],

In [None]:
h['correct_district'] = 

In [3]:
%history -g

 2/1:
def read_landsat_images (folder_name):
    file_list = os.listdir(folder_name)
    chanel_list = []
    for f in fiel_list:
        if (f.startswith('LC') and f.endswith('.tif')):
            if  'band' in f:
                chanel_list.append(folder_name +f)
    channel_list.sort()
    channel_numbers = np.arrange(1,8)
    bands_dictionary = dict(zip(channel_numbers, channel_list))
    return bands_directory

#test
satelite_images = read_landsat_images('LC081900232018080301T1-SC20181218151707')
for band in satelite_images:
    print(band, satelite_images[band])
 2/2:
import os
import numpy as np
import rasterio as rio
import rasterio.mask as rmask
import fiona as fio
import matplotlib as plt
 2/3:
# Kod wpisujemy normalnie w komórkach. Shift + Enter realizuje kod w danej komórce.
# Komenda poniżej obowiązkowa w celu poprawnego wyświetlania obrazów.
%matplotlib notebook
 2/4:
def read_landsat_images (folder_name):
    file_list = os.listdir(folder_name)
    chanel_list = []
    f

22/8: ufo_df['Date_occured_len']=ufo_df['Date_occured'].astype(str).str.len()
22/9: ufo_df['Date_reported_len']=ufo_df['Date_reported'].astype(str).str.len()
22/10: ufo_df.head()
22/11: bad_date_df = ufo_df.loc[(ufo_df['Date_occured_len'] != 8) | (ufo_df['Date_reported_len'] != 8)]
22/12: bad_date_df.sort_values(by=['Date_occured_len'])
22/13: bad_date_df.info()
22/14: ufo_df[ufo_df['Date_occured_len'] != 8].count()['Date_occured_len']
22/15: ufo_df[ufo_df['Date_reported_len'] != 8].count()['Date_reported_len']
22/16: ufo_df.drop(ufo_df[ufo_df.Date_occured_len !=8].index, inplace=True)
22/17: pd.to_datetime(ufo_df.loc[0]['Date_occured'],format='%Y%m%d')
22/18: ufo_df['zerodate'] = ufo_df['Date_occured'].astype(str).str.match(r'....00')
22/19: ufo_df.loc[ufo_df['zerodate'] == True]
22/20: ufo_df.drop(ufo_df.loc[ufo_df['zerodate'] == True].index, inplace=True)
22/21: ufo_df.sort_values(by=['Date_occured']).head()
22/22: ufo_df.drop(40901, inplace=True)
22/23: ufo_df['Date_occured']=pd.to

26/28: pd.__dict__['DataFrame']
26/29: pd.__dict__['DataFrame'].__dict__
26/30: dir(pd.DataFrame)
26/31: hwg_df.quantile(q=[x*0.2 for x in range(0,11)])
26/32: import pandas as pd
26/33: heights_weights_gender_data = '/users/wioletanytko/documents/workspace/r/umapro/02-Exploration/data/01_heights_weights_genders.csv'
26/34:
hwg_df = pd.read_csv(
    heights_weights_gender_data)
26/35: hwg_df.head()
26/36: hwg_df.Height =hwg_df.Height * 2.54
26/37: hwg_df.Weight = hwg_df.Weight * 0.454
26/38: hwg_df.describe()
26/39: hwg_df.quantile(q=[x*0.2 for x in range(0,11)])
26/40: 'DataFrame' in pd.__dict__
26/41: hwg_df.describe()
26/42: hwg_df.quantile(q=[x*0.2 for x in range(0,11)])
26/43: hwg_df.quantile(q=[x*0.2 for x in range(0,6)])
26/44: hwg_df.quantile(q=0.25), hwg_df.quantile(q=0.75)
26/45: hwg_df.Height.quantile(q=0.25), hwg_df.Height.quantile(q=0.75)
26/46: hwg_df.Height.quantile(q=0.05), hwg_df.Height.quantile(q=0.95)
26/47: help(avg)
26/48:
def my_var(x):
    m = sum(x)/len(x)
    r

34/93: sorted(all_spam, key=str.len())
34/94: sorted(all_spam, key=len(str))
34/95: sorted(all_spam, key=lambda x: len(x))
34/96: sorted(test_corpus, key=lambda x: len(x))
34/97: sorted(test_corpus, key=lambda x: len(x), reverse=True)
34/98: [x in test_corpus if len(x)>20]
34/99: [x for x in test_corpus if len(x)>20]
34/100: len([x for x in test_corpus if len(x)>20])
34/101: len(test_corpus)
34/102:
def build_corpus(messages_vector):
    corpus = set()
    for message in messages_vector:
        message_words = [x for x in re.findall(r'\w+', message) if len(x)<20 and not any(i.isdigit() for i in x)] 
        #print(message_words)
        corpus = corpus | set(message_words)
    return list(corpus)
34/103: test_corpus = build_corpus(all_spam)
34/104: print(test_corpus)
34/105: len(test_corpus)
34/106:
for i, message in enumerate(all_spam):
    occurances = count_words(message, test_corpus)
    for key in occurances.keys():
        matrix.at[key,i] = occurances[key]
34/107: matrix.head(5

41/47: classify_email([all_spam[7]])
41/48:
def classify_email(email, prior=0.5, c=0.2):
    matrix = create_matrix(email)
    #print(matrix.head(10))
    common_words = set(matrix_spam.index).intersection(set(matrix.index))
    print(common_words)
    number_of_words = len(re.findall(r'\w+', email[0]))
    number_of_common_words = len(common_words)
    print()
    number_of_uncommon_words = number_of_words - number_of_common_words
    if number_of_common_words < 1:
        return prior*c**number_of_words
    else:
        probability_common = np.prod([matrix_spam.at[word,'documents_percentage'] for word in common_words])
        print('probability_common: ', probability_common)
        probability_uncommon = c**number_of_uncommon_words
        print('probability_uncommon: ', probability_uncommon)
        return(prior*probability_common*probability_uncommon)
41/49: classify_email([all_spam[7]])
41/50: classify_email([all_spam[70]])
41/51:
def classify_email(email, prior=0.5, c=0.3):
  

def count_words(message):
    #words_in_message =  set(re.findall(r'\w+', message))
    #words_in_message = [word.lower() for word in words_in_message if len(word)>2 and not word.isnumeric() and word.lower() not in stop_words]
    #occurances = Counter(words_in_message)
    occurances = set([word.lower for word in word_tokenize(message) if len(word)>2 and not word.isnumeric and word.lower() not in stop_words])
    return dict.fromkeys(occurances, 1)
50/37: matrix_spam = create_matrix(all_spam)
50/38: matrix_ham = create_matrix(all_ham)
50/39:
def classify_email(email,matrix_corpus, prior=0.5, c=0.01, ):
    matrix_email = create_matrix(email)

    common_words = set(matrix_corpus.index).intersection(set(matrix_email.index))#
#    print(common_words)
    
    number_of_words = len(set(re.findall(r'\w+', email[0])))
    number_of_common_words = len(common_words)
    number_of_uncommon_words = number_of_words - number_of_common_words
    
    if number_of_common_words < 1:
        return 

53/14: matrix_spam = create_corpus_matrix(all_spam)
53/15:
def create_corpus_matrix(messages):
    dicts = [count_words(message) for message in messages]
    matrix = pd.DataFrame(dicts)
   # matrix = matrix.transpose()
    
    matrix.loc['summa']=matrix.sum()
   # matrix=matrix.loc[matrix.loc['summa']>1 ]
    
   # matrix.loc['number_of_documents']=(matrix.count()-1)
   # matrix.loc['documents_percentage']=matrix.loc['number_of_documents']/len(messages)
    #matrix.loc['word_density'] = matrix.loc['sum']/matrix.loc['sum'].sum()
    
    return matrix
53/16: matrix_spam = create_corpus_matrix(all_spam)
53/17: %time matrix_spam = create_corpus_matrix(all_spam)
53/18: all_ham = get_messages(easyham_path)[:500]
53/19: matrix_ham = create_corpus_matrix(all_ham)
53/20:
def classify_email(email,matrix_corpus, prior=0.5, c=0.01, ):
    matrix_email = create_message_matrix(email)

    common_words = set(matrix_corpus.index).intersection(set(matrix_email.index))#
#    print(common_words)
    


for email in from_lines[:20]:
    y = parseaddr(email)
    x = re.findall(r"(\S+@\S+)", email[1])
    print(y)
61/49:
for email in from_lines[:20]:
    y = parseaddr(email[1])
    x = re.findall(r"(\S+@\S+)", email[1])
    print(y)
61/50:
for email in from_lines[:20]:
    y = parseaddr(email[1])
    print(y[1])
61/51:
email_addresses = []
for email in from_lines[:20]:
    parser = parseaddr(email[1])
    email_addresses.append[parser[1]]
61/52:
email_addresses = []
for email in from_lines[:20]:
    parser = parseaddr(email[1])
    email_addresses.append(parser[1])
61/53: email_addresses[:20]
61/54: len(email_addresses)
61/55:
email_addresses = []
for email in from_lines[]:
    parser = parseaddr(email[1])
    email_addresses.append(parser[1])
61/56:
email_addresses = []
for email in from_lines:
    parser = parseaddr(email[1])
    email_addresses.append(parser[1])
61/57: len(email_addresses)
63/1:
import pandas as pd
import matplotlib.pyplot as plt
import os
import re

from email.utils

79/58: thread_number_of_messages_df = pd.DataFrame({'Number_of_messages': thread_number_of_messages})
79/59: df = emails_threads.merge(thread_number_of_messages_df, on='thread_subject', how='left')
79/60: df
79/61: df.head()
79/62: emails_threads = emails_threads.merge(thread_number_of_messages_df, on='thread_subject', how='left')
79/63: emails_threads.head()
79/64: emails_threads.thread_density = emails_threads.Number_of_messages/emails_threads.thread_time
79/65: emails_threads.describe()
79/66: emails_threads.columns
79/67: emails_threads.dtypes
79/68: emails_threads.thread_density = emails_threads.Number_of_messages/emails_threads.thread_time.seconds
79/69: emails_threads.thread_density = emails_threads.Number_of_messages/emails_threads.thread_time
79/70: emails_threads.thread_density.min()
79/71: emails_threads.thread_density.max()
79/72: emails_threads.thread_time.min()
79/73: emails_threads.thread_time.max()
79/74: emails_threads.sortby(by='thread_time').head()
79/75: emails_thre

85/244: len(all_emails_training)
85/245: grouped_from = all_emails_training.groupby(by=['From']).count().sort_values(by=['Message'])
85/246:
grouped_from['Weight'] = np.log(grouped_from['Message']+1)
grouped_from = grouped_from.Weight
85/247:
#plt.plot(grouped_from.index, grouped_from.Weight, 'or')
#plt.plot(grouped_from.index, grouped_from.Message, 'ob')
#plt.rcParams['figure.figsize']=(24,16)
85/248: all_emails_training['is_thread'] = all_emails_training.Subject.str.contains('re\[{0,1}[1-9]{0,1}[1-9]{0,1}\]{0,1}:')
85/249:
def get_threads(emails_df):
    threads_subjects=[]
    for subject,isthread in zip(emails_df.Subject, emails_df.is_thread):
        if isthread:
            thread_subject = subject.split(':',1)[1]
            thread_subject = thread_subject.lstrip()
            threads_subjects.append(thread_subject)
        else:
            threads_subjects.append('x')
    
    print(threads_subjects[:10])
    return threads_subjects
85/250: threads_subjects = get_threads(all_e

90/75:
def count_words(message):
    occurances = set([word.lower() for word in word_tokenize(message) if len(word)>2 and len(word)<15 and word.isalpha() and word.lower() not in stop_words])
    result = dict.fromkeys(occurances, 1)
    return result
90/76:
def make_thread_tdm(messages):
    dicts = [count_words(message) for message in messages]
    #print(dicts)
    matrix = pd.DataFrame(dicts)
    
    matrix.loc['summa']=matrix.sum()
    
    matrix.loc['number_of_documents']=(matrix.count()-1)
    matrix.loc['documents_percentage']=matrix.loc['number_of_documents']/len(messages)
    
    return matrix
90/77: thread_tdm = make_thread_tdm(all_emails_training[all_emails_training.is_thread==True].Message[:10])
90/78: thread_tdm.head(10)
90/79: thread_tdm.head()
90/80: thread_tdm.head(20)
90/81: thread_tdm = make_thread_tdm(all_emails_training[all_emails_training.is_thread==True].Message)
90/82: thread_tdm.head(20)
90/83: thread_tdm.tail(20)
91/1:
import pandas as pd
import matplotlib.p

95/27:
emails_threads = all_emails_training[all_emails_training.is_thread==True]
emails_threads.drop(['Message', 'is_thread'], axis=1, inplace=True)
95/28: emails_threads_senders = emails_threads.groupby(by=['From']).count().sort_values(by=['Subject'])
95/29: #emails_threads_senders.tail()
95/30:
emails_threads_senders['Weight'] = np.log(emails_threads_senders['Subject']+1)
emails_threads_senders = emails_threads_senders.Weight
95/31: grouped_by_thread_subject = emails_threads.groupby(by=['thread_subject'])
95/32: minimum_send_datetime = grouped_by_thread_subject['Send_datetime'].transform(min)
95/33: maximum_send_datetime = grouped_by_thread_subject['Send_datetime'].transform(max)
95/34: emails_threads['send_min'] = minimum_send_datetime
95/35: emails_threads['send_max'] = maximum_send_datetime
95/36: emails_threads['thread_time'] = (emails_threads.send_max - emails_threads.send_min).dt.total_seconds()
95/37: thread_number_of_messages = grouped_by_thread_subject.size()
95/38: thread_n

100/48:
def make_tdm(messages):
    dicts = [count_words(message) for message in messages]
    matrix = pd.DataFrame(dicts)
    
    matrix.loc['summa']=matrix.sum()
    
    matrix.loc['number_of_documents']=(matrix.count()-1)
    matrix.loc['documents_percentage']=matrix.loc['number_of_documents']/len(messages)
    matrix.loc['weight'] = np.log(matrix.loc['documents_percentage'])+10
    
    return matrix
100/49: thread_tdm = make_tdm(all_emails_training[all_emails_training.is_thread==True].Message)
100/50: thread_tdm.tail()
100/51: thread_tdm.get(['weight']['linux'])
100/52: #thread_tdm.get(['weight']['linux'])
100/53:
emails_tdm = make_tdm(all_emails_training.Message)
emails_tdm.loc['weight'] = emails_tdm.loc['weight']/2
emails_tdm.tail()
100/54:
#grouped_from.tail(20 -social activity measure
#emails_threads_senders.tail() -thread senders activity measure
#emails_threads.thread_density -thread activity measure
#thread_tdm.weight -words in active threads mesure
#emails_tdm.weight -w

100/411: all_emails_training = all_emails_data.loc[:1250]
100/412: len(all_emails_training)
100/413: grouped_from = all_emails_training.groupby(by=['From']).count().sort_values(by=['Message'])
100/414:
grouped_from['Weight'] = np.log(grouped_from['Message']+1)
grouped_from = grouped_from.Weight
100/415:
#plt.plot(grouped_from.index, grouped_from.Weight, 'or')
#plt.plot(grouped_from.index, grouped_from.Message, 'ob')
#plt.rcParams['figure.figsize']=(24,16)
100/416: all_emails_training['is_thread'] = all_emails_training.Subject.str.contains('re\[{0,1}[1-9]{0,1}[1-9]{0,1}\]{0,1}:')
100/417:
def get_threads_subjects(emails_df):
    threads_subjects=[]
    for subject,isthread in zip(emails_df.Subject, emails_df.is_thread):
        if isthread:
            thread_subject = subject.split(':',1)[1]
            thread_subject = thread_subject.lstrip()
            threads_subjects.append(thread_subject)
        else:
            threads_subjects.append('x')
    
    return threads_subjects
100/

106/18: smokers = longevity.loc[longevity.Smokes==1]-smokers_guess
106/19:
smokers = longevity.loc[longevity.Smokes==1]-smokers_guess
smokers
106/20:
smokers = longevity.loc[longevity.Smokes==1].AgeAtDeath-smokers_guess
smokers
106/21:
smokers = (longevity.loc[longevity.Smokes==1].AgeAtDeath-smokers_guess)**2
smokers
106/22:
smokers = (longevity.loc[longevity.Smokes==1].AgeAtDeath-smokers_guess)**2
no_smokers = (longevity.loc[longevity.Smokes==0].AgeAtDeath-no_smokers_guess)**2
106/23: smokers.append(no_smokers)
106/24: longevity.describe()
106/25: smokers.append(no_smokers).mean().**(0.5)
106/26: smokers.append(no_smokers).mean()**(0.5)
107/1:
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.optimize import curve_fit
import numpy as np

import statsmodels.api as sm

%matplotlib inline
107/2:
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.optimize import curve_fit
import numpy as np

import statsmodel

117/12:
#regression with statsmodels
x = log(top_sites.UniqueVisitors)
y = log(top_sites.PageViews)

x = sm.add_constant(x)

model = sm.OLS(y,x).fit()

y_predictions = model.predict(x)
rmse = rmse(y, y_predictions)
print('rmse: ', rmse)
model.summary()
117/13:
minmax_2 = sm.add_constant(minmax)
predictions = model.predict(minmax_2)
predictions
117/14: logs = log(top_sites[['UniqueVisitors', 'PageViews']]).join(top_sites.InEnglish).join(top_sites.HasAdvertising)
117/15: z = sns.lmplot(x='UniqueVisitors', y='PageViews',hue='InEnglish', data=logs)
117/16:
#multiple regression model with statsmodels -formula interface
x = logs[['UniqueVisitors', 'InEnglish']]
y = logs['PageViews']

model = smf.ols(formula='PageViews ~ UniqueVisitors + InEnglish + HasAdvertising', data=logs).fit()

model.summary()
117/17: logs.head()
117/18:
#linear regression with scikit-learn
y_sk = logs.PageViews.values.reshape(-1,1)
x_sk = logs.UniqueVisitors.values.reshape(-1,1)
lm_sk = linear_model.LinearRegression()


122/289: normal_distribution = np.random.normal(0, 5, len(x))
122/290: y = 1 - x**2 + normal_distribution
122/291:
plt.scatter(x,y)
m,c = np.polyfit(x,y, deg=1)
plt.plot(x, m*x+c, color='red')
plt.grid()
122/292: x_squared = x**2
122/293: plt.scatter(x_squared, y)
122/294:
m,c = np.polyfit(x_squared, y, deg=1)
y_model = x_squared*m +c
r = np.corrcoef(x_squared, y)
print('r matrix: ', r)
print('m: ', m, ', c: ', c)
122/295: r**2
122/296: df = pd.DataFrame({'x_squared': x_squared, 'y': y})
122/297: df.head()
122/298:
model = smf.ols(formula='y ~ x_squared', data=df).fit()
model.summary()
122/299:
my_r2 = (((y_model - y.mean())**2).sum()) / (((y - y.mean())**2).sum())
my_r2
122/300: x = pd.Series(np.arange(0,1,0.01))
122/301: y = np.sin(2*np.pi*x) + np.random.normal(0,0.1,len(x))
122/302:
fig, ax = plt.subplots(figsize=(12,6))
ax.set_xlabel('x')
ax.set_ylabel('y')
line = ax.scatter(x,y)
line.set_label('signal')
ax.legend()
ax.grid()
122/303:
m,c = np.polyfit(x,y, deg=1)
print('Coeff and i

128/8: x_squared = x**2
128/9: plt.scatter(x_squared, y)
128/10:
m,c = np.polyfit(x_squared, y, deg=1)
y_model = x_squared*m +c
r = np.corrcoef(x_squared, y)
print('r matrix: ', r)
print('m: ', m, ', c: ', c)
128/11: r**2
128/12: df = pd.DataFrame({'x_squared': x_squared, 'y': y})
128/13: df.head()
128/14:
model = smf.ols(formula='y ~ x_squared', data=df).fit()
model.summary()
128/15:
my_r2 = (((y_model - y.mean())**2).sum()) / (((y - y.mean())**2).sum())
my_r2
128/16: x = pd.Series(np.arange(0,1,0.001))
128/17: y = np.sin(2*np.pi*x) + np.random.normal(0,0.1,len(x))
128/18:
fig, ax = plt.subplots(figsize=(12,6))
ax.set_xlabel('x')
ax.set_ylabel('y')
line = ax.scatter(x,y)
line.set_label('signal')
ax.legend()
ax.grid()
128/19:
m,c = np.polyfit(x,y, deg=1)
print('Coeff and intercept for np model: ',m,c)
128/20:
line, = ax.plot(x,x*m+c, color='red')
line.set_label('linear regression')
ax.legend()
fig
128/21: sk_model = linear_model.LinearRegression()
128/22:
y_sk = y.values.reshape(-1,1)


141/61: df_postal_codes = df_list[0]
141/62: df_postal_code
141/63: df_postal_codes
141/64: >This method return *list*, and correct dataframe is it's first element
141/65: df_postal_codes.head()
141/66: df_list = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
141/67: df_postal_codes = df_list[0]
141/68: df.postal_codes.isnull().sum()
141/69: df_postal_codes.isnull().sum()
141/70: df_postal_codes.isnull().count()
141/71: df_postal_codes.isnull().sum()
141/72: df_postal_codes['Neighborhood']
141/73: df_postal_codes['Neighborhood']=='Not assigned'
141/74: dfdf_[postal_codes['Neighborhood']=='Not assigned']
141/75: df_postal_codes[df_postal_codes['Neighborhood']=='Not assigned']
141/76: df_postal_codes[df_postal_codes['Neighborhood']=='Not assigned']
141/77: df_postal_codes[df_postal_codes['Neighborhood']=='Not assigned'].count()
141/78: df_postal_codes = df_postal_codes[~(df_postal_codes['Borough']=='Not assigned')]
141/79: df_postal_codes.isnull().sum()
1

144/97:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped
144/98: toronto_onehot['Neighborhood']
144/99:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
toronto_onehot.columns
144/100: toronto_venues.head()
144/101:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
toronto_onehot['Neighborhood']
144/102: toronto_venues[['Venue Category']]
144/103: 'Neighborhood'.isin(toronto_venues[['Venue Category']])
144/104: toronto_venues[['Venue Category']]
144/105: toronto_venues[['Venue Category']].loc[toronto_venues[['Venue Category']]=='Neighborhood']
144/106: toronto_venues[['Venue Category']]
144/107: type(toronto_venues[['Venue Category']])
144/108: toronto_venues[['Venue Category']]=='Neighborhood'
144/109: toronto_venues[['Venue Category']].loc[toronto_venues[['Venue Category']]=='Neighborhood']
144/110: toronto_venues[

145/156:
df = pd.DataFrame([districts_names, test_density]).T
df.columns = ['nazwa_dzie', 'Density']
145/157:
warsaw_coordinates = [52.2297700, 21.0117800]
warsaw_map = folium.Map(location=warsaw_coordinates, zoom_start=11)
warsaw_map.choropleth(
    geo_data=warsaw_geojson,
    name='choropleth',
    data=df,
    columns=['nazwa_dzie', 'Density'],
    key_on='feature.properties.nazwa_dzie',
    fill_color='YlGn',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Density'
)
warsaw_map
145/158: warsaw_geofile = 'geodata/warsaw_districts/warsaw_districts.geojson'
145/159:
with open(warsaw_geofile) as w:
    warsaw_geojson = json.load(w)
145/160:
districts_names = []
for x in warsaw_geojson['features']:
    districts_names.append(x['properties']['nazwa_dzie'])
145/161: test_density = list(range(len(districts_names)))
145/162:
df = pd.DataFrame([districts_names, test_density]).T
df.columns = ['nazwa_dzie', 'Density']
145/163:
warsaw_coordinates = [52.2297700, 21.0117800]
warsaw_

148/7:
districts_names = []
for x in warsaw_geojson['features']:
    districts_names.append(x['properties']['nazwa_dzie'])
148/8: test_density = list(range(len(districts_names)))
148/9:
df = pd.DataFrame([districts_names, test_density]).T
df.columns = ['nazwa_dzie', 'Density']
148/10:
warsaw_coordinates = [52.2297700, 21.0117800]
warsaw_map = folium.Map(location=warsaw_coordinates, zoom_start=11)
warsaw_map.choropleth(
    geo_data=warsaw_geojson,
    name='choropleth',
    data=df,
    columns=['nazwa_dzie', 'Density'],
    key_on='feature.properties.nazwa_dzie',
    fill_color='YlGn',
    fill_opacity=0.7,
    line_opacity=0.9,
    legend_name='Density'
)
#warsaw_map
148/11:
cracow_coordinates = [50.06143, 19.93658]
cracow_map = folium.Map(location=cracow_coordinates, zoom_start=11)

folium.GeoJson(
    cracow_geojson,
    name='geojson'
).add_to(cracow_map)
cracow_map
148/12:
CLIENT_ID = os.environ.get('FOURSQUAREID')
CLIENT_SECRET = os.environ.get('FOURSQUARESECRET')
VERSION = '202

warsaw_districts_venues = {}
for district in warsaw_district_centers.index:
    print(district)
    warsaw_districts_venues[district] = get_venues(warsaw_district_centers.loc[district, 'District_center'])
150/106:
def get_district_centers(city_geojson):
    district_centers = {}
    for district in city_geojson['features']:
        district_geometry = pd.DataFrame(
            district['geometry']['coordinates'][0][0],
            columns=['longitude', 'latitude']
        )
        district_center = [[district_geometry['latitude'].mean(), district_geometry['longitude'].mean()]]
        district_centers[district['properties']['nazwa_dzie']]=district_center
    
    district_centers = pd.DataFrame.from_dict(district_centers,
                                              orient='index',
                                              columns=['District_center'])
    
    return district_centers
150/107:
def get_districts_venues(city_districts_venues):
    all_city_venues= pd.DataFrame()
   

In [None]:
def pin_venues(venues_df,city_map):
    for i in venues_df.index:
        folium.Marker([venues_df.loc[i,'Lat'], venues_df.loc[i,'Lon']]).add_to(city_map)

In [None]:
def calculate_radius(district_name):
    center_coords = warsaw_district_centers.loc[district_name, 'District_center']
    center_point = Point(center_coords[1], center_coords[0])
    
    polygon_points = []
    for point_coords in warsaw_districts_polygons [district_name]:
        polygon_point = Point(point_coords[0], point_coords[1])
        polygon_points.append(polygon_point)
    
    distances = []
    for point in polygon_points:
        distance_to_center = center_point.distance(point)
        distances.append(distance_to_center)
        
    max_distance = max(distances)
    factor = 0.6157*111.3*1000
    max_distance_meters = int(max_distance*factor)
    
    return max_distance_meters

In [None]:
warsaw_districts_radiuses = {}
for district in warsaw_district_centers.index:
    warsaw_districts_radiuses[district] = calculate_radius(district)

In [None]:
def get_venues(district_center, radius):
    RADIUS = radius
    lat, lng = district_center
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            RADIUS, 
            LIMIT)
    result = requests.get(url).json()
    
    venues = result['response']['groups'][0]['items']
    
    total_results = result['response']['totalResults']
    print('\tTotal results: ', total_results, '\n')
    
    #checking if there is more results -if true, next request with offset is send
    requests_to_perform = total_results//100
    
    for _ in range(requests_to_perform):
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}&offset={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            RADIUS, 
            LIMIT,
            OFFSET
        )
        result = requests.get(url).json()
        venues.extend(result['response']['groups'][0]['items'])
        
    return venues

In [None]:
warsaw_districts_venues_foursquare = {}
for district in warsaw_district_centers.index:
    print(district)
    warsaw_districts_venues_foursquare[district] = get_venues(
        warsaw_district_centers.loc[district, 'District_center'],
        warsaw_districts_radiuses[district])

In [None]:
def create_districts_polygons(geojson):
    polygons = {}
    for district in geojson['features']:
        polygons[district['properties']['nazwa_dzie']]=district['geometry']['coordinates'][0][0]
    return polygons

In [None]:
warsaw_districts_polygons = create_districts_polygons(warsaw_geojson)

In [None]:
def check_if_inside_district(venue_coords, district_shape):
    p = Point(venue_coords)
    poly = Polygon(district_shape)
    return p.within(poly)

In [None]:
warsaw_districts_venues.reset_index(inplace=True)

In [None]:
for i in warsaw_districts_venues.index:
    warsaw_districts_venues.loc[i, 'Inside'] = check_if_inside_district(
                                        [warsaw_districts_venues.loc[i,'Lon'], warsaw_districts_venues.loc[i,'Lat']],
                                        warsaw_districts_polygons[warsaw_districts_venues.loc[i,'District']])

In [None]:
warsaw_districts_venues = warsaw_districts_venues.loc[warsaw_districts_venues['Inside']==True]

In [None]:
warsaw_districts_venues = get_districts_venues(warsaw_districts_venues_foursquare)

In [None]:
warsaw_top5 = warsaw_districts_venues.groupby('Category').count().sort_values(by='VenueId', ascending=False).head()

In [None]:
def extract_data(district,district_venues_foursquare):
    district_venues=[]
    for item in district_venues_foursquare:
        x={}
        x['District']=district
        x['Name']=item['venue']['name']
        x['Category']=item['venue']['categories'][0]['name']
        x['Lat'] = item['venue']['location']['lat']
        x['Lon'] = item['venue']['location']['lng']
        x['VenueId'] = item['venue']['id']
        district_venues.append(x)
    district_df = pd.DataFrame(district_venues)
    return district_df

In [None]:
def get_districts_venues(city_districts_venues):
    all_city_venues= pd.DataFrame()
    for district in city_districts_venues.keys():
        district_df = extract_data(district, city_districts_venues[district])
        district_df['District'] = district
        all_city_venues = all_city_venues.append(district_df)
    return all_city_venues

In [None]:
def check_if_inside_district(venue_coords, district_shape):
    p = Point(venue_coords)
    poly = Polygon(district_shape)
    return p.within(poly)

In [None]:
cracow_districts_names = []
for x in cracow_geojson['features']:
    cracow_districts_names.append(x['properties']['nazwa'])

In [None]:
cracow_test_density = list(range(len(districts_names)))
df = pd.DataFrame([cracow_districts_names, test_density]).T
df.columns = ['nazwa', 'Density']

In [None]:
def create_cracow_map():    
    cracow_coordinates = [50.06143, 19.93658]
    cracow_map = folium.Map(location=cracow_coordinates, zoom_start=11)
    cracow_map.choropleth(
        geo_data=cracow_geojson,
        name='choropleth',
        data=df,
        columns=['nazwa', 'Density'],
        key_on='feature.properties.nazwa',
        fill_color='YlGn',
        fill_opacity=0.7,
        line_opacity=0.9,
        legend_name='Density'
    )
    return cracow_map
150/488:
cracow_map = create_cracow_map()

In [None]:
def create_cracow_polygons(geojson):
    polygons = {}
    for district in geojson['features']:
        polygons[district['properties']['nazwa']]=district['geometry']['coordinates'][0][0]
    return polygons

In [None]:
def get_cracow_centers(city_geojson):
    district_centers = {}
    for district in city_geojson['features']:
        district_geometry = pd.DataFrame(
            district['geometry']['coordinates'][0][0],
            columns=['longitude', 'latitude']
        )
        district_center = [[district_geometry['latitude'].mean(), district_geometry['longitude'].mean()]]
        district_centers[district['properties']['nazwa']]=district_center
    
    district_centers = pd.DataFrame.from_dict(district_centers,
                                              orient='index',
                                              columns=['District_center'])
    
    return district_centers
cracow_districts_centers = get_cracow_centers(cracow_geojson)

In [None]:
cracow_districts_polygons = create_cracow_polygons(cracow_geojson)

In [None]:
def calculate_cracow_radius(district_name):
    center_coords = cracow_districts_centers.loc[district_name, 'District_center']
    center_point = Point(center_coords[1], center_coords[0])
    
    polygon_points = []
    for point_coords in cracow_districts_polygons [district_name]:
        polygon_point = Point(point_coords[0], point_coords[1])
        polygon_points.append(polygon_point)
    
    distances = []
    for point in polygon_points:
        distance_to_center = center_point.distance(point)
        distances.append(distance_to_center)
        
    max_distance = max(distances)
    factor = 0.6157*111.3*1000
    max_distance_meters = int(max_distance*factor)
    
    return max_distance_meters

In [None]:
cracow_districts_radiuses = {}
for district in cracow_districts_centers.index:
    cracow_districts_radiuses[district] = calculate_cracow_radius(district)

In [None]:
cracow_districts_venues_foursquare = {}
for district in cracow_districts_centers.index:
    print(district)
    cracow_districts_venues_foursquare[district] = get_venues(
        cracow_districts_centers.loc[district, 'District_center'],
        cracow_districts_radiuses[district])

In [None]:
cracow_districts_venues = get_districts_venues(cracow_districts_venues_foursquare)
cracow_districts_venues.reset_index(inplace=True)

In [None]:
for i in cracow_districts_venues.index:
    cracow_districts_venues.loc[i, 'Inside'] = check_if_inside_district(
                                        [cracow_districts_venues.iloc[i]['Lon'], cracow_districts_venues.loc[i]['Lat']],
                                        cracow_districts_polygons[cracow_districts_venues.iloc[i]['District']])

In [None]:
cracow_districts_venues = cracow_districts_venues.loc[cracow_districts_venues['Inside']==True]
cracow_top5 = cracow_districts_venues.groupby('Category').count().sort_values(by='VenueId', ascending=False).head()

In [None]:
cracow_map = create_cracow_map()
for i in cracow_districts_venues.index:    
    folium.Marker([cracow_districts_venues.loc[i,'Lat'], cracow_districts_venues.loc[i,'Lon']]).add_to(cracow_map)

In [None]:
top_categories = warsaw_top5.add(cracow_top5, fill_value=0)

In [None]:
top_5_categories = top_categories.head().index
top_5_categories

In [None]:
warsaw_districts_wiki = 'https://pl.wikipedia.org/wiki/Podział_administracyjny_Warszawy'
cracow_districts_wiki = 'https://pl.wikipedia.org/wiki/Podział_administracyjny_Krakowa'

In [None]:
warsaw_districts_df = pd.read_html(warsaw_districts_wiki)