In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request as ur
import os
import shutil

# Working directory of the notebook, kept as a plain string. Later cells slice
# off its last four characters (Path[:-4]) before appending '/data/...'.
Path = os.getcwd()

### Collecting Twitter data using the MINET library
- We use two hashtags as queries and collect all tweets associated with each of them.
- Although we set the limit to 30k tweets per query, only \~19k and \~6k tweets are available for the hashtags "#tierslieux" and "#tierslieu" respectively.

In [2]:
# Scrape tweets matching "#tierslieux" with minet (capped at 30000) and redirect the output to a CSV file.
!minet tw scrape tweets "#tierslieux" --limit 30000 > tweets_tierslieux_30000.csv

# Same scrape for the singular form of the hashtag, "#tierslieu".
!minet tw scrape tweets "#tierslieu" --limit 30000 > tweets_tierslieu_30000.csv

Searching for "#tierslieux"                                                     
Collecting tweet:  63%|▋| 19033/30000 [13:41<07:53, 23.16tweet/s, queries=1, tok                                                             
Searching for "#tierslieu"                                                      
Collecting tweet:  20%|▏| 6066/30000 [04:34<18:03, 22.09tweet/s, queries=1, toke


In [3]:
# Move the two scraped CSV files from the working directory into the project's
# data folder, which sits next to the notebook's own directory.
# os.path.dirname(Path) replaces the original Path[:-4] slice, which only pointed
# at the parent folder when the notebook directory name was exactly three
# characters long.
DATA_DIR = os.path.join(os.path.dirname(Path), 'data')
os.makedirs(DATA_DIR, exist_ok=True)  # shutil.move fails if the target folder is missing

for csv_name in ("tweets_tierslieux_30000.csv", "tweets_tierslieu_30000.csv"):
    moved_to = shutil.move(csv_name, os.path.join(DATA_DIR, csv_name))

moved_to  # show the destination of the last moved file, as the original cell did

'/home/onkar/OneDrive/Courses/Digital_Spaces/Project(1)/tierslieux_ddps/data/tweets_tierslieu_30000.csv'

In [4]:
#### Combining the two twitter datasets
# Resolve the data folder as a sibling of the notebook's directory. The original
# Path[:-4] slice only worked when that directory name was three characters long.
DATA_DIR = os.path.join(os.path.dirname(Path), 'data')

TWEET_COLUMNS = ['id', 'text', 'hashtags', 'timestamp_utc']
QUERY_HASHTAGS = {'tierslieux', 'tierslieu'}  # the two hashtags used as scrape queries


def _prepare_tweets(raw_tweets):
    """Keep the columns of interest and add year/month/day derived from timestamp_utc.

    Parameters
    ----------
    raw_tweets : pandas.DataFrame
        One scraped tweet file; must contain the columns in TWEET_COLUMNS.

    Returns
    -------
    pandas.DataFrame
        A new frame with TWEET_COLUMNS plus 'year', 'month' and 'day'.
    """
    # timestamp_utc is interpreted as seconds since the Unix epoch.
    tweet_dates = pd.to_datetime(raw_tweets['timestamp_utc'], unit='s')
    return raw_tweets[TWEET_COLUMNS].assign(
        year=tweet_dates.dt.year,
        month=tweet_dates.dt.month,
        day=tweet_dates.dt.day,
    )


def _strip_query_hashtags(hashtags):
    """Split a '|'-separated hashtag string into a list without the query hashtags.

    Anything that is not a string (such as the NaN pandas stores for an empty
    CSV field) yields an empty list instead of raising AttributeError.
    """
    if not isinstance(hashtags, str):
        return []
    return [tag for tag in hashtags.split('|') if tag not in QUERY_HASHTAGS]


df1 = pd.read_csv(os.path.join(DATA_DIR, 'tweets_tierslieux_30000.csv'))
df2 = pd.read_csv(os.path.join(DATA_DIR, 'tweets_tierslieu_30000.csv'))

# Concatenate once instead of growing an initially empty frame inside a loop.
df_combined = pd.concat([_prepare_tweets(df1), _prepare_tweets(df2)], ignore_index=True)

# Duplicates must be dropped while hashtags are still plain strings: the lists
# produced below are unhashable and would break drop_duplicates.
df_combined = df_combined.drop_duplicates(ignore_index=True)
df_combined = df_combined.sort_values(by=['timestamp_utc'], ascending=True, ignore_index=True)

# Replace each hashtag string by a list in one pass, rather than writing the
# elements one by one through a Series taken from the frame.
df_combined['hashtags'] = df_combined['hashtags'].map(_strip_query_hashtags)

df_combined.to_csv(os.path.join(DATA_DIR, 'tweets_combined.csv'))
df_combined

Unnamed: 0,id,text,hashtags,timestamp_utc,year,month,day
0,2408979548344320,"RT @fbon: La ""non bibliothèque"" de Chris Meade...",[],1289409320,2010,11,10
1,4884796241088512,"""Impact des nvx modes de travail [= #teletrava...",[teletravail],1289999600,2010,11,17
2,27334783331729408,#thirdplace #tierslieux #EnUnMot RT @hughpearm...,"[enunmot, thirdplace]",1295352094,2011,1,18
3,36102389715058688,observe les amoureux qui s'bécotent sur les ma...,[],1297442455,2011,2,11
4,43616655615594496,#Tierslieu#Aubervilliers http://sebastienlucas...,[tierslieu#aubervilliers],1299233995,2011,3,4
...,...,...,...,...,...,...,...
24977,1511736088813096969,🌐 Vive la fibre numérique et surtout humaine 😃...,"[coworking, hautdebit]",1649260944,2022,4,6
24978,1511744041024737283,"Le Square, Jardinerie et Tiers-Lieu\nscénograp...","[jardinerie, merchandising, mobilier, scénogra...",1649262840,2022,4,6
24979,1511927011471679490,"🏡💻 Le PETR Pays de #Langres, regroupant 3 Comm...","[langres, teletravail]",1649306464,2022,4,7
24980,1511946875519311873,Échanges intéressants lors de la conférence or...,[],1649311200,2022,4,7


### Collecting website data using the Beautiful Soup library
- We crawl the https://communemesure.fr/app/les-lieux website to get data for all available third places.
- We search the HTML page for elements with specific CSS classes.
- We then go to each individual site page and collect the name, address, geographical coordinates, founding ideas and values carried.

In [5]:
# Crawl the Commune Mesure listing, then every linked place page, and store one
# row per place: name, address, coordinates, founding idea and values carried.
PLACE_COLUMNS = ['Name','Address','Latitude','Longitude','Founding_idea','values carried']
LISTING_URL = 'https://communemesure.fr/app/les-lieux'
# Data folder next to the notebook's directory. os.path.dirname replaces the
# original Path[:-4] slice, which assumed a three-character directory name.
DATA_DIR = os.path.join(os.path.dirname(Path), 'data')


def _fetch_soup(url):
    """Download `url` and parse it, closing the HTTP response afterwards."""
    with ur.urlopen(url) as response:
        # NOTE(review): no parser is named, so bs4 picks the best one installed;
        # pin it (e.g. 'html.parser') once the intended parser is confirmed.
        return BeautifulSoup(response.read())


def _scrape_place(name, place_soup):
    """Extract one place's fields from its detail page.

    Parameters
    ----------
    name : str
        Name of the place, taken from the listing page.
    place_soup : BeautifulSoup
        Parsed detail page of the place.

    Returns
    -------
    dict
        One record keyed by the names in PLACE_COLUMNS.

    Raises
    ------
    TypeError, IndexError, KeyError, ValueError
        If the location banner is missing or does not have the expected layout.
    """
    # Founding idea: apostrophes and CRLF line breaks are removed and the space
    # after a full stop is dropped. A page without this paragraph yields ''.
    founding_tag = place_soup.find('p', attrs={'class':'fontSize1em'})
    founding_idea = ''
    if founding_tag is not None:
        founding_idea = founding_tag.text.replace('\'','').replace('\r\n','').replace('. ','.')

    # Values carried: comma-separated text of every matching <strong> tag.
    value_tags = place_soup.find_all('strong', attrs={'class':'valeurs has-text-primary'})
    values_carried = ','.join(tag.text for tag in value_tags)

    # The fourth child node of the location banner carries the address as its
    # text and the coordinates in a 'geo:<lat>,<lon>' href.
    location_div = place_soup.find('div', attrs={'class':'sous-banner sous-banner-localisation'})
    location_link = list(location_div)[3]
    coord = location_link['href'].replace('geo:','').split(',')

    return {
        'Name': name,
        'Address': location_link.text,
        'Latitude': float(coord[0]),
        'Longitude': float(coord[1]),
        'Founding_idea': founding_idea,
        'values carried': values_carried,
    }


listing_soup = _fetch_soup(LISTING_URL)
main_page = listing_soup.find_all('span', attrs={'class':'title mb-4'})

# Collect plain records and build the frame once, instead of appending rows to
# a DataFrame one at a time.
records = []
for i, mp in enumerate(main_page):
    page_link = mp.find_next()  # element right after the title span; its href is the place page
    records.append(_scrape_place(mp.text, _fetch_soup(page_link['href'])))

    print(i, ' is done')

df = pd.DataFrame(records, columns=PLACE_COLUMNS)
df.to_csv(os.path.join(DATA_DIR, 'commune_mesure_website_data.csv'), sep=',')

0  is done
1  is done
2  is done
3  is done
4  is done
5  is done
6  is done
7  is done
8  is done
9  is done
10  is done
11  is done
12  is done
13  is done
14  is done
15  is done
16  is done
17  is done
18  is done
19  is done
20  is done
21  is done
22  is done
23  is done
24  is done
25  is done
26  is done
27  is done
28  is done
29  is done
30  is done
31  is done
32  is done
33  is done
34  is done
35  is done
36  is done
37  is done
