In [8]:
import json
from unidecode import unidecode
import re
from collections import Counter

### all regions of italy in english, italian, french, spanish, german, dutch
### cities all have same index
regions_it = ['Abruzzo', 'Basilicata', 'Calabria', 'Campania', 'Emilia-Romagna', 'Friuli-Venezia Giulia', 'Lazio', 'Liguria', 'Lombardia', 'Marche', 'Molise', 'Piemonte', 'Puglia', 'Sardegna', 'Sicilia', 'Toscana', 'Trentino-Alto Adige', 'Umbria', 'Valle d\'Aosta', 'Veneto']
regions_en = ['Abruzzo', 'Basilicata', 'Calabria', 'Campania', 'Emilia-Romagna', 'Friuli-Venezia Giulia', 'Lazio', 'Liguria', 'Lombardy', 'Marches', 'Molise', 'Piedmont', 'Apulia', 'Sardinia', 'Sicily', 'Tuscany', 'Trentino-South Tyrol', 'Umbria', 'Aosta Valley', 'Veneto']
regions_fr = ['Abruzzes', 'Basilicate', 'Calabre', 'Camapanie', 'Emilie-Romagne', 'Frioul-Venetie Julienne', 'Latium', 'Ligurie', 'Lomardie', 'Marches', 'Molise', 'Piemont', 'Pouilles', 'Sardaigne', 'Sicile', 'Toscane', 'Trentin-Haut-Adige', 'Ombrie', 'Vallee d\'Aoste', 'Venetie']
regions_es = ['Abruzos', 'Basilicata', 'Calabria', 'Campania', 'Emilia-Romana', 'Friuli-Venecia Julia', 'Lacio', 'Liguria', 'Lombardia', 'Marcas', 'Molise', 'Piamonte', 'Apulia', 'Cerdena', 'Sicilia', 'Toscana', 'Trentino-Alto Adigio', 'Umbria', 'Valle de Aosta', 'Veneto']
regions_de = ['Abruzzen', 'Basilikata', 'Kalabrien', 'Kampanien', 'Emilia Romagna', 'Friaul-Julisch-Venetien', 'Latium', 'Ligurien', 'Lombardei', 'Marken', 'Molise', 'Piemont', 'Apulien',  'Sardinien', 'Sizilien', 'Toskana', 'Trentino-Sudtirol', 'Umbrien', 'Aostatal', 'Venetien']
regions_nl = ['Abruzzen', 'Basilicata', 'Calabrie', 'Campania', 'Emilia-Romagna', 'Friuli-Venezia Giulia', 'Lazio', 'Ligurie', 'Lombardije', 'Marche', 'Molise', 'Piemont', 'Apulie', 'Sardinie', 'Sicilie', 'Toscane', 'Trentino-Zuid-Tirol', 'Umbrie', 'Valle d\'Aosta', 'Veneto']

OTHERS = [regions_en, regions_fr, regions_es, regions_de, regions_nl]
ALL_REGIONS = regions_it + regions_en + regions_fr + regions_es + regions_de + regions_nl



filename = '/Users/sashapaulovich/Desktop/cs/survey/TwitterData_FINAL.json'

with open(filename) as data:
	corpus = json.load(data)

### remove @, #, https, degree sign
def clean(text):
    tags = re.compile(r'([@]\S*|[#]\S*|\bhttps\S*)')
    clean = tags.sub('', text)
    clean.replace(u'\u00b0', '')
    return clean

### replaces non-italian translations of regions to italian translations
def change_region(loc):
    for i in range(len(OTHERS)):
        for j in range(len(OTHERS[i])):
            if loc == OTHERS[i][j]:
                loc = regions_it[j]
    return loc



cities = []
regions = []
content = []

### all cities, regions, and content
for tweet in corpus:
        place = tweet['placeFullName'].split(', ')
        
        ### ignores tweets with only country information
        ### ignores tweets from non-italian regions
        
        if (len(place) == 2) and (place[1] in ALL_REGIONS):
            
            text = clean(unidecode(tweet['content']))
            
            ### only append tweet if it's unique
            if text not in content:
                
                ### unidecode removes non-words, replaces accents w ascii equivalents
                cities.append(unidecode(place[0]))
                regions.append(change_region(unidecode(place[1])))
                content.append(text)

        

In [9]:
print 'unique italian tweets: %d' %len(content)

print 'total tweets per region: '
Counter(regions) # displays number of tweets for each region

unique italian tweets: 91509
total tweets per region: 


Counter({'Abruzzo': 1874,
         'Basilicata': 670,
         'Calabria': 2335,
         'Campania': 7260,
         'Emilia-Romagna': 8227,
         'Friuli-Venezia Giulia': 2083,
         'Lazio': 13770,
         'Liguria': 3633,
         'Lombardia': 14773,
         'Marche': 1911,
         'Molise': 274,
         'Piemonte': 5593,
         'Puglia': 4478,
         'Sardegna': 3105,
         'Sicilia': 5996,
         'Toscana': 7496,
         'Trentino-Alto Adige': 1259,
         'Umbria': 1193,
         "Valle d'Aosta": 274,
         'Veneto': 5305})

In [10]:
### sort tweets per region

from collections import OrderedDict

OrderedDict(Counter(regions).most_common())

OrderedDict([('Lombardia', 14773),
             ('Lazio', 13770),
             ('Emilia-Romagna', 8227),
             ('Toscana', 7496),
             ('Campania', 7260),
             ('Sicilia', 5996),
             ('Piemonte', 5593),
             ('Veneto', 5305),
             ('Puglia', 4478),
             ('Liguria', 3633),
             ('Sardegna', 3105),
             ('Calabria', 2335),
             ('Friuli-Venezia Giulia', 2083),
             ('Marche', 1911),
             ('Abruzzo', 1874),
             ('Trentino-Alto Adige', 1259),
             ('Umbria', 1193),
             ('Basilicata', 670),
             ('Molise', 274),
             ("Valle d'Aosta", 274)])