In [1]:
def mem_limit(MB=None):
    import os
    if os.name == 'nt':
        print('WARNING: limiting memory on Windows is not supported')
        return
    
    import resource
    with open('/proc/meminfo', 'r') as mem:
        free_memory = 0
        for i in mem:
            sline = i.split()            
            if str(sline[0]) == 'MemAvailable:':
                free_memory = int(sline[1])               
                break     
        if sline[2] != 'kB':
            raise Exception('Unrecognized memory unit:', sline[2])
        soft, hard = resource.getrlimit(resource.RLIMIT_AS)
        if not MB:                    
            MB = (free_memory // 1024) // 2  
        print('Free mem:', free_memory//1024, 'MB', 
              ' Limiting to:', MB, 'MB')
        resource.setrlimit(resource.RLIMIT_AS, (MB *1024 * 1024, hard))

mem_limit()

#Please execute this cell
import jupman;
jupman.mem_limit()

Free mem: 2332 MB  Limiting to: 1166 MB
Free mem: 2352 MB  Limiting to: 1176 MB


# Midterm sim -  Fri 05, Nov 2021 EXERCISES

**Scientific Programming - Data Science Master @ University of Trento**

**THIS IS ONLY A SIMULATION: YOU EARN NOTHING, YOU LOSE NOTHING**

<!-- ## [Download exercises and solutions](_static/generated/sciprog-ds-2021-11-05-exam.zip) 

**Fri 05, Nov 2021**: Published [exam solutions](exams/2021-11-05/solutions/exam-2021-11-05-sol.ipynb)
-->

## Part A - Terence Hill and Bud Spencer movies

Among the greatest gifts of Italy to the world we can certainly count Terence Hill and Bud Spencer movies. 

We took their movies data from [Wikidata](https://wikidata.org/), a project by the Wikimedia foundation which aims to store only machine-readable data, like numbers, strings, and so on interlinked with many references. Each entity in Wikidata has an identifier, for example Terence Hill is the [entity Q243430](http://www.wikidata.org/entity/Q243430) and Bud Spencer is [Q221074](http://www.wikidata.org/entity/Q221074).

<!--Wikidata can be queried using the SPARQL language: the data was obtained with [this query](https://query.wikidata.org/#SELECT%20%3Fstar%20%3FstarLabel%20%3Fitem%20%3FitemLabel%20%28MIN%28%3Fdate%29%20AS%20%3FfirstReleased%29%0AWHERE%20%7B%0A%20%20%3Fitem%20wdt%3AP161%20%3Fstar%3B%0A%20%20%20%20%20%20%20%20wdt%3AP577%20%3Fdate.%0A%20%20%20%20%0A%20%20FILTER%20%28%3Fstar%20%3D%20wd%3AQ221074%20%7C%7C%20%3Fstar%20%3D%20wd%3AQ243430%29%20%20%0A%20%20%20%20%20%20%20%20%20%0A%20%20SERVICE%20wikibase%3Alabel%20%7B%20bd%3AserviceParam%20wikibase%3Alanguage%20%22en%22.%20%7D%0A%20%20OPTIONAL%20%7B%20%3Fitem%20wdt%3AP18%20%3F_image.%20%7D%0A%7D%20GROUP%20BY%20%3Fstar%20%3FstarLabel%20%3Fitem%20%3FitemLabel%20%3F_image%0AORDER%20BY%20%28%3Fdate%29) repeated for several languages, and downloaded in CSV format (among the many which can be chosen). <!--Even if not necessary for the purposes of the exercise, you are invited to play a bit with the interface, like trying different visualizations (i.e. try select map in the middle-left corner) - or see other examples-->


You are given some CSVs of movies, all having names ending in `-LG.csv`, where LG can be a language tag like `it`, `en`, `de`, `es`... They mostly contain the same data except for the movie labels which are in the corresponding language. The final goal will be displaying the network of movies and put in evidence the ones co-starring the famous duo. 

Each file row contains info about a single actor starring in a movie. Multiple lines with same movie id will mean multiple actors are co-starring. We can see an excerpt of **first four** lines of english version: notice second movie has id [Q180638](http://www.wikidata.org/entity/Q180638) and is co-starred by both Bud Spencer and Terence Hill

star,starLabel,movie,movieLabel,firstReleased

http://www.wikidata.org/entity/Q221074,Bud Spencer,http://www.wikidata.org/entity/Q116187,Thieves and Robbers,1983-02-11T00:00:00Z

http://www.wikidata.org/entity/Q221074,Bud Spencer,http://www.wikidata.org/entity/Q180638,Odds and Evens,1978-10-28T00:00:00Z

http://www.wikidata.org/entity/Q243430,Terence Hill,http://www.wikidata.org/entity/Q180638,Odds and Evens,1978-10-28T00:00:00Z



Now open Jupyter and start editing this notebook `exam-2021-11-05.ipynb`

## load

Write a function that given a `filename_prefix` and list of `languages`, parses the corresponding files and RETURNS a **dictionary of dictionaries**, which maps movies id to movies data, in the format as in the exerpt.

- When a label is missing, you will find instead an id like `Q3778078`: substitute it with empty string
 (HINT: to recognize ids you might use `is_digit()` method) 
- convert date numbers to proper integers
- **DO NOT** put constant ids nor language tags in the code (so no `'Q221074'` nor `'it'` ...)

In [59]:
import csv

def load(filename_prefix, languages):
    ris= dict()
    csvs = [filename_prefix+"-"+d+".csv" for d in languages]
    #get normal info
    i = 0
    while i < len(languages):
        with open(csvs[i]) as f:
            reader = csv.reader(f, delimiter = ',')
            next(reader)
            if i == 0:
                for row in reader:
                    actor_id = row[0].split("/")[4]
                    film_id = row[2].split("/")[4]
                    release = row[4].split("-")
                    release = (int(release[0]),int(release[1]),int(release[2].split('T')[0]))
                    #name = (row[3][0] != "Q") ? row[3] : ""
                    #if row[3][0] != "Q":
                    name = row[3]
                    #else:
                    #    name = ""
                    #print(ris)
                    if film_id in ris:
                        ris[film_id]['actors'].append((actor_id,row[1]))
                    else:
                        ris[film_id] = dict()
                        ris[film_id]['actors'] = [(actor_id,row[1])]
                        ris[film_id]['first_release'] = release
                        ris[film_id]['names'] = {languages[i]:name}
            else:
                for row in reader:
                    #if row[3][0] != "Q":
                    name = row[3]
                    #else:
                    #    name = ""
                    film_id = row[2].split("/")[4]
                    ris[film_id]['names'][languages[i]] = name
                    #print(row[3])
                    #print(languages[i])
                    #print(ris[film_id]['names'])
                    #print(ris[film_id]['names'][languages[i]])
                
               
                    
            
            #print(ris)
        i +=1    
     
    return(ris)
    #raise Exception('TODO IMPLEMENT ME !')
    
movies_db = load('bud-spencer-terence-hill-movies', ['en', 'it', 'de'])
#movies_db = load('bud-spencer-terence-hill-movies', ['es', 'en', 'de','it'])


Complete expected output can be found in [expected_db.py](expected_movies_db.py)

EXERPT:
{
  'Q116187': {
              'actors': [('Q221074', 'Bud Spencer')],
              'first_release': (1983, 2, 11),
              'names': {'de': 'Bud, der Ganovenschreck',
                        'en': 'Thieves and Robbers',
                        'it': 'Cane e gatto'}
             }
  'Q180638': {
              'actors': [('Q221074', 'Bud Spencer'), ('Q243430', 'Terence Hill')],
              'first_release': (1978, 10, 28),
              'names': {'de': 'Zwei sind nicht zu bremsen',
                        'en': 'Odds and Evens',
                        'it': 'Pari e dispari'}
             }
  'Q231967': {
              'actors': [('Q221074', 'Bud Spencer'), ('Q243430', 'Terence Hill')],
              'first_release': (1981, 1, 1),
              'names': {'de': 'Zwei Asse trumpfen auf',
                        'en': 'A Friend Is a Treasure',
                        'it': 'Chi trova un amico, trova un tesoro'}
             }
  .
  .
}


In [60]:
# TESTING
from pprint import pformat; from expected_movies_db import expected_movies_db
for sid in expected_movies_db.keys():
    if sid not in movies_db: print('\nERROR: MISSING movie', sid); break    
    for k in expected_movies_db[sid]:
        if k not in movies_db[sid]:
            print('\nERROR at movie', sid,'\n\n   MISSING key:', k); break    
        if expected_movies_db[sid][k] != movies_db[sid][k]:
            print('\nERROR at movie', sid, 'key:',k)
            print('  ACTUAL:\n', pformat(movies_db[sid][k]))
            print('  EXPECTED:\n', pformat(expected_movies_db[sid][k]))
            break
if len(movies_db) > len(expected_movies_db):
    print('ERROR! There are more movies than expected!')
    print('  ACTUAL:\n', len(movies_db))
    print('  EXPECTED:\n', len(expected_movies_db))


ERROR at movie Q3778078 key: names
  ACTUAL:
 {'de': 'Q3778078', 'en': 'Q3778078', 'it': 'Guaglione'}
  EXPECTED:
 {'de': 'Q3778078', 'en': '', 'it': 'Guaglione'}

ERROR at movie Q3791384 key: names
  ACTUAL:
 {'de': 'Q3791384', 'en': 'Q3791384', 'it': 'I vagabondi delle stelle'}
  EXPECTED:
 {'de': 'Q3791384', 'en': '', 'it': 'I vagabondi delle stelle'}

ERROR at movie Q3844156 key: names
  ACTUAL:
 {'de': 'Q3844156', 'en': 'Q3844156', 'it': 'Mamma sconosciuta'}
  EXPECTED:
 {'de': 'Q3844156', 'en': '', 'it': 'Mamma sconosciuta'}


## save_table

Write a function that given a movies db and a list of `languages`, writes a new file `merged.csv` 

- separate actor names with `and`
- use only the year as date
- file must be formatted like this:

movie_id,name en,name it,first_release,actors
Q116187,Thieves and Robbers,Cane e gatto,1983,Bud Spencer
Q180638,Odds and Evens,Pari e dispari,1978,Bud Spencer and Terence Hill


In [None]:
import csv

def save_table(movies, languages):
    i = 0
    with open('merged.csv','w') as f: 
        w = csv.writer(f, delimiter='w')
        w.writerow(["movie_id","name en","name it","first_release","actors"])
        keys = list(movies.keys())
        while i < len(keys):
            movie = movies[keys[i]]
            elements = [keys[i]]
            for title in languages:
                elements.append(movie['names'][title])
            elements.append(movie['first_release'][0])
            #print(movie)
            if len(movie['actors']) == 1:
                elements.append(movie['actors'][0])
            else:
                elements.append(movie['actors'][0]+" and "+movie['actors'][0])
            #while i < len(languages):
            w.writerow(elements)
            i += 1
    raise Exception('TODO IMPLEMENT ME !')    
    
save_table(movies_db, ['en','it'])
#save_table(movies_db, ['de'])


Complete expected file is in [expected-merged.csv](expected-merged.csv)

In [None]:
# TESTING
with open('expected-merged.csv',encoding='utf-8', newline='') as expected_f:
    with open('merged.csv',encoding='utf-8', newline='') as f:
        expected_reader = csv.reader(expected_f, delimiter=',')            
        reader = csv.reader(f, delimiter=',')
        i = 0
        for expected_row in expected_reader: 
            try:
                row = next(reader)
            except:
                print('ERROR at row', i, ': ACTUAL rows are less than EXPECTED!')
                break                
            for j in range(len(expected_row)):
                if expected_row[j] != row[j]:
                    print('ERROR at row', i, '  cell index', j)
                    print(row)
                    print('\nACTUAL  :', row[j])
                    print('\nEXPECTED:', expected_row[j])
                    break
            i += 1            

## show_graph

Display a NetworkX graph of movies [see examples](https://en.softpython.org/relational/relational1-intro-sol.html#Fancy-networkx-graphs) from `since_year` (included) to  `until_year` (included), in the given `language`

* display actor names as capitalized
* display co-starred movies, non co-starred movies and actors with different colors by setting node attribute `fillcolor` (see [some color names](https://www.w3.org/wiki/CSS/Properties/color/keywords))

In [None]:
import networkx as nx
from sciprog import draw_nx

def show_graph(movies, since_year, until_year, language):
    
    G = nx.DiGraph()       
    G.graph['graph']= { 'layout':'neato' }  # don't delete these!
    G.add_node('Bud Spencer', label = "BUD SPENCER")
    G.add_node('Terence Hill', label = "TERENCE HILL")
    i = 0
    for movie in movies:
        year = movie['first_release'][0]
        if year < since_year or year > since_year:
            i += 1
            continue
        else:
            title = movie['names'][language]
            
            G.add_node(i, label = title)
            for actor in movie[actors]:
                G.add_edge(actor,i)   
            i += 1
        
    
    #raise Exception('TODO IMPLEMENT ME !')
    
show_graph(movies_db, 1970, 1975, 'en')    

In [10]:
#show_graph(movies_db, 1970, 1974, 'it')