# Part 3: Awkward Array

In [1]:
import awkward as ak

In [2]:
# A small, hand-written catalogue of Sam Raimi movies. Each record holds the
# title, the release year and a variable-length list of actor names.
sam_raimi_movies = ak.Array([
    {"movie": "Evil Dead", "year": 1981, "actors":
        ["Bruce Campbell", "Ellen Sandweiss", "Richard DeManincor", "Betsy Baker"]
    },
    # Darkman was released in 1990 (the original entry said 1900, a digit typo).
    {"movie": "Darkman", "year": 1990, "actors":
        ["Liam Neeson", "Frances McDormand", "Larry Drake", "Bruce Campbell"]
    },
    {"movie": "Army of Darkness", "year": 1992, "actors":
        ["Bruce Campbell", "Embeth Davidtz", "Marcus Gilbert", "Bridget Fonda",
         "Ted Raimi", "Patricia Tallman"]
    },
    {"movie": "A Simple Plan", "year": 1998, "actors":
        ["Bill Paxton", "Billy Bob Thornton", "Bridget Fonda", "Brent Briscoe"]
    },
    # "Kirsten Dunst" is the correct spelling; exact-match lookups on the
    # actor name depend on it.
    {"movie": "Spider-Man 2", "year": 2004, "actors":
        ["Tobey Maguire", "Kirsten Dunst", "Alfred Molina", "James Franco",
         "Rosemary Harris", "J.K. Simmons", "Stan Lee", "Bruce Campbell"]
    },
    {"movie": "Drag Me to Hell", "year": 2009, "actors":
        ["Alison Lohman", "Justin Long", "Lorna Raver", "Dileep Rao", "David Paymer"]
    }
])

## Task: Which movies do not contain "Bruce Campbell" as an actor?

In [3]:
# Per-actor boolean array: True wherever the actor name is "Bruce Campbell".
is_bruce_campbell = sam_raimi_movies.actors == "Bruce Campbell"

# A movie qualifies when none of its actors matched. By De Morgan,
# "no actor is Bruce Campbell" equals "every actor is not Bruce Campbell".
all_not_bruce_campbell = ~ak.any(is_bruce_campbell, axis=1)

# Apply the per-movie mask to keep only the qualifying records.
movies_without_bruce_campbell = sam_raimi_movies[all_not_bruce_campbell]

movies_without_bruce_campbell.show(all=True)

type: 2 * {
    movie: string,
    year: int64,
    actors: var * string
}
nbytes: 907 B
backend: cpu
[{movie: 'A Simple Plan', year: 1998, actors: [...]},
 {movie: 'Drag Me to Hell', year: 2009, actors: [...]}]


---

## Bonus tasks: Bollywood Movies

In [4]:
import awkward as ak
import numpy as np
import pyarrow as pa
import pyarrow.csv

In [5]:
# DO NOT MODIFY THIS CELL
# Read the CSV with pyarrow, treating "NA" / "N/A" as missing values (also in
# string columns), then convert the Arrow table into an awkward array.
convert_options = pyarrow.csv.ConvertOptions(
    null_values=["NA", "N/A"],
    strings_can_be_null=True,

)
table = pyarrow.csv.read_csv("data/BollywoodMovieDetail.csv", convert_options=convert_options)

movies = ak.from_arrow(table)

def split_and_trim(s):
    """Split each string of `s` on "|" and strip whitespace from both ends of every piece.

    Returns an array with one list of trimmed substrings per input string.
    """
    return ak.str.ltrim_whitespace(ak.str.rtrim_whitespace(ak.str.split_pattern(s, pattern="|")))

# table preparation
# Turn the "|"-separated text columns into lists of strings.
movies["genre"] = split_and_trim(movies.genre)
movies["writers"] = split_and_trim(movies.writers)
movies["actors"] = split_and_trim(movies.actors)
movies["directors"] = split_and_trim(movies.directors)

# fix releaseDate
# Replace commas with spaces so the text can be parsed with the "%d %b %Y" format below.
movies["releaseDate"] = ak.str.replace_substring(movies.releaseDate, pattern=",", replacement=" ")
# NOTE(review): `pyarrow.compute` is never imported explicitly in this notebook;
# presumably it is already loaded as a side effect of the `ak.str` calls above - confirm.
releaseDate = pyarrow.compute.strptime(ak.to_arrow(movies["releaseDate"], extensionarray=False), format="%d %b %Y", unit="s")

# Store the parsed timestamps back as the releaseDate field.
movies["releaseDate"] = ak.from_arrow(releaseDate)

movies.show(all=True)

type: 1284 * {
    imdbId: ?string,
    title: ?string,
    releaseYear: ?int64,
    releaseDate: ?datetime64[s],
    genre: option[var * string],
    writers: option[var * string],
    actors: option[var * string],
    directors: option[var * string],
    sequel: ?string,
    hitFlop: ?int64
}
nbytes: 329.7 kB
backend: cpu
[{imdbId: 'tt0118578', title: 'Albela', releaseYear: 2001, ...},
 {imdbId: 'tt0169102', title: 'Lagaan: Once Upon a Time in India', ...},
 {imdbId: 'tt0187279', title: 'Meri Biwi Ka Jawab Nahin', ...},
 {imdbId: 'tt0222024', title: 'Hum Tumhare Hain Sanam', releaseYear: 2002, ...},
 {imdbId: 'tt0227194', title: 'One 2 Ka 4', releaseYear: 2001, ...},
 {imdbId: 'tt0238936', title: 'Devdas', releaseYear: 2002, ...},
 {imdbId: 'tt0247911', title: 'Aap Mujhe Achche Lagne Lage', ...},
 {imdbId: 'tt0248126', title: 'Kabhi Khushi Kabhie Gham...', ...},
 {imdbId: 'tt0248216', title: 'Na Tum Jaano Na Hum', releaseYear: 2002, ...},
 {imdbId: 'tt0248617', title: 'Yaadein...', r

## Bollywood movie statistics

(use the awkward documentation: https://awkward-array.org/doc/main/index.html)

Your tasks:
1. Which movie was released soonest _after_ Jan 1, 2000?
2. Which "Comedy" movie has the most writers?
3. How many writers does a movie have on average in the "Adventure" and "Romance" genres?

In [6]:
# Task 1:
# The task asks for the first release strictly after Jan 1, 2000, so compare the
# full release date with the cutoff. Filtering on `releaseYear > 2000` would
# skip every movie released later in the year 2000.
cutoff_date = np.datetime64("2000-01-01", "s")

# releaseDate is nullable, so the comparison can yield missing values; count
# those as "not after the cutoff" to keep None records out of the selection.
is_after_cutoff = ak.fill_none(movies.releaseDate > cutoff_date, False)
movies_after_2000 = movies[is_after_cutoff]

# sort them by release date (ascending)
indices = ak.argsort(movies_after_2000.releaseDate, ascending=True)
# the first sorted index points at the earliest release after the cutoff
closest_movie_after_2000 = movies_after_2000[indices[0]]

print(f"Movie closest released after Jan 1, 2000: '{closest_movie_after_2000['title']}' (release date: {closest_movie_after_2000['releaseDate']})")

Movie closest released after Jan 1, 2000: 'Farz' (release date: 2001-01-12T00:00:00)


In [7]:
# Task 2:
# Flag every movie whose genre list contains "Comedy" and keep those records.
is_comedy = ak.any(movies["genre"] == "Comedy", axis=-1)
comedy_movies = movies[is_comedy]

# Number of writers for each comedy movie, and the largest such count.
n_writers = ak.num(comedy_movies["writers"], axis=-1)
max_n_writers = ak.max(n_writers)

# Keep the movie(s) that reach the maximum. genre and writers are option-typed,
# so the selection may carry missing entries; drop_none removes them.
has_most_writers = n_writers == max_n_writers
comedy_with_most_writers = ak.drop_none(comedy_movies[has_most_writers])

print("Comedy movie(s) with most writers:", comedy_with_most_writers["title"])

Comedy movie(s) with most writers: ['EMI: Liya Hai To Chukana Padega']


In [8]:
# Task 3:
for genre in ("Adventure", "Romance"):
    # Boolean mask: does the movie's genre list include the current genre?
    mask = ak.any(movies["genre"] == genre, axis=-1)
    movies_in_genre = movies[mask]

    # Count the writers of each movie first, then average over the genre.
    writers_per_movie = ak.num(movies_in_genre["writers"], axis=-1)
    n_writers = ak.mean(writers_per_movie)

    print(f"Genre: {genre}, Number of writers (on avg): {n_writers}")

Genre: Adventure, Number of writers (on avg): 3.542372881355932
Genre: Romance, Number of writers (on avg): 2.86
