In [0]:
from pyspark.sql.functions import round,broadcast,split,explode,collect_set,length,size


In [0]:
# Locations of the source files (gzip-compressed, tab-separated, one per dataset).
path_name_basics = "/FileStore/tables/name_basics_tsv-1.gz"
path_title_akas = "/FileStore/tables/title_akas_tsv.gz"
path_title_basics = "/FileStore/tables/title_basics_tsv.gz"
path_title_crew = "/FileStore/tables/title_crew_tsv.gz"
path_title_episode = "/FileStore/tables/title_episode_tsv.gz"
path_title_principals = "/FileStore/tables/title_principals_tsv.gz"
path_title_ratings = "/FileStore/tables/title_ratings_tsv.gz"

# Load only the datasets used below. Each file is read as tab-delimited text with
# a header row; no schema is supplied, so every column arrives as a string.
# Not loaded: title_akas, title_crew, title_episode, title_principals
# (their paths are kept above so they can be read the same way when needed).
title_ratings = spark.read.csv(path_title_ratings, sep="\t", header=True)
title_basics = spark.read.csv(path_title_basics, sep="\t", header=True)
# The variable name is spelled `name_bsics` (sic); later cells refer to it by that name.
name_bsics = spark.read.csv(path_name_basics, sep="\t", header=True)



In [0]:
# Keep only the rows of title_basics whose titleType equals 'movie'.
movies = title_basics.filter(title_basics.titleType == "movie")

In [0]:
# Preview the filtered movie rows as a table.
movies.display()

tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance
tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama
tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,\N,\N,Drama
tt0000630,movie,Hamlet,Amleto,0,1908,\N,\N,Drama
tt0000675,movie,Don Quijote,Don Quijote,0,1908,\N,\N,Drama
tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,\N,120,"Adventure,Fantasy"
tt0000793,movie,Andreas Hofer,Andreas Hofer,0,1909,\N,\N,Drama
tt0000814,movie,La bocana de Mar Chica,La bocana de Mar Chica,0,1909,\N,\N,\N


In [0]:
# Attach rating information to the movies. The join is an inner join on tconst,
# so movies without a row in title_ratings are dropped. title_ratings is passed
# with a broadcast hint.
ratings_side = broadcast(title_ratings)
same_title = movies.tconst == title_ratings.tconst
averageRatingMovies = (
    movies
    .join(ratings_side, same_title, 'inner')
    .select(
        movies.tconst,
        movies.originalTitle,
        title_ratings.averageRating,
        title_ratings.numVotes,
    )
)

In [0]:
# Calculate the average number of votes over all rated movies. Spark rounds the
# mean to two decimals (`round` here is pyspark's column function, imported above)
# and the single result row is collected to the driver as a plain Python value.
mean_votes_df = averageRatingMovies.agg({'numVotes': 'avg'})
rounded_mean = round('avg(numVotes)', 2).alias('AvgNumVotes')
mean_votes_row = mean_votes_df.select(rounded_mean).collect()[0]
avgNumVotes = mean_votes_row['AvgNumVotes']

In [0]:
# Rank the movies and keep the 20 best.
#   rank = round((numVotes / avgNumVotes) * averageRating, 2)
# Only movies with at least 50 votes (numVotes >= 50) are eligible.
# Ordering is expressed with column names instead of col(...), because `col` is
# not among the functions imported by this notebook. tconst is added as a
# secondary sort key so that rows with equal rank are cut off deterministically
# by limit(20).
vote_weight = averageRatingMovies.numVotes / avgNumVotes
ranked_movies = averageRatingMovies.withColumn(
    "rank",
    round(vote_weight * averageRatingMovies.averageRating, 2),
)
averageRatingMovies_R = (
    ranked_movies
    .filter("numVotes>=50")
    .orderBy(["rank", "tconst"], ascending=[False, True])
    .select('tconst', 'originalTitle')
    .limit(20)
)


In [0]:
# Top 20 movies (result): reduce the ranked selection to its title column and show it.
Top20Movies = averageRatingMovies_R.select(averageRatingMovies_R.originalTitle)
Top20Movies.display()

originalTitle
The Shawshank Redemption
The Dark Knight
Inception
Fight Club
Forrest Gump
Pulp Fiction
The Godfather
The Matrix
The Lord of the Rings: The Return of the King
The Lord of the Rings: The Fellowship of the Ring


In [0]:
# Build one row per (person name, known-for title id): knownForTitles is split on
# commas and the resulting array is exploded into the `titles` column.
known_title_ids = split(name_bsics.knownForTitles, ',')
persons = name_bsics.select('primaryName', explode(known_title_ids).alias('titles'))
                               

In [0]:
# Preview the exploded (primaryName, titles) pairs.
persons.display()

primaryName,titles
Fred Astaire,tt0053137
Fred Astaire,tt0031983
Fred Astaire,tt0050419
Fred Astaire,tt0072308
Lauren Bacall,tt0071877
Lauren Bacall,tt0037382
Lauren Bacall,tt0117057
Lauren Bacall,tt0038355
Brigitte Bardot,tt0056404
Brigitte Bardot,tt0057345


In [0]:
# Names of the persons whose known-for titles include one of the top 20 movies.
# Only primaryName is kept and duplicates are removed.
# NOTE(review): after this step a person is identified by name alone, so different
# people sharing a name become indistinguishable - confirm this is intended.
known_for_top20 = persons.join(
    averageRatingMovies_R,
    averageRatingMovies_R.tconst == persons.titles,
    'inner',
)
pers_workedontop20m = known_for_top20.select(persons.primaryName).distinct()

In [0]:
# Show the distinct names linked to the top 20 movies.
pers_workedontop20m.display()

primaryName
Dennis Berardi
Dave Brown
Dorsey Burnette
John Travolta
Tara Howie
Ross Grayson Bell
Ian Bohen
Alon Aboutboul
Philippa Boyens
Chris Burn


In [0]:
# Retrieve all the movie titles known for the persons linked to the top 20 movies,
# keeping only names that end up with at least 20 distinct titles.
#
# Both joins are inner equi-joins expressed by column name:
#   * pers_workedontop20m is derived from persons, so matching them through
#     DataFrame-bound column objects is a self-join with ambiguous references;
#     joining on the shared column name avoids that.
#   * movies is projected to (titles, originalTitle) so the title-id match can be
#     written the same way.
# No col(...) is used, because `col` is not among the functions imported by this
# notebook.
#
# NOTE(review): persons are matched by primaryName, so different people who share
# a name are merged into a single row and their titles pooled. If the source has a
# per-person id (presumably nconst in name_bsics - confirm), carrying it through
# `persons` and grouping on it would keep individuals apart.
movie_titles = movies.select(movies.tconst.alias('titles'), movies.originalTitle)
moviesof_top_20_creditedP = (
    pers_workedontop20m
    .join(persons, on='primaryName', how='inner')
    .join(movie_titles, on='titles', how='inner')
    .select('primaryName', 'originalTitle')
    .groupby('primaryName')
    .agg(collect_set('originalTitle').alias("othertitles"))
    .where(size("othertitles") >= 20)
)
                           
                          
                          

In [0]:
# Final output: each remaining primaryName with its collected set of movie titles (othertitles).
moviesof_top_20_creditedP.display()

primaryName,othertitles
Adam Clark,"List(Thriller, Temporary Suspicion, Haunted, Mary Anning & the Dinosaur Hunters, Superheroes of Stoke, Paradise Waits, Escaping Ohio, Portal, Domino, Year: Prologue, It's a Disaster, Lucky, She Sings to the Stars, Defective Man!, The Butterfly Ball, Tight Loose, Just One Night, Donny Osmond - One Night Only!, Pete Winning and the Pirates, Family Blood, But I'm a Cheerleader, Purge, Screaming Flowers, Horrorathon: Volume 1, The Lord of the Rings: The Return of the King, Man with the Screaming Brain, Gridiron Gang, A Fool$ Game, Mississippi Damned, Attack of La Niña, The Carter Effect, Dog Years, Coach Carter, Jeepers Creepers 3, Agoraphobia, All.I.Can., I sproget er jeg, Eagle vs Shark, Mouse, Boy, Love and Action in Chicago, Cam, Scratch, The Conspirator)"
Adam Evans,"List(The Reverend, How to Survive a Pandemic, David Attenborough's Tasmania, Dark Ditties Presents 'Finders Keepers', Great Salt Lake: Utah's Sanctuary, Caught In-Between, Gemini Man, Dark Ditties Presents 'The Offer', Dark Ditties Presents 'Stained', Les fils du vent, The Dark Knight Rises, Re-Evolution, War for the Planet of the Apes, Severance, Everything, Everything, Spy Game, Chasing Liberty, The Loneliest Whale: The Search for 52, Dark Ditties Presents 'Dad', SubSIPPI, Cemetery Junction, Avengers: Endgame, Mortal Engines, The Changeover)"
Adam Lee,"List(Wonder Woman, Kingsman: The Secret Service, Pacific Rim, Adrift, The Old Ways, Baby Driver, Saturday at the Starlight, Grace Is Gone, Thin Blue Line, Guardians of the Galaxy, Spider-Man: Homecoming, A Million Ways to Die in the West, Clear, Champion, The Quad Force: Redemption, Battleship, Rain, The Final Wish, Inception, The Biggest Thing That Ever Hit Broadway: Redux, The Commitments, Hostiles, Stars Fell on Alabama, The Last Stand, Those Who Wish Me Dead, The Legend of Jedediah Carver, The Beacon, The Escape of Prisoner 614, Boss Level, Children of Men, I Had a Bloody Good Time at House Harker, Avengers: Infinity War, Ant-Man and the Wasp, Precis som jag, Jurassic World, I Can I Will I Did, The Ray, Twisted Dragons)"
Alan Lee,"List(My Bloody Valentine, Jack Reacher, The Hobbit: The Desolation of Smaug, The Sea Chase, Swallows and Amazons, Men in Suits, The Electric Horseman, Dad's Army, Rainbow Brite and the Star Stealer, The Lord of the Rings: The Fellowship of the Ring, Siu Tai Gik, Killer Instinct, Funny Cow, Ned Venture, The Lord of the Rings: The Two Towers, Artists and Models, The 24th Day, Jin pai shi jie, John Goldfarb, Please Come Home!, Petersen, The Lord of the Rings: The Return of the King, American Badger, Inrang, Ren zhe da, Unstoppable, Abduction, Cafe Artist, The Godfather Part II, Huang jia fei feng, Zombex, Eyes of a Stranger, Kangaroo, Innocent)"
Alan Wilson,"List(A Dirty Weekend, Vertical Limit, The Lord of the Rings: The Return of the King, King Kong, ...Maybe This Time, RED 2, Hell House, School for Seduction, Tinker Tailor Soldier Spy, Spider-Man: Homecoming, Lives of the Artists, The Lord of the Rings: The Fellowship of the Ring, Sunset Heights, Coulda, Woulda, Shoulda, Abducted, Avatar, The Man Who Knew Too Little, Road House, Frequency, Deaf Century, Master and Commander: The Far Side of the World, The Lord of the Rings: The Two Towers, Adventures of a Private Eye, American Jedi)"
Alex King,"List(Fast & Furious Presents: Hobbs & Shaw, Fantastic Beasts: The Secrets of Dumbledore, Allies, Captain America: The First Avenger, The Flanders and Alcott Report on Sexual Response, Casino Royale, The Casting Game, Suburban Wildlife, In Bruges, Saint Maud, Just Around the Corner, Welcome to the Punch, Given Our Chance, Convention, Goofballs, Fragile Heart (The Movie), Cruella, The Longest Weekend, Born to Be Our Children: Romanian Adoption Stories, V for Vendetta, Gladiator, How I Live Now, Twin Brides, Best Laid Plans, Joe Cocker: Mad Dogs & Englishmen)"
Allan Smith,"List(The Hobbit: The Desolation of Smaug, Kingdom of Heaven, Here Will I Nest, Highlander III: The Sorcerer, Weird Tales 3: The Pioneer's Lost Trunk, 6 Wheels from Hell!, Father Gaudio's Confession, Honor Among Thieves, The Desert's Lost River of Gold, Free the Airwaves, The Hobbit: The Battle of the Five Armies, El percance perfecto, Una Magnum Special per Tony Saitta, Oscar and Lucinda, The Lord of the Rings: The Fellowship of the Ring, The Damned Don't Cry, Blue Moon, Manny Cam, Ernie's Documentary: Rapping, Panique, Saint Jack, Circus of the Dead, The LittleBigPlanet Movie)"
Ann Miller,"List(Whatever Works, Spider-Man 2, Blue Jasmine, The CFF Story, Dante's Peak, Luther, Le Petit Prince, Mulholland Dr., Swung, The Little Vampire, The Adjustment Bureau, Death of a President, Baby Boom, The Silence of the Lambs, The Astronaut's Wife, The Last Samurai, Catch Me If You Can, Reveille with Beverly, On the Town, Kiss Me Kate)"
Bill Martin,"List(Cl.One, Tales from Tin Pan Alley, The Marshal's Daughter, The Puppetoon Movie, Roger Waters: The Wall, So I Married an Axe Murderer, The Snowman, 50 Over, Did You Hear About the Morgans?, Hot Spur, Epicenter, Ambition's Debt, Crackdown, O, Brazen Age, Teenage Cave Man, Saturday's Hero, Defenseless, Joyride to Nowhere, The Cry Baby Killer, Portrait in Sepia Tone, Midnight's Children, Night of the Blood Beast, The Hobby Stop, Harry and the Hendersons, Demented, And Baby Makes Three, The Shawshank Redemption, City of Hope, To All a Goodnight, Carnival Rock, Little Nemo, Mafia Zombie Killers, Joe Palooka in Winner Take All, The Haunting in Connecticut 2: Ghosts of Georgia, Last Night in Soho)"
Bill Young,"List(Ant-Man, Star Wars: Episode III - Revenge of the Sith, The Roly Poly Man, Looney Tunes: Back in Action, Look Who's Toxic, The Island, The Last Boy Scout, Flight of the Navigator, Perfect Alibi, The Man Who Sued God, Better Off Dead..., Breakfast of Aliens, Bullish, After the Fair: The Legacy of the 1964-65 New York World's Fair, The Bodyguard, Boss Level, Honey, I Blew Up the Kid, Seclusion, Mean Streets, Light Speed, Hookers for Jesus, Spaceballs, Ant-Man and the Wasp, Chopper, Riding the Bullet, Run, The Matrix, The Darkening)"
