In [98]:
import os

import pandas as pd
import requests
import requests as r
from bs4 import BeautifulSoup
from lxml import etree

In [99]:
## URL of the IMDb chart page that is going to be scraped.
url = "https://www.imdb.com/chart/top"

## Fetching the page. The timeout keeps the cell from hanging forever when the server never answers.
request_status = r.get(url, timeout=30)

## Printing the response object so that the HTTP status code is visible in the cell output.
print(request_status)

## Stopping right away on a 4xx/5xx answer instead of trying to parse an error page below.
request_status.raise_for_status()

## The HTML text of the response is what gets parsed.
url_text = request_status.text

## One dictionary per movie is collected in this list.
list_of_movies = []

## Parsing the HTML with the html parser that ships with Python.
soup = BeautifulSoup(url_text, "html.parser")

## Locating the <tbody class="lister-list"> element whose rows are iterated below.
movie_table = soup.find('tbody', attrs = {"class": "lister-list"})

## find() returns None when nothing matches, so we fail with a readable message
## instead of an AttributeError on the loop below.
if movie_table is None:
    raise RuntimeError(
        "Could not find <tbody class='lister-list'> in the page; the page layout may have changed."
    )

## Performing the iterations for all the table rows.
for table_row in movie_table.find_all("tr"):

    ## Taking the table cells of the current row.
    table_divisions = table_row.find_all('td')

    ## The first link in the second cell carries the movie title (text), the movie url (href)
    ## and the people involved (title attribute).
    link = table_divisions[1].find('a')
    ## Kept as a variable only; it is not stored in the dictionary below.
    movie_url = link.get("href")

    ## The title attribute is split on ", "; the first entry is treated as the director
    ## and the remaining entries as the actors.
    members = str(link.get("title")).split(", ")

    ## The release year is the text of the first span in the second cell, without the parentheses.
    year_release = str(table_divisions[1].find('span').text).replace("(", "").replace(")", "")

    ## The rating is the text of the <strong> tag in the third cell (kept as a string).
    ratings = table_divisions[2].find('strong').text

    ## Storing all the values of the current row in a dictionary.
    movie = {
        "title": link.text,
        "director": members[0].replace(" (dir.)", ""),
        "actors": members[1:],
        "release_year": year_release,
        "ratings": ratings,
    }

    ## Appending the dictionary to the list of movies created earlier.
    list_of_movies.append(movie)

# Converting the list of dictionaries into a dataframe (one row per movie).
dataframe = pd.DataFrame(list_of_movies)

<Response [200]>


In [100]:
## Displaying the first ten rows of the movie dataframe as the output of this cell.
dataframe.head(10)

Unnamed: 0,title,director,actors,release_year,ratings
0,The Shawshank Redemption,Frank Darabont,"[Tim Robbins, Morgan Freeman]",1994,9.2
1,The Godfather,Francis Ford Coppola,"[Marlon Brando, Al Pacino]",1972,9.1
2,The Godfather: Part II,Francis Ford Coppola,"[Al Pacino, Robert De Niro]",1974,9.0
3,The Dark Knight,Christopher Nolan,"[Christian Bale, Heath Ledger]",2008,9.0
4,12 Angry Men,Sidney Lumet,"[Henry Fonda, Lee J. Cobb]",1957,8.9
5,Schindler's List,Steven Spielberg,"[Liam Neeson, Ralph Fiennes]",1993,8.9
6,The Lord of the Rings: The Return of the King,Peter Jackson,"[Elijah Wood, Viggo Mortensen]",2003,8.9
7,Pulp Fiction,Quentin Tarantino,"[John Travolta, Uma Thurman]",1994,8.8
8,"The Good, the Bad and the Ugly",Sergio Leone,"[Clint Eastwood, Eli Wallach]",1966,8.8
9,The Lord of the Rings: The Fellowship of the Ring,Peter Jackson,"[Elijah Wood, Ian McKellen]",2001,8.8


In [101]:
def search_queries(recipes):
    """
    Summarise every recipe element as a dictionary, print the summaries and return them.

    Parameters
    ----------
    recipes : iterable of XML elements
        Each element needs a ``title`` child, a ``nutrition`` child with a
        ``calories`` attribute, and any number of direct ``ingredient`` children
        with a ``name`` attribute.

    Returns
    -------
    list of dict
        One ``{"title": ..., "ingredients": [...], "calories": ...}`` dictionary
        per recipe, in input order. ``calories`` is the raw attribute value; it is
        not converted to a number. Only direct ``ingredient`` children are listed.

    Raises
    ------
    AttributeError
        If a recipe has no ``title`` or no ``nutrition`` child.
    KeyError
        If ``nutrition`` has no ``calories`` attribute or an ingredient has no ``name``.
    """

    ## Building one dictionary per recipe; the key order is kept so the printed output stays the same.
    values = [
        {
            "title": recipe.find("title").text,
            "ingredients": [
                ingredient.attrib["name"] for ingredient in recipe.findall("ingredient")
            ],
            "calories": recipe.find("nutrition").attrib["calories"],
        }
        for recipe in recipes
    ]

    ## Printing the values to understand the overall data.
    print(values)

    ## Returning the values as well, so that callers can reuse them instead of only reading the printout.
    return values

    
## The function above is reused by the queries in the code below.
## Wrapping the logic in a function makes it easy to call it again later.



    
## Reading the xml file with lxml package
tree = etree.parse('recipes.xml')
recipes = tree.xpath('/collection/recipe')

# query 1
search_queries(recipes)
print('\n')

# Find the titles of all recipes.
# The recipe elements selected above are reused instead of running the same XPath again.
titles = [t.find("title").text for t in recipes]
print(titles)
print('\n')

# Find the titles of recipes that use olive oil.
titles = [t.find("title").text for t in tree.xpath(
        '/collection/recipe[ingredient[@name="olive oil"]]')]
print(titles)
print('\n')

# Find the titles of all recipes with less than 500 calories.
# XPath converts the attribute value to a number for the "<" comparison.
titles = [t.find("title").text for t in tree.xpath(
        '/collection/recipe[nutrition[@calories<500]]')]
print(titles)
print('\n')

# Find the amount of sugar needed for Zuppa Inglese.
titles = [t.attrib.get("amount") for t in tree.xpath(
        '/collection/recipe[title="Zuppa Inglese"]/ingredient[@name="sugar"]')]
print(titles)
print('\n')


# Find the titles of all recipes that require 4 steps.
titles = [t.find("title").text for t in tree.xpath(
        '/collection/recipe[count(preparation/step)=4]')]
print(titles)
print('\n')

# Find the names of all ingredients that are used to make other ingredients.
# Any ingredient nested inside another ingredient qualifies, at any depth; the previous
# "count(ingredient)>1" test skipped parents that have exactly one sub-ingredient.
titles = [t.attrib.get("name") for t in tree.xpath(
        '/collection/recipe//ingredient/ingredient')]
print(titles)
print('\n')

# Find the names of all ingredients for which you need other ingredients.
# The predicate [ingredient] is true as soon as there is at least one nested ingredient.
titles = [t.attrib.get("name") for t in tree.xpath(
        '/collection/recipe//ingredient[ingredient]')]
print(titles)
print('\n')

# Find the names of the first three ingredients in each recipe.
# position() starts at 1, so "<= 3" is needed to keep three ingredients ("< 3" kept only two).
titles = [t.attrib.get("name") for t in tree.xpath(
        '/collection/recipe/ingredient[position() <= 3]')]
print(titles)


[{'title': 'Beef Parmesan with Garlic Angel Hair Pasta', 'ingredients': ['beef cube steak', 'onion, sliced into thin rings', 'green bell pepper, sliced in rings', 'Italian seasoned bread crumbs', 'grated Parmesan cheese', 'olive oil', 'spaghetti sauce', 'shredded mozzarella cheese', 'angel hair pasta', 'minced garlic', 'butter'], 'calories': '1167'}, {'title': 'Ricotta Pie', 'ingredients': ['filling', 'dough', 'milk'], 'calories': '349'}, {'title': 'Linguine Pescadoro', 'ingredients': ['linguini pasta', 'sauce'], 'calories': '532'}, {'title': 'Zuppa Inglese', 'ingredients': ['egg yolks', 'milk', 'Savoiardi biscuits', 'sugar', 'Alchermes liquor', 'lemon zest', 'flour', 'fresh whipping cream'], 'calories': '612'}, {'title': 'Cailles en Sarcophages', 'ingredients': ['pastry', 'filling', 'package phyllo dough', 'egg whites, lightly beaten'], 'calories': '8892'}]


['Beef Parmesan with Garlic Angel Hair Pasta', 'Ricotta Pie', 'Linguine Pescadoro', 'Zuppa Inglese', 'Cailles en Sarcophages']


In [102]:
def get_direction_data(start, end, key = None):
    """
    Request a route from the MapQuest Open Directions endpoint and tabulate its maneuvers.

    Parameters
    ----------
    start : str
        Origin of the route, sent as the ``from`` query parameter.
    end : str
        Destination of the route, sent as the ``to`` query parameter.
    key : str, optional
        API key, sent as the ``key`` query parameter. When it is ``None`` the
        function prints a reminder and returns ``None`` without any request.

    Returns
    -------
    pandas.DataFrame or None
        One row per maneuver over all legs of the route, with the columns
        ``instruction`` (``narrative``), ``distance (mi)`` (``distance``) and
        ``time (s)`` (``time``). The frame is empty, but keeps its columns, when
        the route has no maneuvers. ``None`` when no key was given.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    KeyError
        If the JSON answer has no ``route``/``legs``/``maneuvers`` entries or a
        maneuver lacks one of the three fields.
    """

    ## Without a key no request is made; the user is asked to provide one.
    if key is None:
        print("Please provide the API key")
        return

    ## The query values are handed over as params so that requests url-encodes them;
    ## an address containing "&" or "#" can then no longer break the query string.
    ## NOTE(review): the endpoint is plain http, so the key travels unencrypted - consider https.
    url = "http://open.mapquestapi.com/directions/v2/route"
    query = {"key": key, "from": start, "to": end}

    ## The timeout avoids waiting forever; raise_for_status turns HTTP errors into exceptions.
    result = requests.get(url, params = query, timeout = 30)
    result.raise_for_status()

    ## Converting the response body into a python dictionary.
    json_data = result.json()

    ## Collecting one (narrative, distance, time) tuple per maneuver across all legs.
    ## The rows are built once, after which the dataframe is created a single time;
    ## this also works when there are no maneuvers at all.
    rows = [
        (maneuver["narrative"], maneuver["distance"], maneuver["time"])
        for leg_information in json_data['route']['legs']
        for maneuver in leg_information['maneuvers']
    ]

    ## Returning the dataframe built from the collected rows.
    return pd.DataFrame(rows, columns = ['instruction', 'distance (mi)', 'time (s)'])

In [103]:
## Let us now add the starting and ending values along with the API call. 

In [104]:
Start = "1203 Boylston Street"
End = "Natick Mall"
## The API key is read from the MAPQUEST_API_KEY environment variable so that the secret
## is never stored in the notebook. Key is None when the variable is not set.
## NOTE: a key that was ever committed in a notebook should be revoked and replaced.
Key = os.environ.get("MAPQUEST_API_KEY")

In [105]:
## Requesting the route for the current Start/End pair; the result is kept
## in a variable so that it can be displayed afterwards.
direction_dataframe = get_direction_data(Start, End, key = Key)

In [106]:
## Displaying the first ten rows of the directions dataframe.
direction_dataframe.head(10)

Unnamed: 0,instruction,distance (mi),time (s)
0,Start out going east on SR 9 E/Boylston St.,0.019,2
1,Turn left onto Lee St.,0.009,16
2,Take SR 9 W.,12.924,1094
3,Turn slight right onto Natick Mall Rd. Pass th...,0.409,68
4,"Welcome to NATICK, MA 01760.",0.0,0


In [107]:
Start = "Northeastern University"
End = "Faneuil Hall Marketplace"
## The API key is read from the MAPQUEST_API_KEY environment variable so that the secret
## is never stored in the notebook. Key is None when the variable is not set.
## NOTE: a key that was ever committed in a notebook should be revoked and replaced.
Key = os.environ.get("MAPQUEST_API_KEY")

In [108]:
## Requesting the route for the current Start/End pair; the result is kept
## in a variable so that it can be displayed afterwards.
direction_dataframe = get_direction_data(Start, End, key = Key)

In [109]:
## Displaying up to the first hundred rows of the directions dataframe.
direction_dataframe.head(100)

Unnamed: 0,instruction,distance (mi),time (s)
0,Start out going northeast toward Gainsborough St.,0.199,52
1,Turn left onto Gainsborough St.,0.105,35
2,Turn right onto SR 9 E/Huntington Ave. Continu...,0.826,81
3,SR 9 E becomes Stuart St.,0.605,95
4,Turn left onto Washington St.,0.108,25
5,Turn right onto Essex St.,0.235,47
6,Merge onto I-93/US-1/SR 3/John FFitzgerald Expy.,0.773,79
7,Take EXIT 23 toward Government Center.,0.181,30
8,Turn left onto John FFitzgerald Surface Rd.,0.091,13
9,Turn right onto Clinton St.,0.051,6
