# Part I - Search
## Project 1a - Degrees

[Course Link](https://cs50.harvard.edu/ai/)

[Project Instructions](https://cs50.harvard.edu/ai/projects/0/degrees/)

## Instructions

In this problem, we’re interested in finding the shortest path between any two actors by choosing a sequence of movies that connects them. For example, the shortest path between Jennifer Lawrence and Tom Hanks is 2: Jennifer Lawrence is connected to Kevin Bacon by both starring in “X-Men: First Class,” and Kevin Bacon is connected to Tom Hanks by both starring in “Apollo 13.”

We can frame this as a search problem: our states are people. Our actions are movies, which take us from one actor to another (it’s true that a movie could take us to multiple different actors, but that’s okay for this problem). Our initial state and goal state are defined by the two people we’re trying to connect. By using breadth-first search, we can find the shortest path from one actor to another.

In [1]:
import time
import random
import csv
import sys
import pandas as pd
import numpy as np
from util import Node, StackFrontier, QueueFrontier

In [13]:
# %load degrees.py

# Module-level lookup tables. load_data() fills them in place; the lookup
# and search functions below read them.

# Maps lowercased names to a set of corresponding person_ids
# (a set, because several people can share one name)
names = {}

# Maps person_ids to a dictionary of: name, birth, movies (a set of movie_ids)
people = {}

# Maps movie_ids to a dictionary of: title, year, stars (a set of person_ids)
movies = {}

def load_data(directory):
    """
    Fill the module-level ``names``, ``people`` and ``movies`` dictionaries
    from the CSV files found in ``directory``.

    Reads ``people.csv`` (id, name, birth), ``movies.csv`` (id, title, year)
    and ``stars.csv`` (person_id, movie_id). All values are stored as the
    strings read from the files. Nothing is returned; the three dictionaries
    are updated in place.
    """
    # People: one record per id, plus a lowercase-name -> ids index so that
    # lookups are case-insensitive and shared names keep every id
    with open(f"{directory}/people.csv", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            people[row["id"]] = {
                "name": row["name"],
                "birth": row["birth"],
                "movies": set(),
            }
            names.setdefault(row["name"].lower(), set()).add(row["id"])

    # Movies
    with open(f"{directory}/movies.csv", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            movies[row["id"]] = {
                "title": row["title"],
                "year": row["year"],
                "stars": set(),
            }

    # Stars: link people and movies in both directions
    with open(f"{directory}/stars.csv", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            person = people.get(row["person_id"])
            movie = movies.get(row["movie_id"])
            # Skip links that mention an unknown person or movie. Both sides
            # are checked before either is changed, so a person never ends up
            # holding a movie id that is missing from ``movies``.
            if person is None or movie is None:
                continue
            person["movies"].add(row["movie_id"])
            movie["stars"].add(row["person_id"])


def main(dir_size, src, tgt):
    """
    Load the dataset, search for the shortest connection between two people
    and print it one movie per line.

    dir_size: directory that holds people.csv, movies.csv and stars.csv
    src, tgt: names of the start and goal person

    Exits through sys.exit("Person not found.") when either name cannot be
    resolved to a person id.
    """
    load_data(dir_size)
    print('Data Loaded, Now Searching.......')

    source = person_id_for_name(src)
    if source is None:
        sys.exit("Person not found.")
    target = person_id_for_name(tgt)
    if target is None:
        sys.exit("Person not found.")

    path = shortest_path(source, target)

    if path is None:
        print(f'No Connection Between {people[source]["name"]} and {people[target]["name"]}')
        return

    # shortest_path returns two parallel lists: the movie for each step and
    # the person that step arrives at
    movie_ids, person_ids = path
    degrees = len(movie_ids)
    print(f'Start: {people[source]["name"]} {source}')
    print(f'Goal:  {people[target]["name"]} {target}')
    print(f'\n{degrees} degrees of separation')
    print('-------------------------------------------------------------------------')

    # Each step links the previously reached person to the next one. When
    # source and target are the same person both lists are empty, so no
    # step lines are printed.
    previous = source
    for step, (movie_id, person_id) in enumerate(zip(movie_ids, person_ids), start=1):
        person1 = people[previous]["name"]
        person2 = people[person_id]["name"]
        movie = movies[movie_id]["title"]
        print(f"{step}: {person1} and {person2} starred in {movie}")
        previous = person_id

            
def _path_to(node):
    """Return the (actions, states) lists leading from the start node to node."""
    actions = []
    states = []
    # Walk parent links back to the start node; the start node has no
    # parent and contributes no step of its own
    while node.parent is not None:
        actions.append(node.action)
        states.append(node.state)
        node = node.parent
    actions.reverse()
    states.reverse()
    return actions, states


def _report_solution(message, frontier, num_explored):
    """Print the summary lines shown whenever a solution is found."""
    print(message)
    # Show the frontier's class name rather than its default object repr
    print(f'Algorithm Used:  {type(frontier).__name__}\n')
    print(f'Total Nodes Explored: {num_explored}\n')


def shortest_path(source, target):
    """
    Search outward from source, using a QueueFrontier, for a chain of
    movies that connects it to target.

    Returns a tuple (movie_ids, person_ids) of two parallel lists: entry i
    is the movie taken at step i and the person that step arrives at, so
    the last person is the target. Both lists are empty when source and
    target are the same person.

    If no possible path, returns None.
    """
    # Keep track of number of states explored
    num_explored = 0

    # Start with a frontier holding only the source and nothing explored
    frontier = QueueFrontier()
    frontier.add(Node(state=source, parent=None, action=None))
    explored = set()

    while True:
        # Nothing left to expand: the target cannot be reached
        if frontier.empty():
            return None

        node = frontier.remove()

        # The target is returned as soon as it shows up among a node's
        # neighbors (below), so it is never put on the frontier; this
        # check only fires when the source itself is the target
        if target == node.state:
            print(f'THIS SHOULD MATCH: {target}, {node.state}')
            solution = _path_to(node)
            _report_solution(
                'Solution found, Node from frontier matched target!',
                frontier,
                num_explored,
            )
            return solution

        explored.add(node.state)
        num_explored += 1

        # Expand the node: every (movie, co-star) pair of the current person
        for action, state in neighbors_for_person(node.state):
            if target == state:
                # Path up to the current node, plus this final step
                actions, cells = _path_to(node)
                actions.append(action)
                cells.append(state)
                if node.parent is None:
                    message = 'Solution found, Source matched with target first pass!'
                else:
                    message = 'Solution found, Neighbor matched target before frontier!'
                _report_solution(message, frontier, num_explored)
                return actions, cells

            # Skip people already expanded or already waiting; the set
            # lookup runs first so the frontier is only consulted when needed
            if state in explored or frontier.contains_state(state):
                continue
            frontier.add(Node(state=state, parent=node, action=action))
                
                
def neighbors_for_person(person_id):
    """
    Collect the (movie_id, person_id) pairs reachable from a person: one
    pair for every star of every movie the person is recorded in.

    The result is a set. The person is paired with their own movies as
    well whenever they are listed among that movie's stars.
    """
    return {
        (movie_id, star_id)
        for movie_id in people[person_id]["movies"]
        for star_id in movies[movie_id]["stars"]
    }


def person_id_for_name(name):
    """
    Resolve a person's name (matched in lowercase) to an IMDB id.

    Returns None when the name is unknown. When several people share the
    name, the candidates are printed and the user is asked to type the id
    they mean; any answer that is not one of the listed ids gives None.
    """
    candidates = list(names.get(name.lower(), set()))

    if not candidates:
        return None
    if len(candidates) == 1:
        return candidates[0]

    # Ambiguous name: show every match and let the user pick
    print(f"Which '{name}'?")
    for candidate in candidates:
        details = people[candidate]
        print(f"ID: {candidate}, Name: {details['name']}, Birth: {details['birth']}")

    try:
        chosen = input("Intended Person ID: ")
    except ValueError:
        return None
    return chosen if chosen in candidates else None


# Running the Program

I first create a name list containing only actors from movies with more than 10,000 IMDB score votes, to limit obscure names.

In [14]:
# Keep the distinct 'actr1' names of movies with more than 10000 score votes
movies_df = pd.read_csv('large/movies_5.csv')
well_known = movies_df.loc[movies_df['num_scores'] > 10000, 'actr1']
name_list = well_known.unique().tolist()
print(len(name_list))

3091


In [15]:
# Draw a random pair of names from name_list
src = random.choice(name_list)
tgt = random.choice(name_list)

# Re-draw the target until it differs from the source
while src == tgt:
    tgt = random.choice(name_list)


# Test Metrics Results From Small Dataset Using StackFrontier()
# (uncomment one pair to reproduce that case)
# No Solution Error
#src = 'demi moore'
#tgt = 'emma watson' 
    
# 1 degree of separation
#src = 'jack nicholson'
#tgt = 'tom cruise' 
    
# 2 degrees of separation
#src = 'demi moore'
#tgt = 'tom hanks'
    
# 3 degrees of separation
#src = 'mandy patinkin'
#tgt = 'bill paxton'
    
# 4 degrees of separation
#src = 'demi moore'
#tgt = 'mandy patinkin'

# NOTE(review): this hard-coded pair replaces the random draw above on
# every run; comment it out to search for a random pair instead
src = 'donnie wahlberg'
tgt = 'max thieriot'

print(src)
print(tgt)
print()

# The timed span covers main() end to end, which includes load_data()
start = time.time()

main('large',src, tgt)

print(time.time() - start)

donnie wahlberg
max thieriot

Data Loaded, Now Searching.......
Solution found, Neighbor matched target before frontier!
Algorithm Used:  <util.QueueFrontier object at 0x7feff8ed2c50>

Total Nodes Explored: 81

Start: Donnie Wahlberg 5531
Goal:  Max Thieriot 1302735

3 degrees of separation
-------------------------------------------------------------------------
1: Donnie Wahlberg and Michelle Forbes starred in Bullfighter
2: Michelle Forbes and Gil Bellows starred in Black Day Blue Night
3: Gil Bellows and Max Thieriot starred in House at the End of the Street
21.45950150489807


In [7]:
names

{'kevin bacon': {'102', '311'},
 'tom cruise': {'129'},
 'cary elwes': {'144'},
 'tom hanks': {'158'},
 'mandy patinkin': {'1597'},
 'dustin hoffman': {'163'},
 'chris sarandon': {'1697'},
 'demi moore': {'193'},
 'jack nicholson': {'197'},
 'bill paxton': {'200'},
 'sally field': {'398'},
 'valeria golino': {'420'},
 'gerald r. molen': {'596520'},
 'gary sinise': {'641'},
 'robin wright': {'705'},
 'emma watson': {'914612'}}

In [8]:
people

{'102': {'name': 'Kevin Bacon',
  'birth': '1958',
  'movies': {'104257', '112384'}},
 '129': {'name': 'Tom Cruise', 'birth': '1962', 'movies': {'104257', '95953'}},
 '144': {'name': 'Cary Elwes', 'birth': '1962', 'movies': {'93779'}},
 '158': {'name': 'Tom Hanks', 'birth': '1956', 'movies': {'109830', '112384'}},
 '1597': {'name': 'Mandy Patinkin', 'birth': '1952', 'movies': {'93779'}},
 '163': {'name': 'Dustin Hoffman', 'birth': '1937', 'movies': {'95953'}},
 '1697': {'name': 'Chris Sarandon', 'birth': '1942', 'movies': {'93779'}},
 '193': {'name': 'Demi Moore', 'birth': '1962', 'movies': {'104257'}},
 '197': {'name': 'Jack Nicholson', 'birth': '1937', 'movies': {'104257'}},
 '200': {'name': 'Bill Paxton', 'birth': '1955', 'movies': {'112384'}},
 '398': {'name': 'Sally Field', 'birth': '1946', 'movies': {'109830'}},
 '420': {'name': 'Valeria Golino', 'birth': '1965', 'movies': {'95953'}},
 '596520': {'name': 'Gerald R. Molen', 'birth': '1935', 'movies': {'95953'}},
 '641': {'name': '

In [9]:
movies

{'112384': {'title': 'Apollo 13',
  'year': '1995',
  'stars': {'102', '158', '200', '641'}},
 '104257': {'title': 'A Few Good Men',
  'year': '1992',
  'stars': {'102', '129', '193', '197'}},
 '109830': {'title': 'Forrest Gump',
  'year': '1994',
  'stars': {'158', '398', '641', '705'}},
 '93779': {'title': 'The Princess Bride',
  'year': '1987',
  'stars': {'144', '1597', '1697', '705'}},
 '95953': {'title': 'Rain Man',
  'year': '1988',
  'stars': {'129', '163', '420', '596520'}},
 '77777': {'title': 'Ugh', 'year': '1966', 'stars': {'311'}}}

## Time comparison between pandas and the csv module for reading in the larger files

In [38]:
# Pandas Method 1
# Builds the name index from two aligned columns.
# NOTE: this rebinds the global `names` that person_id_for_name() reads.
start = time.time()

people_df = pd.read_csv('large/people.csv')
names_df = people_df.drop(columns='birth')
names_df['id'] = names_df['id'].astype(str)

# Collect every id per lowercased name so that people who share a name all
# stay reachable, matching the structure load_data() builds
names = {}
for lowered_name, person_id in zip(names_df['name'].str.lower(), names_df['id'].values):
    names.setdefault(lowered_name, set()).add(person_id)

print(time.time()-start)

5.675336122512817


In [39]:
# Pandas Method 2
# Builds the name index by iterating the (id, name) rows of the frame.
# NOTE: this rebinds the global `names` that person_id_for_name() reads.
tm = time.time()
names_df2 = pd.read_csv('large/people.csv')
names_df2 = names_df2.drop(columns='birth')

# Collect every id per lowercased name so that people who share a name all
# stay reachable, matching the structure load_data() builds
names = {}
for ids, name in names_df2.values:
    names.setdefault(name.lower(), set()).add(str(ids))
print(time.time() - tm)

5.723609447479248


In [25]:
# CSV Method
# Builds a lowercased-name -> set of ids index with the csv module alone

start = time.time()
names2 = {}

with open("large/people.csv", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    for row in reader:
        lowered_name = row["name"].lower()
        names2.setdefault(lowered_name, set()).add(row["id"])

print(len(names2))
print(time.time()-start)

978917
8.696442604064941
