## Node Similarity Notebook

- Import CSV files:
    - Target prospects, which contains potential customers of interest.
    - Happiest customers, which gives examples of exemplary customers for comparison.
- Perform data cleaning, identify the columns of interests and the possible different categories for each.

In [11]:
import pandas as pd
import numpy as np
import neo4j

Important fields in new data:

Segment
	Allied Health
	Beauty Wellness
	Career Trade
	
School Type
	Private for-profit
	
Total Student Count
	150-10000
	
LMS
	From the screenshot Sophie provided

In [12]:
prospects = pd.read_csv('../data/target_prospects.csv')

print('Segment categories for prospect schools:\n')
print(prospects['Segment'].value_counts())

print('\n\nSchool types for prospect schools:\n')
print(prospects['School Type (picklist)'].value_counts())

print('\n\nSummary of "Total Students" column:\n')
print(prospects['Total Students'].dropna().str.replace(',','').astype(int).describe())

print('\n\nLMS types for prospect schools:\n')
print(prospects['LMS (multi-select)'].value_counts())

Segment categories for prospect schools:

Beauty + Wellness    291
Allied Health        175
Career + Trade        98
University             1
Community College      1
Primary Education      1
Name: Segment, dtype: int64


School types for prospect schools:

Private for-profit    422
Public                 41
Not-for-profit         40
Community College      22
Name: School Type (picklist), dtype: int64


Summary of "Total Students" column:

count      509.000000
mean      1060.589391
std       3458.835196
min          0.000000
25%        127.000000
50%        300.000000
75%        621.000000
max      51976.000000
Name: Total Students, dtype: float64


LMS types for prospect schools:

PivotPoint                                115
Canvas                                    110
Moodle                                     36
Blackboard                                 19
Milady                                     15
D2L                                        12
None                            

In [13]:
# Load happiest_customers.csv
happiest = pd.read_csv('../data/happiest_customers.csv')

# Save Record IDs of happiest customers to discern them later
happiest_records = happiest['Record ID']

# Merge datasets by concatenating
all_schools = pd.concat([happiest, prospects], ignore_index=True)

# Add a new column to capture whether a school has more or less than 150 students
all_schools['Total Students'] = all_schools['Total Students'].dropna().str.replace(',','').astype(int)
all_schools['total_students_gt_150'] = [student_count >= 150 for student_count in all_schools['Total Students']]

print('Elements in prospects:', len(prospects))
print('Elements in happiest:', len(happiest))
print('Combined length:', len(all_schools))

Elements in prospects: 571
Elements in happiest: 51
Combined length: 622


In [14]:
# neo4j setup
driver = neo4j.GraphDatabase.driver(uri="neo4j://neo4j:7687", auth=("neo4j","ucb_mids_w205"))
session = driver.session(database="neo4j")

# Functions for interacting with and building graph databases, from labs

def my_neo4j_wipe_out_database():
    "wipe out database by deleting all nodes and relationships"
    
    query = "match (node)-[relationship]->() delete node, relationship"
    session.run(query)
    
    query = "match (node) delete node"
    session.run(query)
def my_neo4j_run_query_pandas(query, **kwargs):
    "run a query and return the results in a pandas dataframe"
    
    result = session.run(query, **kwargs)
    
    df = pd.DataFrame([r.values() for r in result], columns=result.keys())
    
    return df
def my_neo4j_number_nodes_relationships():
    "print the number of nodes and relationships"
   
    
    query = """
        match (n) 
        return n.name as node_name, labels(n) as labels
        order by n.name
    """
    
    df = my_neo4j_run_query_pandas(query)
    
    number_nodes = df.shape[0]
    
    
    query = """
        match (n1)-[r]->(n2) 
        return n1.name as node_name_1, labels(n1) as node_1_labels, 
            type(r) as relationship_type, n2.name as node_name_2, labels(n2) as node_2_labels
        order by node_name_1, node_name_2
    """
    
    df = my_neo4j_run_query_pandas(query)
    
    number_relationships = df.shape[0]
    
    print("-------------------------")
    print("  Nodes:", number_nodes)
    print("  Relationships:", number_relationships)
    print("-------------------------")
def my_neo4j_create_node(node_name):
    "create a node with label Name"
    
    query = """
    
    CREATE (:Name {name: $node_name})
    
    """
    
    session.run(query, node_name=node_name)
def my_neo4j_create_relationship_one_way(from_node, to_node, weight):
    "create a relationship one way between two nodes with a weight"
    
    query = """
    
    MATCH (from:Name), 
          (to:Name)
    WHERE from.name = $from_node and to.name = $to_node
    CREATE (from)-[:LINK {weight: $weight}]->(to)
    
    """
    
    session.run(query, from_node=from_node, to_node=to_node, weight=weight)
def my_neo4j_create_relationship_two_way(from_node, to_node, weight):
    "create relationships two way between two nodes with a weight"
    
    query = """
    
    MATCH (from:Name), 
          (to:Name)
    WHERE from.name = $from_node and to.name = $to_node
    CREATE (from)-[:LINK {weight: $weight}]->(to),
           (to)-[:LINK {weight: $weight}]->(from)
    
    """
    
    session.run(query, from_node=from_node, to_node=to_node, weight=weight)

In [15]:
# Make graph database

# Delete the database if it already exists
my_neo4j_wipe_out_database()

# Make a node for each school and each unique category
# ex: my_neo4j_create_node('Alaska Career College - Anchorage')

# Add a node for each unique school name
school_names = set(all_schools['Company name'])
for school in school_names:
    #print(school)
    my_neo4j_create_node(school)

# Create a category node for each Segment, exclude nans
segments = set(all_schools['Segment'].dropna())
for segment in segments:
    if segment != np.nan:
        #print(segment)
        my_neo4j_create_node(segment)

# Create a category node for each School Type, exclude nans
school_types = set(all_schools['School Type (picklist)'].dropna())
for school_type in school_types:
    #print(school_type)
    my_neo4j_create_node(school_type)

# Create a category node for greater than or less than 150 students
my_neo4j_create_node('gte_150_students')
my_neo4j_create_node('lt_150_students')

# Create a category node for each LMS, exclude nans
# Can be multiple values, split on ; and ,
lmss = set(all_schools['LMS (multi-select)'].dropna())

all_lms_values = []
for lms in lmss:
    split_values = lms.replace(';',',').split(',')
    all_lms_values.extend(split_values)

final_lms_values = []
for lms in list(set(all_lms_values)):
    #print(list(set(all_lms_values)))
    lms_norm = lms.strip()
    if lms_norm != np.nan and lms_norm not in final_lms_values:
        my_neo4j_create_node(lms_norm)
        final_lms_values.append(lms_norm)
        
for lms in sorted(final_lms_values):
    print(lms)

Blackboard
Blue Triangle
BrightSpace by D2L
Brighthouse
Brightspace
CLIC
CLIC ( Proprietary)
CLIC (Certified Learing in Education)
Canvas
Cengage
Clic
Clic/Homegrown
Clic/Proprietary
College Office
Custom
D2L
Ellucian
Evolve Elsevier
Google Classroom
Google Classroom/Schoololgy
Google classroom
Home grown
Jupiter
Jupiter Ed
KlassApp
Klassapp
LA Film Online
LERN
LearnAveda
MS365
Microsoft
Microsoft Office 365
Microsoft Teams
Milady
Moodle
NEO
None
Open LMS (A Moodle Product)
Open LMS- A moodle product
Other
PivotPoint
Populi
Powerschool
Proctorio - https://proctorio.com/
SAP Netweaver
Sakai
Schoology
Stack Blue
ThinkWave
Toni &Guy
bright space
googlesuit
webex
zoom


In [16]:
# Create edges between school nodes and category nodes
#my_neo4j_create_relationship_one_way(school, category, weight)

for i, school in all_schools.iterrows():
    name = school['Company name']
    segment = school['Segment']
    school_type = school['School Type (picklist)']
    student_gt_150 = school['total_students_gt_150']
    lms = school['LMS (multi-select)']

    # Add edges from school to segment
    if not pd.isna(segment):
        #pass
        #print(name + ' -> ' + segment)
        my_neo4j_create_relationship_one_way(name, segment, 0)

    # Add edges from school to type
    if not pd.isna(school_type):
        #pass
        #print(name + ' -> ' + school_type)
        my_neo4j_create_relationship_one_way(name, school_type, 0) 
    
    # Add edges from school to nodes indicating greater than or equal to 150 students
    #  or less than 150 students
    if student_gt_150:
        #pass
        #print(name + ' -> ' + 'gte_150_students')
        my_neo4j_create_relationship_one_way(name, 'gte_150_students', 0)
    else:
        #pass
        #print(name + ' -> ' + 'lt_150_students')
        my_neo4j_create_relationship_one_way(name, 'lt_150_students', 0)

    # Add edges from school to each LMS system they support
    if not pd.isna(lms):
        all_lms_values = lms.replace(';',',').split(',')

        for val in all_lms_values:
            #pass
            #print(name + ' -> ' + val.strip())
            my_neo4j_create_relationship_one_way(name, val.strip(), 0)

In [17]:
# Node Similarity

def my_neo4j_node_similarity():
    """
    Run the node similarity algorithm on the school graph
    """
    
    # Drop existing projection
    query = "CALL gds.graph.drop('school_graph', false)"
    session.run(query)

    # Create new in-memory projection
    query = """
    CALL gds.graph.project(
        'school_graph',
        'Name', 
        'LINK',
        {relationshipProperties: 'weight'}
    )
    """
    session.run(query)

    # Run the node similarity algorithm
    query = """
    CALL gds.nodeSimilarity.stream('school_graph', {similarityCutoff: 0.25})
    YIELD node1, node2, similarity
    RETURN
        gds.util.asNode(node1).name AS from,
        gds.util.asNode(node2).name AS to,
        similarity
    ORDER BY similarity DESCENDING, from, to

    """
    result = session.run(query)
    
    sim_data = [r.data() for r in result]
    sim_data_df = pd.DataFrame(sim_data)
    
    return sim_data_df
    
#     for r in result:
#         print(f"{r['from']} - {r['to']}: {r['similarity']}")
        
ns_df = my_neo4j_node_similarity()

# Look at similarities to some of the best customers
#for company in happiest['Company name']:
for i, company in happiest.iterrows():
    name = company['Company name']
    segment = company['Segment']
    school_type = company['School Type (picklist)']
    lms = company['LMS (multi-select)']
    
    
    # build dataframe of companies compared to this exemplar
    comp_compare = ns_df[ns_df['to'] == name]
    
    if len(comp_compare) == 0:
        continue
    
    print(f"{name} ({segment} | {school_type} | {lms})")
    
    # Print the stats for each similar school to illustrate the similarities/differences
    for i, row in comp_compare.iterrows():
        comp_name = row['from']
        comp_similarity = row['similarity']
        
        comp_row = all_schools.loc[all_schools['Company name'] == comp_name]
        segment = comp_row['Segment'].iloc[0]
        school_type = comp_row['School Type (picklist)'].iloc[0]
        lms = comp_row['LMS (multi-select)'].iloc[0]
        more_than_150 = comp_row['total_students_gt_150'].iloc[0]
        
        print(f"\t({comp_similarity}) {comp_name}")
        print(f"\t\t{segment}")
        print(f"\t\t{school_type}")
        print(f"\t\t{lms}")
        print(f"\t\t{more_than_150}")
    print()

Mildred Elley - Corporate Office (Allied Health | Private for-profit | Moodle)
	(1.0) American College of Healthcare and Technology - Corporate Office
		Allied Health
		Private for-profit
		Moodle
		True
	(1.0) American University of Health Sciences - Hill
		Allied Health
		Private for-profit
		Moodle
		True
	(1.0) Atlantis University - Miami
		Allied Health
		Private for-profit
		Moodle
		True
	(1.0) CCI Training Center - Arlington
		Allied Health
		Private for-profit
		Moodle
		True
	(1.0) Dawn Career Institute LLC - Newark
		Allied Health
		Private for-profit
		Moodle
		True
	(1.0) Five Branches University - Santa Cruz
		Allied Health
		Private for-profit
		Moodle
		True
	(1.0) Florida Academy of Nursing
		Allied Health
		Private for-profit
		Moodle
		True
	(1.0) Florida Education Institute - Miami
		Allied Health
		Private for-profit
		Moodle
		True
	(1.0) Fox College - Beedford Park
		Allied Health
		Private for-profit
		Moodle
		True
	(1.0) Infinity College - Lafayette
		Allied H

		True
	(1.0) South Hills School of Business & Technology - College
		Allied Health
		Private for-profit
		Canvas
		True
	(1.0) Southern California Health Institute - Hollywood
		Allied Health
		Private for-profit
		Canvas
		True
	(1.0) Southwest Institute of Healing Arts - Tempe
		Allied Health
		Private for-profit
		Canvas
		True
	(1.0) St Paul's School of Nursing - Corporate Office
		Allied Health
		Private for-profit
		Canvas
		True
	(1.0) Summit College - San Bernardino
		Allied Health
		Private for-profit
		Canvas
		True
	(1.0) Swedish Institute a College of Health Sciences - New York
		Allied Health
		Private for-profit
		Canvas
		True
	(1.0) Vet Tech Institute - Corporate Office
		Allied Health
		Private for-profit
		Canvas
		True
	(1.0) West Coast Ultrasound Institute, School of Imaging - Corporate
		Allied Health
		Private for-profit
		Canvas
		True
	(1.0) West Virginia Junior College - Corporate Office
		Allied Health
		Private for-profit
		Canvas
		True
	(1.0) Wichita Techn

		Milady
		True
	(1.0) Elaine Sterling Institute - Atlanta
		Beauty + Wellness
		Private for-profit
		Milady
		True
	(1.0) Houston Barber School - Corporate
		Beauty + Wellness
		Private for-profit
		Milady
		True
	(1.0) Houston Training School - Corporate Office
		Beauty + Wellness
		Private for-profit
		Milady
		True
	(1.0) International School of Skin Nailcare & Massage Therapy - Atlanta
		Beauty + Wellness
		Private for-profit
		Milady
		True
	(1.0) Paul Mitchell the School - Arlington
		Beauty + Wellness
		Private for-profit
		Milady
		True
	(1.0) Paul Mitchell the School - Dallas
		Beauty + Wellness
		Private for-profit
		Milady
		True
	(1.0) Paul Mitchell the School - Denver
		Beauty + Wellness
		Private for-profit
		Milady
		True
	(1.0) Paul Mitchell the School - Fresno
		Beauty + Wellness
		Private for-profit
		Milady
		True
	(1.0) Paul Mitchell the School - Memphis
		Beauty + Wellness
		Private for-profit
		Milady
		True
	(1.0) Paul Mitchell the School - Michigan
		Beauty + W

	(1.0) ESI - Westminster
		Beauty + Wellness
		Private for-profit
		Milady
		False
	(1.0) Paul Mitchell the School - Arkansas
		Beauty + Wellness
		Private for-profit
		Milady
		False
	(1.0) Paul Mitchell the School - Grand Rapids
		Beauty + Wellness
		Private for-profit
		Milady
		False
	(1.0) Paul Mitchell the School - Great Lakes
		Beauty + Wellness
		Private for-profit
		Milady
		False
	(1.0) Paul Mitchell the School - Madison
		Beauty + Wellness
		Private for-profit
		Milady
		False
	(1.0) Paul Mitchell the School - Miami
		Beauty + Wellness
		Private for-profit
		Milady
		False
	(1.0) Paul Mitchell the School - Phoenix
		Beauty + Wellness
		Private for-profit
		Milady
		False
	(1.0) The Salon Professional Academy - Altoona
		Beauty + Wellness
		Private for-profit
		Milady
		False
	(1.0) The Salon Professional Academy - Cedar Falls
		Beauty + Wellness
		Private for-profit
		Milady
		False
	(1.0) The Salon Professional Academy - Colorado Springs
		Beauty + Wellness
		Private for-pr

		False
	(1.0) O'Briens Aveda Institute - Williston
		Beauty + Wellness
		Private for-profit
		PivotPoint
		False
	(1.0) Paul Mitchell The School - Norman
		Beauty + Wellness
		Private for-profit
		PivotPoint
		False
	(1.0) Paul Mitchell the School - Delaware
		Beauty + Wellness
		Private for-profit
		PivotPoint
		False
	(1.0) Paul Mitchell the School - Lombard
		Beauty + Wellness
		Private for-profit
		PivotPoint
		False
	(1.0) Paul Mitchell the School - Merrillville
		Beauty + Wellness
		Private for-profit
		PivotPoint
		False
	(1.0) Paul Mitchell the School - Nampa
		Beauty + Wellness
		Private for-profit
		PivotPoint
		False
	(1.0) Summit Salon Academy - Gainesville
		Beauty + Wellness
		Private for-profit
		PivotPoint
		False
	(1.0) Summit Salon Academy - Kokomo
		Beauty + Wellness
		Private for-profit
		PivotPoint
		False
	(1.0) Summit Salon Academy - Lexington
		Beauty + Wellness
		Private for-profit
		PivotPoint
		False
	(1.0) Summit Salon Academy - Portland
		Beauty + Wellness

In [18]:
# Build a dataframe that lists every prospect school and their similarity score relative to each happiest customer

scores = {}

# Generate lists of happiest and prospect company names
happiest_set = set(happiest['Company name'])
prospect_set = set(prospects['Company name'])

# Initialize all prospect schools with default value
DEFAULT_SCORE = 0.25

for prospect in prospect_set:
    scores[prospect] = DEFAULT_SCORE

# For each comparison to an ideal school, record the score if greater than existing (max function)
for _, similarity in ns_df.iterrows():
    comp_from = similarity['from']
    comp_to = similarity['to']
    similarity_score = similarity['similarity']
    if comp_from in prospect_set and comp_to in happiest_set:
        if similarity_score > scores[comp_from]:
            scores[comp_from] = similarity_score

scores_df = pd.DataFrame(list(scores.items()), columns=['School', 'Score'])
scores_df.to_csv('../data/school_similarity_scores.csv', index=False)