# UMSI course recommender database

## Author: Chisheng Li

### 1) Output every pair of courses (Source Course and Target Course) and their Cosine Similarity scores to `allpairs.txt` 

In [1]:
import re
import math
from operator import itemgetter

# Shared lookup tables; the cells below fill them in.
enrolled = dict()      # course -> {student: 1} for each student seen in the course
numstudents = dict()   # course -> enrollment count (popularity filter and score denominator)
numincommon = dict()   # course1 -> {course2: students enrolled in both}
scores = dict()        # course1 -> {course2: cosine similarity}
titles = dict()        # presumably course -> title (from coursetitles.txt) -- TODO confirm

In [2]:
# Build `enrolled` and `numstudents` from the tab-separated enrollment file.
# Every row is unpacked as: student, graddate, spec, term, dept, courseno.
with open("courseenrollment.txt", "r") as enrollment_file:
    for line in enrollment_file:
        # Strip only the line terminator. The previous '\s\r\n' argument was
        # not a whitespace class: it removed trailing backslash and 's'
        # characters from the last field.
        line = line.rstrip('\r\n')
        if not line:
            # Tolerate blank lines instead of failing on the unpack below.
            continue
        (student, graddate, spec, term, dept, courseno) = line.split('\t')

        # Course key: department abbreviation, a space, then the course number.
        course = dept + ' ' + courseno

        roster = enrolled.setdefault(course, {})
        numstudents.setdefault(course, 0)

        # Count each student once per course so that numstudents agrees with
        # the distinct-student overlap used for the cosine similarity, and so
        # that re-running this cell does not inflate the counts.
        if student not in roster:
            roster[student] = 1
            numstudents[course] += 1

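This part calculates the cosine similarity for every pair of courses: the number of students the two courses have in common, divided by the square root of the product of the two courses' enrollment counts.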

In [3]:
# Fill `numincommon` and `scores` for every ordered pair of courses
# (including each course paired with itself).
for course1 in enrolled:
    roster1 = enrolled[course1]
    common_row = numincommon.setdefault(course1, {})
    score_row = scores.setdefault(course1, {})

    for course2 in enrolled:
        # Number of students enrolled in both courses. The value is assigned
        # rather than incremented in place, so re-running this cell yields the
        # same counts instead of adding to the previous run's totals.
        shared = sum(1 for student in enrolled[course2] if student in roster1)
        common_row[course2] = shared

        # Cosine similarity: shared students over the square root of the
        # product of the two enrollment counts. math.sqrt returns a float, so
        # the division is a true division on Python 2 as well.
        denominator = math.sqrt(numstudents[course1] * numstudents[course2])
        score_row[course2] = shared / denominator

In [4]:
# Read the tab-separated "course<TAB>title" file into `titles`.
with open("coursetitles.txt", "r") as titles_file:
    for line in titles_file:
        # Strip only the line terminator. The previous '\s\r\n' argument
        # removed trailing backslash and 's' characters, which truncated any
        # title ending in "s".
        line = line.rstrip('\r\n')
        if not line:
            # Tolerate blank lines instead of failing on the unpack below.
            continue
        (course, title) = line.split('\t')

        # Drop surrounding spaces and a trailing "-1" section suffix only;
        # a blanket replace('-1', "") would also eat "-1" inside the id.
        course = course.strip()
        if course.endswith('-1'):
            course = course[:-2].rstrip()

        # Underscores become spaces so the key has the same "DEPT NUMBER"
        # shape as the course keys built from the enrollment file.
        course2 = course.replace("_", " ")

        # Store the title; previously the normalised id was computed and then
        # discarded, leaving `titles` empty.
        titles[course2] = title

In [5]:
# Patterns for the skip (`continue`) checks applied while writing allpairs.txt.
# Both are applied with .match(), i.e. anchored at the start of the course key.
_SI_COURSE_PATTERN = r"^SI \d+.*"        # "SI " followed by at least one digit
_FOUNDATION_PATTERN = r"SI 50[01234].*"  # keys starting "SI 500" .. "SI 504"

si_re = re.compile(_SI_COURSE_PATTERN)
found_re = re.compile(_FOUNDATION_PATTERN)

Create a new text file called `allpairs.txt`

In [6]:
# Write one "source<TAB>target<TAB>score" row per qualifying course pair to
# allpairs.txt. The `with` block closes the file even if a lookup fails
# part-way through.
with open("allpairs.txt", 'w') as pairs_out:
    pairs_out.write("Source Course\tTarget Course\tCosine Similarity")

    for course1 in sorted(scores):
        # Skip source courses that were not sufficiently popular.
        if numstudents[course1] < 5:
            continue
        # Skip source courses that are not SI courses.
        if si_re.match(course1) is None:
            continue
        # Skip source courses matched by found_re (SI 500-504, the foundations).
        if found_re.match(course1):
            continue

        # Targets in descending order of similarity to course1.
        ranked = sorted(scores[course1].items(), key=itemgetter(1), reverse=True)
        for course2, score in ranked:
            # Skip the course paired with itself.
            if course2 == course1:
                continue
            # Skip targets matched by found_re (SI 500-504, the foundations).
            if found_re.match(course2):
                continue
            # Only keep targets that share at least one student with course1.
            if numincommon[course1][course2] < 1:
                continue

            # Rows start with "\n" so the file has no trailing newline.
            pairs_out.write("\n%s\t%s\t%s" % (course1, course2, score))

### 2) Create database to store course pairs
- Create a database `courseSmilarity.db` (spelled this way because it is the file name the query cell in section 3 opens) and create tables for the data in `allpairs.txt`
- Create queries to insert the data into `courseSmilarity.db` (the source class, the target class, and the cosine similarity value)

Source code: `createDB.py`

### 3) Query `courseSmilarity.db` for course pairs within certain ranges of cosine similarity
Output the course pairs in distinct ranges of cosine similarity:
- for values from 0 <= x <= 0.25 (`cosine0.25.txt`)
- for values from 0.25 < x <= 0.5 (`cosine0.25-0.5.txt`)
- for values from 0.5 < x <= 0.75 (`cosine0.5-0.75.txt`)
- for values from 0.75 < x <= 1 (`cosine0.75-1.txt`)

In [7]:
import sqlite3 as lite
import sys
# Connection handle; remains None until the query cell connects, so that
# cell's cleanup can tell whether there is anything to close.
con = None
# Concatenated "source+target" keys of the pairs already written, used by the
# query cell to skip a pair whose reverse was already emitted.
pairs = list()

In [9]:
# One (SQL query, output file, console heading) triple per similarity band.
_SCORE_RANGES = [
    ("select * from courses where score >= 0 and score <= 0.25",
     'cosine0.25.txt',
     "Course pairs with cosine similarity score >= 0 and <= 0.25"),
    ("select * from courses where score > 0.25 and score <= 0.5",
     'cosine0.25-0.5.txt',
     "Course pairs with cosine similarity score > 0.25 and <= 0.5"),
    ("select * from courses where score > 0.5 and score <= 0.75",
     'cosine0.5-0.75.txt',
     "Course pairs with cosine similarity score > 0.5 and <= 0.75"),
    ("select * from courses where score > 0.75 and score <= 1",
     'cosine0.75-1.txt',
     "Course pairs with cosine similarity score > 0.75 and <= 1"),
]


def _write_score_range(cur, query, out_path, heading, seen_keys, pair_log):
    """Run one range query; write and print each pair not yet seen in reverse.

    cur       -- open database cursor used to execute `query`.
    query     -- SQL text selecting the rows for one similarity band.
    out_path  -- text file that receives a header plus one pair per line.
    heading   -- title line printed above the console listing.
    seen_keys -- set of "source+target" keys already emitted (updated here).
    pair_log  -- list receiving the same keys, in emission order.

    Assumes row[0] is the source course and row[1] the target course of the
    `courses` table -- TODO confirm against createDB.py.
    """
    cur.execute(query)
    # The `with` block closes the output file even if the cursor raises.
    with open(out_path, 'w') as out_file:
        out_file.write("Source Course" + "\t" + "Target Course")
        print(heading)
        print("-------------------------------------------------------")
        print("%s\t%s" % ("Source Course", "Target Course"))
        for row in cur:
            # A pair is skipped when its reverse (target, source) was already
            # emitted, by this query or by an earlier one.
            if row[1] + row[0] in seen_keys:
                continue
            key = row[0] + row[1]
            seen_keys.add(key)
            pair_log.append(key)
            out_file.write("\n%s\t%s" % (row[0], row[1]))
            print("%s\t%s" % (row[0], row[1]))


try:
    con = lite.connect('courseSmilarity.db')
    cur = con.cursor()

    # Set mirror of `pairs` for constant-time membership checks; `pairs`
    # itself is still appended to so its contents stay as before.
    seen_keys = set(pairs)

    for position, (query, out_path, heading) in enumerate(_SCORE_RANGES):
        if position:
            # Blank line between consecutive listings (none after the last).
            print("")
        _write_score_range(cur, query, out_path, heading, seen_keys, pairs)

except lite.Error as e:
    print("Error %s:" % e.args[0])
    sys.exit(1)

finally:
    if con:
        con.close()

Course pairs with cosine similarity score >= 0 and <= 0.25
-------------------------------------------------------
Source Course	Target Course
SI 505	SI 682
SI 505	SI 618
SI 505	RCIDIV 351
SI 505	NRE 512
SI 505	POLSCI 594
SI 505	SI 635
SI 505	SI 524
SI 505	CSIB 645
SI 505	IOE 536
SI 505	SI 514
SI 505	SI 540
SI 505	SI 689
SI 505	SI 658
SI 505	SI 530
SI 505	SI 622
SI 505	SI 601
SI 505	SI 575
SI 505	SI 688
SI 505	SI 544
SI 505	SI 690
SI 505	OMS 616
SI 505	PHIL 605
SI 505	HISTORY 796
SI 505	SI 510
SI 505	SI 583
SI 505	SI 649
SI 505	SI 625
SI 505	SI 840
SI 505	SI 508
SI 505	SI 681
SI 505	SI 621
SI 505	SI 683
SI 505	BIT 512
SI 505	SI 699
SI 505	STRATEGY 673
SI 505	NRE 531
SI 505	SI 648
SI 505	SI 646
SI 505	SI 670
SI 505	SI 637
SI 505	SI 676
SI 505	MO 512
SI 505	SI 516
SI 505	SI 562
SI 505	SI 580
SI 505	SI 543
SI 505	PUBPOL 754
SI 505	SI 563
SI 505	BIT 513
SI 505	ARCH 531
SI 505	SI 519
SI 505	SI 708
SI 505	SI 675
SI 505	SI 579
SI 505	SI 655
SI 505	MHS 663
SI 505	SI 654
SI 505	SI 692
SI 505	BI