In [1]:
import numpy as np
import pandas as pd

In [4]:
dfMemes = pd.read_csv("Database.csv", encoding='UTF-8', skip_blank_lines=False)

# Two-way lookup table kept in a single dict:
#   subreddit name -> integer id   and   integer id -> subreddit name.
# Ids follow the order in which pandas reports the distinct "Subreddit" values.
uniqueSubreddits = dfMemes["Subreddit"].unique()
subredditIndices = {}
for index, subreddit in enumerate(uniqueSubreddits):
    subredditIndices[subreddit] = index
    subredditIndices[index] = subreddit

# `index` ends up holding the number of distinct subreddits; the adjacency
# list is sized from it.
index = len(uniqueSubreddits)

In [5]:
dfLinks = pd.read_csv('memeProject.csv', header=None, encoding='utf8')
dfLinks.columns = ['i','j']

# One (initially empty) neighbour list per subreddit id; `index` is the
# subreddit count produced when subredditIndices was built.
adjacencyList = [[] for _ in range(index)]

# Links are bidirectional, so each row registers both endpoints as neighbours
# of one another. A link that occurs several times in memeProject.csv is
# appended once per occurrence so that its multiplicity is counted later.
for subredditI, subredditJ in zip(dfLinks['i'], dfLinks['j']):
    i = subredditIndices[subredditI]
    j = subredditIndices[subredditJ]
    adjacencyList[j].append(i)
    adjacencyList[i].append(j)

In [8]:
from matplotlib.pyplot import figure

# Raise pandas' 'display.max_rows' option to 500 for this notebook session.
pd.set_option('display.max_rows', 500)

def stochasticMatrix(adjacency_list):
    """Build the link matrix of a graph given as an adjacency list.

    Column ``c`` describes node ``c``: every entry ``r`` of
    ``adjacency_list[c]`` adds ``1 / len(adjacency_list[c])`` to cell
    ``[r][c]``, so a neighbour listed several times accumulates a
    proportionally larger weight. A node with no neighbours gets ``1 / n``
    in every row of its column.

    Args:
        adjacency_list: sequence of length ``n`` whose ``c``-th item is the
            list of node indices linked from node ``c``.

    Returns:
        An ``(n, n)`` numpy float array.
    """
    size = len(adjacency_list)
    result = np.zeros((size, size))
    for source, targets in enumerate(adjacency_list):
        if len(targets) == 0:
            # Node without links: spread its weight evenly over all nodes.
            result[:, source] = 1 / size
            continue
        # Each listed link carries an equal share of the column's weight.
        share = 1 / len(targets)
        for target in targets:
            result[target][source] += share
    return result

# transitionMatrix = (1 - beta)/n * m + beta * stochasticMatrix,
# where m is the n x n matrix of ones.
def transitionMatrix(stochastic_matrix, beta=0.85):
    """Blend a link matrix with a uniform matrix.

    Every cell of the result is ``(1 - beta) / n + beta * stochastic_matrix[r][c]``.
    ``beta`` is therefore the weight given to the link structure, and the
    remaining ``1 - beta`` is spread evenly over all ``n`` rows.

    Args:
        stochastic_matrix: square ``(n, n)`` array-like, e.g. the output of
            ``stochasticMatrix``.
        beta: weight of the link matrix, in ``[0, 1]``. Defaults to 0.85,
            the value that used to be hard-coded here.

    Returns:
        An ``(n, n)`` numpy float array.

    Raises:
        ValueError: if ``beta`` lies outside ``[0, 1]``.
        ZeroDivisionError: if ``stochastic_matrix`` is empty (``n == 0``).
    """
    if not 0 <= beta <= 1:
        raise ValueError("beta must be between 0 and 1, got %r" % (beta,))
    n = len(stochastic_matrix)
    # Uniform part: the same (1 - beta) / n in every cell.
    uniform_part = np.full((n, n), (1 - beta) / n)
    link_part = np.multiply(beta, stochastic_matrix)
    return np.add(uniform_part, link_part)

def pageRank(transition_matrix, err=0.0005):
    """Iterate ``v <- transition_matrix @ v`` until the vector stops changing.

    The iteration starts from the uniform vector ``[1/n, ..., 1/n]`` and
    stops as soon as no component differs between two consecutive iterates
    by more than ``err``.

    Args:
        transition_matrix: square ``(n, n)`` array-like, e.g. the output of
            ``transitionMatrix``.
        err: largest per-component change still treated as converged.
            Defaults to 0.0005, the value that used to be hard-coded here.

    Returns:
        The final iterate as a plain Python list of ``n`` floats.

    Raises:
        ZeroDivisionError: if ``transition_matrix`` is empty (``n == 0``).
    """
    n = len(transition_matrix)
    rank = np.full(n, 1 / n)
    while True:
        updated = np.matmul(transition_matrix, rank)
        # Converged once no component moved by more than err. The comparison
        # is a strict ">", so a change of exactly err counts as converged.
        if not np.any(np.abs(np.subtract(updated, rank)) > err):
            return updated.tolist()
        rank = updated

def within_err(v1, v2, err):
    """Tell whether two vectors agree component-wise to within ``err``.

    Returns True when no component of ``v2 - v1`` has an absolute value
    strictly greater than ``err`` (a difference of exactly ``err`` still
    passes), and False otherwise.
    """
    deltas = np.subtract(v2, v1)
    return not any(abs(delta) > err for delta in deltas)


# Pipeline: adjacency list -> link matrix -> blended transition matrix -> ranks.
m = stochasticMatrix(adjacencyList)
t = transitionMatrix(m)
p = pageRank(t)

N = 10
# Subreddit ids of the N highest page ranks. The ids are sorted by ascending
# rank before the tail is taken, so the best-ranked id comes last.
topSubredditID = sorted(range(len(p)), key=p.__getitem__)[-N:]

# Translate the ids back to subreddit names through the two-way lookup dict.
topSubreddits = [subredditIndices[rankedID] for rankedID in topSubredditID]



