# Scraping from University Website

In [23]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from urllib.parse import urlparse, parse_qs


In [24]:
urls = [ 'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Academic%20-%20Engineering',
        'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Academic%20-%20Agriculture%20and%20Life%20Sciences',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Academic%20-%20Architecture',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Academic%20-%20Government%20and%20Public%20Service',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Academic%20-%20Education%20and%20Human%20Development',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Academic%20-%20Geosciences',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Academic%20-%20Health%20Sciences',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Academic%20-%20Honors',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Academic%20-%20Law',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Academic%20-%20Liberal%20Arts',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Academic%20-%20Business',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Academic%20-%20Science',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Academic%20-%20Veterinary%20Medicine%20and%20Biomedical%20Sciences',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Arts/%20Entertainment',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Fraternity/Sorority%20Life',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=International/Multicultural',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Leadership/Governance',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Military',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Politics/Advocacy',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Professional/Career',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Recreation/Sports',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Religious/Spiritual',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Residence%20Halls',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Service',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Social',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Special%20Interest']

dfs = []
sum = 0

for url in urls:
  page = requests.get(url)
  soup = BeautifulSoup(page.text, 'html')

  table = soup.find('table')
  # title (string), desc (string), recognition status (string), website (string), filters (string)
  cols = ["title", "desc", "filter", "rec_status", "website"]
  df = pd.DataFrame(columns = cols)

  rows = table.find_all('tr')
  for row in rows:
    title = row.find('big').text.strip()
    desc = ""
    if row.find('p'):
      desc = row.find('p').text.strip()

    rec_status = row.find('img').get('title')
    website = row.find_all('a')[-1].get('href')


    parsed_url = urlparse(url)
    query_parameters = parse_qs(parsed_url.query)
    category = query_parameters.get('q', [''])[0]
    decoded_category = category.split('%20')

    filter = ' '.join(decoded_category)
    data = [title, desc, filter, rec_status, website]

    length = len(df)
    df.loc[length] = data

    sum += 1

  dfs.append(df)

print(sum)


2091


In [25]:
# concatenating the org data
orgs = dfs[0]
other_dfs = dfs[1:]

for dframe in other_dfs:
  orgs = pd.concat([orgs, dframe], ignore_index=True)



orgs.head()

Unnamed: 0,title,desc,filter,rec_status,website
0,2D Visual Developers,To provide a space where those with a shared i...,Academic - Engineering,Recognized,https://stuactonline.tamu.edu/app/organization...
1,Aerospace Engineering Graduate Student Associa...,1. Enrich the graduate experience for graduate...,Academic - Engineering,Recognized,https://stuactonline.tamu.edu/app/organization...
2,Aggie Aerospace Women in Engineering,To support and encourage women at the Texas A&...,Academic - Engineering,Recognized,http://aawe.tamu.edu
3,Aggie Club of Engineers,"Aggie Club of Engineers is a social, service, ...",Academic - Engineering,Recognized,http://aggieclubofengineers.tamu.edu
4,Aggie Robotics,Aggie Robotics is organized as an association ...,Academic - Engineering,Recognized,http://firstalumni.tamu.edu


In [26]:
orgs.shape

(2091, 5)

In [27]:
# Concatinate all relevant text
orgs['tags'] = orgs['desc'] + " " + orgs['filter'] + " " + orgs['title']
orgs['tags'] = orgs['tags'].apply(lambda x:x.lower())

# getting rid of non-unique and not recognized orgs
non_unique_not_recognized = orgs[(orgs.duplicated(subset='title', keep=False)) & (orgs['rec_status'] != 'Recognized')]
orgs = orgs.drop(non_unique_not_recognized.index)

In [28]:
import nltk
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

In [29]:
ps = PorterStemmer()
def getStems(tags):
    words = []
    for word in tags.split():
        words.append(ps.stem(word))

    return " ".join(words)

In [30]:
orgs['tags'] = orgs['tags'].apply(getStems)

In [31]:
# getting the 2000 most impactful words in the tags
cv = CountVectorizer(max_features=500,stop_words='english')
vectors = cv.fit_transform(orgs['tags']).toarray()

In [32]:
from sklearn.metrics.pairwise import cosine_similarity

In [91]:
similarity = cosine_similarity(vectors)

def recommend(org):
    index = orgs[orgs['title'] == org].index[0]
    distances = sorted(list(enumerate(similarity[index])),reverse=True, key = lambda x: x[1])
    recommended_orgs = pd.DataFrame(columns=orgs.columns)
    for i in distances[0:10]:
        # print(orgs.iloc[i[0]])
        recommended_orgs = pd.concat([recommended_orgs, orgs.iloc[[i[0]]]], ignore_index=True)
    return recommended_orgs

In [34]:
# in the case where we input a name that does not exist in the name pool
# pip install python-Levenshtein

import Levenshtein

def closest_name(input_name):
    closest = min(orgs['title'].tolist(), key=lambda x: Levenshtein.distance(input_name.lower(), x.lower()))
    return closest

In [35]:
val = "Design Build"
recommend(closest_name(val))

Unnamed: 0,title,desc,filter,rec_status,website,tags
0,Dream Outside the Box,The mission of Dream Outside the Box TAMU is t...,Academic - Engineering,Not Recognized,https://stuactonline.tamu.edu/app/organization...,the mission of dream outsid the box tamu is to...
1,TAMU Fluid Power Club,The mission of the TAMU Fluid Power Club (TAMU...,Academic - Engineering,Recognized,https://stuactonline.tamu.edu/app/organization...,the mission of the tamu fluid power club (tamu...
2,TAMU Futbol Club,The mission of TAMU Futbol Club(TAMU FC) is to...,Academic - Engineering,Not Recognized,https://stuactonline.tamu.edu/app/organization...,the mission of tamu futbol club(tamu fc) is to...
3,McAllen Society of Engineers,The mission of the Society of Engineers at McA...,Academic - Engineering,Not Recognized,https://stuactonline.tamu.edu/app/organization...,the mission of the societi of engin at mcallen...
4,Goju-ryu at TAMU,Sorry this Club is now Closed.,Academic - Engineering,Not Recognized,https://stuactonline.tamu.edu/app/organization...,sorri thi club is now closed. academ - engin g...
5,International Council on Systems Engineering,The mission of INCOSE at Texas A&M is to facil...,Academic - Engineering,Recognized with Restrictions,https://stuactonline.tamu.edu/app/organization...,the mission of incos at texa a&m is to facilit...
6,Society of Asian Scientists & Engineers,SASE is dedicated to the advancement of Asian ...,Academic - Engineering,Recognized,http://sase.tamu.edu,sase is dedic to the advanc of asian heritag s...
7,Aggie Exercise Physiology Club,The purpose of this organization is to provide...,Academic - Education and Human Development,Not Recognized,http://aggieaep.tamu.edu,the purpos of thi organ is to provid kinesiolo...
8,Aggie Tutors,We strive to enrich the education of K-12 Stud...,Academic - Education and Human Development,Not Recognized,https://stuactonline.tamu.edu/app/organization...,we strive to enrich the educ of k-12 student i...
9,AgSPIRE,The mission of AgSPIRE is to provide children ...,Academic - Engineering,Not Recognized,https://stuactonline.tamu.edu/app/organization...,the mission of agspir is to provid children in...


In [115]:
import pickle
import itertools
pickle.dump(orgs,open('org_list.pkl','wb'))
pickle.dump(similarity,open('similarity.pkl','wb'))
import numpy as np

In [42]:
category_map = {
    "academics" : ["Academic - Business", "Academic - Law", "Academic - Veterinary Medicine and Biomedical Sciences", "Academic - Honors", "Academic - Science", "Academic - Liberal Arts", "Academic - Education and Human Development", "Academic - Geosciences", "Academic - Architecture", "Academic - Government and Public Service", "Academic - Engineering", "Academic - Health Sciences", "Academic - Agriculture and Life Sciences"],
    "fineArts" : ["Arts/ Entertainment"],
    "greekLife" : ["Fraternity/Sorority Life"],
    "leadership" : ["Leadership/Governance"],
    "professional" : ["Professional/Career"],
    "spiritual" : ["Religious/Spiritual"],
    "cultural" : ["International/Multicultural"],
    "service" : ["Politics/Advocacy", "Service"],
    "social" : ["Social", "Recreation/Sports"],
    "military" : ["Military"],
    "residenceHalls" : ["Residence Halls"],
    "specialInterest" : ["Special Interest"]
  }

threshold_map = {
    "rec" : ["Recognized", "Exempt from Recognition"],
    "pend": ["Recognized", "Renewing Recognition", "Pending Recognition", "Exempt from Recognition", "Recognized with Suspension"],
    "all" : ["Recognized", "Not Recognized", "Renewing Recognition", "Recognized with Restrictions", "Exempt from Recognition", "Pending Recognition", "Recognized with Suspension"]
}

def filtered_subset(filters):
    cat_string, thresh = filters.split("&")
    categories = [category_map[val] for val in cat_string.split("-")]
    categories = list(itertools.chain(*categories))
    rec_statuses = threshold_map[thresh]

    # orgs_subset = orgs[orgs['filter'].isin(categories) & orgs['rec_status'].isin(rec_statuses)]
    orgs_subset = orgs[orgs['rec_status'].isin(rec_statuses)]
    return orgs_subset

In [208]:
def recommend2(org, orgs_subset):
    
    index = orgs_subset[orgs_subset['title'] == org].index[0]
    indexes_to_use = np.array(orgs_subset.index.tolist())
    # distances = sorted(list(altered_similarity.loc[index]),reverse=True,key = lambda x: x)
    distances = np.array(sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1]))
    recommended_orgs = pd.DataFrame(columns=orgs_subset.columns)

    matching_indexes = np.in1d(distances[:, 0], indexes_to_use)
    distances = distances[matching_indexes]



    if len(distances) >= 10:
        for i in distances[0:10]:
            recommended_orgs = pd.concat([recommended_orgs, orgs.iloc[[i[0]]]], ignore_index=True)
    else:
        for i in distances[0:len(distances)]:
            recommended_orgs = pd.concat([recommended_orgs, orgs.iloc[[i[0]]]], ignore_index=True)
    return recommended_orgs

def closest_name2(input_name, orgs_subset):
    closest = min(orgs_subset['title'].tolist(), key=lambda x: Levenshtein.distance(input_name.lower(), x.lower()))
    return closest

def search(name, filters):
    subset = filtered_subset(filters)
    orgname = closest_name2(name, subset)
    # index = subset[subset['title'] == orgname].index[0]
    # distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
    # recommended_orgs = pd.DataFrame(columns=subset.columns)
    # if len(distances) >= 10:
    #     for i in distances[0:10]:
    #         recommended_orgs = pd.concat([recommended_orgs, subset.iloc[[i[0]]]], ignore_index=True)
    # else:
    #     for i in distances[0:len(distances)-1]:
    #         recommended_orgs = pd.concat([recommended_orgs, subset.iloc[[i[0]]]], ignore_index=True)
    # return recommended_orgs
    print(recommend2(orgname, subset))
    # return recommend2(orgname, subset).to_json(orient="table")


search("women in engineering", "academics-fineArts-greekLife-leadership-professional-spiritual-cultural-service-social-military-residenceHalls-specialInterest&rec")

                                       title  \
0                           Women In Nuclear   
1           TAMUG Women Engineering Outreach   
2         National Association of Women MBAs   
3       Aggie Aerospace Women in Engineering   
4             Women Law Students Association   
5                    TAMSA Women in Medicine   
6                 Society of Women Engineers   
7            Texas A&M Women's Lacrosse Club   
8  Women in Industrial & Systems Engineering   
9        Texas Association of Women Dentists   

                                                desc  \
0  The Texas A&M Chapter of Women in Nuclear is p...   
1  The Women in Engineering Outreach Organization...   
2  The Texas A&M chapter of National Association ...   
3  To support and encourage women at the Texas A&...   
4  WLSA is an organization for women law students...   
5  Help support and provide awareness of women an...   
6  The Society of Women Engineers (SWE), original...   
7  The purpose of this 