# Scraping from University Website

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from urllib.parse import urlparse, parse_qs


In [2]:
urls = [ 'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Academic%20-%20Engineering',
        'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Academic%20-%20Agriculture%20and%20Life%20Sciences',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Academic%20-%20Architecture',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Academic%20-%20Government%20and%20Public%20Service',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Academic%20-%20Education%20and%20Human%20Development',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Academic%20-%20Geosciences',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Academic%20-%20Health%20Sciences',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Academic%20-%20Honors',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Academic%20-%20Law',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Academic%20-%20Liberal%20Arts',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Academic%20-%20Business',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Academic%20-%20Science',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Academic%20-%20Veterinary%20Medicine%20and%20Biomedical%20Sciences',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Arts/%20Entertainment',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Fraternity/Sorority%20Life',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=International/Multicultural',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Leadership/Governance',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Military',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Politics/Advocacy',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Professional/Career',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Recreation/Sports',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Religious/Spiritual',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Residence%20Halls',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Service',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Social',
         'https://stuactonline.tamu.edu/app/search/index/index/search/category?q=Special%20Interest']

dfs = []

for url in urls:
  page = requests.get(url)
  soup = BeautifulSoup(page.text, 'html')

  table = soup.find('table')
  # title (string), desc (string), recognition status (string), website (string), filters (string)
  cols = ["title", "desc", "filter", "rec_status", "website"]
  df = pd.DataFrame(columns = cols)

  rows = table.find_all('tr')
  for row in rows:
    title = row.find('big').text.strip()
    desc = ""
    if row.find('p'):
      desc = row.find('p').text.strip()

    rec_status = row.find('img').get('title')
    website = row.find_all('a')[-1].get('href')


    parsed_url = urlparse(url)
    query_parameters = parse_qs(parsed_url.query)
    category = query_parameters.get('q', [''])[0]
    decoded_category = category.split('%20')

    filter = ' '.join(decoded_category)

    data = [title, desc, filter, rec_status, website]

    length = len(df)
    df.loc[length] = data

  dfs.append(df)


In [3]:
# concatenating the org data
first_df = dfs[0]
other_dfs = dfs[1:]

for dframe in other_dfs:
  orgs = pd.concat([first_df, dframe], ignore_index=True)



orgs.head()

Unnamed: 0,title,desc,filter,rec_status,website
0,2D Visual Developers,To provide a space where those with a shared i...,Academic - Engineering,Recognized,https://stuactonline.tamu.edu/app/organization...
1,Aerospace Engineering Graduate Student Associa...,1. Enrich the graduate experience for graduate...,Academic - Engineering,Recognized,https://stuactonline.tamu.edu/app/organization...
2,Aggie Aerospace Women in Engineering,To support and encourage women at the Texas A&...,Academic - Engineering,Recognized,http://aawe.tamu.edu
3,Aggie Club of Engineers,"Aggie Club of Engineers is a social, service, ...",Academic - Engineering,Recognized,http://aggieclubofengineers.tamu.edu
4,Aggie Robotics,Aggie Robotics is organized as an association ...,Academic - Engineering,Recognized,http://firstalumni.tamu.edu


In [4]:
orgs.shape

(486, 5)

In [5]:
# Concatinate all relevant text
orgs['tags'] = orgs['desc'] + " " + orgs['filter'] + " " + orgs['title']
orgs['tags'] = orgs['tags'].apply(lambda x:x.lower())

# getting rid of non-unique and not recognized orgs
non_unique_not_recognized = orgs[(orgs.duplicated(subset='title', keep=False)) & (orgs['rec_status'] != 'Recognized')]
orgs = orgs.drop(non_unique_not_recognized.index)

In [6]:
import nltk
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

ModuleNotFoundError: No module named 'nltk'

In [None]:
ps = PorterStemmer()
def getStems(tags):
    words = []
    for word in tags.split():
        words.append(ps.stem(word))

    return " ".join(words)

In [None]:
orgs['tags'] = orgs['tags'].apply(getStems)

In [None]:
# getting the 2000 most impactful words in the tags
cv = CountVectorizer(max_features=3000,stop_words='english')
vectors = cv.fit_transform(orgs['tags']).toarray()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
similarity = cosine_similarity(vectors)

def recommend(org):
    index = orgs[orgs['title'] == org].index[0]
    distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
    for i in distances[0:25]:
        print(orgs.iloc[i[0]])

In [None]:
# in the case where we input a name that does not exist in the name pool
# pip install python-Levenshtein

import Levenshtein

def closest_name(input_name):
    closest = min(orgs['title'].tolist(), key=lambda x: Levenshtein.distance(input_name.lower(), x.lower()))
    return closest

In [None]:
val = "product"
recommend(closest_name(val))

In [None]:
import pickle
pickle.dump(orgs,open('org_list.pkl','wb'))
pickle.dump(similarity,open('similarity.pkl','wb'))