1. How many unique pages did you find? Uniqueness is established by the URL, but discarding the fragment part. 
So, for example, http://www.ics.uci.edu#aaa and http://www.ics.uci.edu#bbb are the same URL.

2. What is the longest page in terms of number of words? (HTML markup doesn’t count as words)
3. What are the 50 most common words in the entire set of pages? (Ignore English stop words, which can be found, for example, hereLinks to an external site.) Submit the list of common words ordered by frequency.
4. How many subdomains did you find in the ics.uci.edu domain? Submit the list of subdomains ordered 
alphabetically and the number of unique pages detected in each subdomain. 
The content of this list should be lines containing URL, number, for example:
http://vision.ics.uci.edu, 10 (not the actual number here)

*.ics.uci.edu/*
*.cs.uci.edu/*
*.informatics.uci.edu/*
*.stat.uci.edu/*
today.uci.edu/department/information_computer_sciences/*

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import os
import re

from collections import defaultdict
from urllib.parse import urlparse
from collections import Counter


import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import random
 


In [2]:
# read the downloaded files as a beautiful soup object

def extract_content(filename):
    with open(filename, 'r') as file:
        soup = BeautifulSoup(file, 'html.parser')
    return soup

In [3]:
def get_file_names(folder_path):
    return [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]

# Usage
folder_path = 'results10/downloaded/'
downloaded_files = get_file_names(folder_path)
nltk.download('stopwords')
nltk.download('wordtokenize')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

max_doc_length = 0
max_file = ''
tokens = []
for file in random.sample(downloaded_files,500):
    soup = extract_content(folder_path+file)
    text = soup.get_text()
    #print(text)
    lines = text.split('\n')
    
    l = 0
    for line in lines:
        if len(line) > 0:
            #print(line)
            l+=len(line.split(' '))
            #token_temp = re.findall(r'\b[a-zA-Z]+\b', line.lower()) #word_tokenize(line) #re.findall('[a-zA-Z]', line.lower()) #word_tokenize(line) re.findall('[a-zA-Z]', line.lower())
            #token_temp= [w for w in token_temp if not w.lower() in stop_words]
            #tokens = np.append(tokens,token_temp)
            word = re.findall(r'\b([a-zA-Z]{2,})\b', line.lower())
            tokens.extend(word)
            
    if l > max_doc_length:
        max_doc_length = l
        max_file = file
    
    tokens_cleaned = [w for w in tokens if not w.lower() in stop_words]
    tokens = tokens_cleaned
    

print(max_doc_length)
print(max_file)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/singaram/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Error loading wordtokenize: Package 'wordtokenize' not
[nltk_data]     found in index
[nltk_data] Downloading package punkt to /home/singaram/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


2130
sli.ics.uci.edu_PmWiki_TextFormattingRules#DefinitionLists.txt


In [None]:
def computeWordFrequencies(token_list):
    # Count the occurrences of each token
    # Counter creates a dictionary of the words and frequencies
    """
    This method runs linear to the input. Making a counter variable has O(n) time complexity. 
    reference: https://stackoverflow.com/questions/42461840/what-is-the-time-complexity-of-collections-counter-in-python

    """
    word_frequencies = Counter(token_list)
    return word_frequencies


def printWordFrequencies(word_freq):
    """
    This function runs in O(nlogn) time complexity as thats the time taken to sort the Counter data structure
    by its frequency. 

    Reference: https://stackoverflow.com/questions/29240807/python-collections-counter-most-common-complexity

    """
    # Print word frequencies in decreasing order
    for token, count in word_freq.most_common(50):
        print(f"{token}: {count}")
    
word_freq = computeWordFrequencies(tokens)
# remove stop words from the word_freq

printWordFrequencies(word_freq)

In [4]:
'''

1. How many unique pages did you find? Uniqueness is established by the URL,
but discarding the fragment part. So, for example,
http://www.ics.uci.edu#aaa and http://www.ics.uci.edu#bbb are the same URL.

4. How many subdomains did you find in the ics.uci.edu domain?
Submit the list of subdomains ordered alphabetically and the number of
unique pages detected in each subdomain.

'''

def subdomainsCount(domain, file):
    # Function to extract subdomain from URL
    def extract_subdomain(url):
        parsed_url = urlparse(url)
        if parsed_url.netloc.endswith(domain):
            subdomains = parsed_url.netloc.split('.')
            if len(subdomains) > 2:
                return subdomains[0]
        return None

    # Count unique pages for subdomains of the specified domain
    subdomain_page_counts = defaultdict(int)
    with open(file, 'r') as file:
        for url in file:
            url = url.strip()
            subdomain = extract_subdomain(url)
            if subdomain and subdomain != 'www':
                subdomain_page_counts[subdomain] += 1

    # Construct the result dictionary
    result = {}
    sorted_subdomains = sorted(subdomain_page_counts.items(), key=lambda x: x[0])
    for subdomain, count in sorted_subdomains:
        result[f"http://{subdomain}.{domain}"] = count

    return result



def count_unique_pages(file_path):
    unique_pages = set()
    with open(file_path, 'r') as file:
        for line in file:
            url = line.strip()
            parsed_url = urlparse(url)

            # Discard the fragment part for uniqueness
            cleaned_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"

            unique_pages.add(cleaned_url)

    return unique_pages, len(unique_pages)







In [5]:
if __name__ == '__main__':

    file_path = 'finalURL.txt' #'/content/urls.txt'
    unique_pages, unique_page_count = count_unique_pages(file_path)
    print("Number of unique pages found:", unique_page_count)



    domainToCheck = ['.ics.uci.edu','cs.uci.edu','informatics.uci.edu','stat.uci.edu']
    for domain in domainToCheck:
        result = subdomainsCount(domain, file_path)
        print(domain)
        print(result)
    

Number of unique pages found: 5810
.ics.uci.edu
{'http://Transformativeplay..ics.uci.edu': 1, 'http://acoi..ics.uci.edu': 73, 'http://aiclub..ics.uci.edu': 1, 'http://asterix..ics.uci.edu': 4, 'http://betapro..ics.uci.edu': 3, 'http://calendar..ics.uci.edu': 1, 'http://cdb..ics.uci.edu': 33, 'http://chenli..ics.uci.edu': 3, 'http://circadiomics..ics.uci.edu': 8, 'http://cml..ics.uci.edu': 5, 'http://code..ics.uci.edu': 13, 'http://computableplant..ics.uci.edu': 45, 'http://cradl..ics.uci.edu': 29, 'http://create..ics.uci.edu': 4, 'http://cwicsocal18..ics.uci.edu': 10, 'http://cybert..ics.uci.edu': 1, 'http://dejavu..ics.uci.edu': 2, 'http://elms..ics.uci.edu': 5, 'http://emj..ics.uci.edu': 8, 'http://evoke..ics.uci.edu': 5, 'http://flamingo..ics.uci.edu': 27, 'http://fr..ics.uci.edu': 9, 'http://frost..ics.uci.edu': 3, 'http://futurehealth..ics.uci.edu': 22, 'http://grape..ics.uci.edu': 1445, 'http://graphics..ics.uci.edu': 17, 'http://graphmod..ics.uci.edu': 3, 'http://hack..ics.uci.e

In [None]:
import ipyparallel as ipp

cluster = ipp.Cluster.from_file("/home/ics-home/.ipython/profile_default/security/cluster-.json")
rc = cluster.connect_client_sync()
rc