In [4]:
import matplotlib.pyplot as plt
plt.style.use('default')
plt.rc('text', usetex=True)
plt.rc('font', family='serif')
plt.rc('font', size=18)
plt.rc('axes', titlesize=18)
plt.rc('axes', labelsize=18)
plt.rc('xtick', labelsize=18)
plt.rc('ytick', labelsize=18)
plt.rc('legend', fontsize=18)
plt.rc('lines', markersize=10)

import random
from tqdm import tqdm
import time

In [5]:
def simple_list_search():
    I = [random.randint(1, 1000000) for _ in range(900000)]

    start_time = time.time()

    for _ in range(5):
        num = int(input("Enter an integer number: "))
        print("YES" if num in I else "NO")

    end_time = time.time()
    print(f"Time taken for simple list: {end_time - start_time:.2f} seconds")

def partitioned_list_search():
    partitions = 900
    D = {}

    for _ in range(900000):
        num = random.randint(1, 1000000)
        key = num % partitions
        D.setdefault(key, []).append(num)

    start_time = time.time()

    for _ in range(5):
        num = int(input("Enter an integer number: "))
        key = num % partitions
        print("YES" if key in D and num in D[key] else "NO")

    end_time = time.time()
    print(f"Time taken for partitioned list: {end_time - start_time:.2f} seconds")

In [6]:
print("Using simple list:")
simple_list_search()

print("\nUsing partitioned list:")
partitioned_list_search()

Using simple list:
NO
YES
NO
YES
NO
Time taken for simple list: 5.17 seconds

Using partitioned list:
NO
YES
NO
YES
NO
Time taken for partitioned list: 5.53 seconds


In [7]:
def generate_couples():
    return [(random.randint(1, 7), random.randint(1, 1000)) for _ in range(30000000)]

def groupAndSum(C):
    sums = [0] * 7
    for k, v in C:
        sums[k-1] += v
    return [(i+1, s) for i, s in enumerate(sums)]

The complexity of this function is linear, $\mathcal{O}(n)$. For every tuple in the list of $n$ tuples, it adds the value to the corresponding index in the sums list. It does not involve nested loops, which would imply a quadratic or higher complexity.



In [8]:
C = generate_couples()
start_time = time.time()
groupAndSum(C)
end_time = time.time()
naive_time = end_time - start_time
print(f"Naive Execution Time: {naive_time:.2f} seconds")

Naive Execution Time: 3.42 seconds


In [9]:
def groupSortedAndSum(C):
    C.sort(key=lambda x: x[0])
    sums = [0] * 7
    for k, v in C:
        sums[k-1] += v
    return [(i+1, s) for i, s in enumerate(sums)]

The sort function in Python has a time complexity of $\mathcal{O}(n \times log(n))$, which is dominated by the linear-time grouping operation, making the total time complexity $\mathcal{O}(n \times log(n) + n)$, which simplifies to $\mathcal{O}(n \times log(n))$. So, while this is more optimized than a quadratic approach, it's still slower than the linear-time naive approach.

In [10]:
start_time = time.time()
groupSortedAndSum(C)
end_time = time.time()
sorted_time = end_time - start_time
print(f"Sorted Execution Time: {sorted_time:.2f} seconds")

Sorted Execution Time: 6.82 seconds


In [11]:
def partition_and_group(C):
    partitions = {i: [] for i in range(1, 8)}
    for k, v in C:
        partitions[k].append(v)
    return [(k, sum(partitions[k])) for k in partitions]

The partitioning process has a linear time complexity of $\mathcal{O}(n)$ since we iterate through the list and append to the appropriate partition list. Summing each partition list also has a linear time complexity. Hence, the total time complexity remains linear $\mathcal{O}(n)$, but the partitioning method is more memory-efficient than sorting.

In [12]:
start_time = time.time()
partition_and_group(C)
end_time = time.time()
partitioned_time = end_time - start_time
print(f"Partitioned Execution Time: {partitioned_time:.2f} seconds")

Partitioned Execution Time: 3.39 seconds


- The naive method's execution time is $O(n)$.
- Sorting the $(k,v)$ couples helps improve the execution time compared to quadratic methods, but remains slower than linear methods. Its advantage is in scenarios requiring further operations post-grouping.
- The partitioning approach offers benefits in execution time and memory efficiency, especially when the range of $k$ values is limited.
- Parallel processing: Partitioning is conducive to parallel processing. By distributing each partition to separate threads or processors, concurrent processing is achievable, especially beneficial for larger datasets.

In [13]:
import heapq

def n_way_ms(lists):
    if not lists:
        return []

    min_heap = []
    # Push the first element of each list onto the heap.
    for i in range(len(lists)):
        if lists[i]:
            heapq.heappush(min_heap, (lists[i][0], i, 0))
            lists[i].pop(0)

    merged_list = []
    while min_heap:
        val, list_idx, element_idx = heapq.heappop(min_heap)
        merged_list.append(val)

        if len(lists[list_idx]) > 0:
            next_tuple = (lists[list_idx][0], list_idx, element_idx + 1)
            heapq.heappush(min_heap, next_tuple)
            lists[list_idx].pop(0)

    return merged_list

Pushing the first element from each of the n lists onto the heap takes $\mathcal{O}(n \times log(n))$.
Each pop operation, followed by a push operation, takes $\mathcal{O}(log(n))$. Since we have to perform this operation for each of the total elements across all the lists, let's say the total number of elements is m, the overall complexity would be $\mathcal{O}(m \times log(n))$.
Therefore, the overall time complexity is $\mathcal{O}(n \times log(n) + m \times log(n))$.

More efficient we can actually do:

The provided solution is already efficient given the constraints. The reason being, merging n sorted lists requires looking at each list at least once to place its elements in the correct position. Using a priority queue ensures we're always choosing the smallest available element from the lists in $\mathcal{O}(n \times log(n))$ time.
However, in terms of space complexity, other implementations that involve dividing and conquering the lists (merging them in pairs) might be more space-efficient as they wouldn't need to maintain a heap with an entry for each list.

In [21]:
def count_words(T):
    word_count = {}
    for line in T:
        words = line.split()
        for word in words:
            word_count[word] = word_count.get(word, 0) + 1
    return list(word_count.items())

In [57]:
T = ["aa, bb, cc, dd, aa, b, cc, dd, a, b, cc, dd, ff, gg"]
start = time.time()
word_count = count_words(T)
end = time.time()
print(f"Execution Time: {end - start} seconds")
print(word_count)

Execution Time: 5.698204040527344e-05 seconds
[('aa,', 2), ('bb,', 1), ('cc,', 3), ('dd,', 3), ('b,', 2), ('a,', 1), ('ff,', 1), ('gg', 1)]


In [58]:
import itertools


def map_words(T):
    for line in T:
        words = line.split()
        for word in words:
            yield (word, 1)


def reduce_words(word_count):
    word_count = list(word_count)
    word_count.sort(key=lambda x: x[0])  # Sort by word
    word_count = [(k, sum([v[1] for v in g])) for k, g in
                  itertools.groupby(word_count, key=lambda x: x[0])]
    return word_count

In [59]:
start = time.time()
word_count = map_words(T)
word_count = reduce_words(word_count)
end = time.time()
print(f"Execution Time: {end - start} seconds")
print(word_count)

Execution Time: 4.887580871582031e-05 seconds
[('a,', 1), ('aa,', 2), ('b,', 2), ('bb,', 1), ('cc,', 3), ('dd,', 3), ('ff,', 1), ('gg', 1)]


Improving Execution Time for Large T:

- Partitioning: If T is large, it can be partitioned into smaller chunks. Each chunk can be processed separately to generate word counts, and the results can then be merged. This is similar to the MapReduce model where the Map phase processes chunks of data to produce intermediate key-value pairs (in this case, word-count pairs) and the Reduce phase aggregates these intermediate results.
- Sorting: Once the words are extracted, they can be sorted, which will place the same words together. A single pass can then be used to count occurrences of each word. However, this approach might not be as efficient as the simple counting method for this specific problem.
- Parallel Processing: By using techniques like multi-threading or distributed processing, different portions of T can be processed concurrently, greatly improving execution times, especially for very large datasets.
In essence, techniques like partitioning and parallel processing, inspired by the MapReduce model, can indeed help in improving execution times for large collections of text lines.