In [None]:
import dask.dataframe as dd 
import pandas as pd
from collections import Counter

Good that in this case you have used a Notebook. That proves that you know when to choose the right tools for the job. I just wonder where all the references to `df` in the cells below come from...

In [None]:
# Build a Dask DataFrame from the tab-separated annotation file.
# The file has no header row, so the 15 columns are named "0" .. "14",
# and every column is read as a string.
protien_data = dd.read_csv(
    '/data/dataprocessing/interproscan/all_bacilli.tsv',
    sep='\t',
    dtype=str,
    header=None,
    names=[str(column_index) for column_index in range(15)],
)

In [None]:

def compute_distinct_annotations(df_path):
    """
    Compute the distinct annotations from the protein annotation data.

    Parameters:
    - df_path (Dask DataFrame): The loaded protein annotation data. The
      parameter keeps its original name, but the callers in this notebook
      pass the DataFrame itself rather than a file path.

    Returns:
    - distinct_annotations (pandas Series): The distinct values of column
      '11', materialised by `.compute()`.
    """

    # Work on the frame that was passed in rather than on a global `df`.
    annotations = df_path['11']

    # Drop duplicate annotations and materialise the distinct values
    distinct_annotations = annotations.drop_duplicates().compute(num_workers=16)

    return distinct_annotations

In [None]:
# Run the distinct-annotation query on the loaded data and show the result
distinct_annotations = compute_distinct_annotations(df_path=protien_data)
print("Distinct annotations:", distinct_annotations)

In [None]:
def compute_average_annotations(df_path):
    """
    Compute the average number of annotations per protein.

    Parameters:
    - df_path (Dask DataFrame): The loaded protein annotation data. The
      parameter keeps its original name, but the callers in this notebook
      pass the DataFrame itself rather than a file path.

    Returns:
    - average_annotations (float): The mean number of rows per distinct value
      of column '1' (the column used here as the protein key).
    """

    # Count the rows belonging to each protein key (column '1') ...
    annotations_per_protein = df_path.groupby('1').size()

    # ... and average those counts over all proteins
    average_annotations = annotations_per_protein.mean().compute(num_workers=16)

    return average_annotations

In [None]:
# Average number of annotation rows per protein
average_annotations = compute_average_annotations(df_path=protien_data)
print("Average number of annotations per protein:", average_annotations)

In [None]:
def find_most_common_go_term(df_path):
    """
    Find the most common Gene Ontology (GO) term from the protein annotation data.

    Parameters:
    - df_path (Dask DataFrame): The loaded protein annotation data. The
      parameter keeps its original name, but the callers in this notebook
      pass the DataFrame itself rather than a file path.

    Returns:
    - most_common_go_term (str or None): The most frequent GO term, or None
      when no row carries a GO term.
    """

    # NOTE(review): assumes the standard InterProScan TSV column order, in
    # which column '13' holds the '|'-separated GO terms and '-' marks a
    # missing value -- confirm against the input file.
    go_terms = df_path['13'].str.split('|').explode()

    # Ignore the '-' placeholder so it cannot win the count
    go_terms = go_terms[go_terms != '-']

    # Find the most common GO term (value_counts skips missing values)
    top_term = go_terms.value_counts().nlargest(1).compute(num_workers=16)
    if len(top_term) == 0:
        return None

    most_common_go_term = top_term.index[0]

    return most_common_go_term

In [None]:
# Most frequent GO term across all annotation rows
most_common_go_term = find_most_common_go_term(df_path=protien_data)
print("Most common GO term:", most_common_go_term)

In [None]:
def compute_average_feature_size(df_path):
    """
    Compute the average size of InterPRO features from the protein annotation data.

    Parameters:
    - df_path (Dask DataFrame): The loaded protein annotation data. The
      parameter keeps its original name, but the callers in this notebook
      pass the DataFrame itself rather than a file path.

    Returns:
    - average_feature_size (float): The mean of (column '7' - column '6')
      over all rows, with both columns cast to int.
    """

    # Feature size is kept in a local Series; the caller's frame is not
    # modified, so re-running the cell gives the same result.
    feature_start = df_path['6'].astype(int)
    feature_stop = df_path['7'].astype(int)
    feature_size = feature_stop - feature_start

    average_feature_size = feature_size.mean().compute(num_workers=16)

    return average_feature_size

In [None]:
# Mean feature size (stop - start) over every annotation row
average_size = compute_average_feature_size(df_path=protien_data)
print("Average size of Custom InterPRO feature:", average_size)

In [None]:
def compute_top_10_interpro_features(df_path):
    """
    Compute the top 10 most common InterPRO features from the protein annotation data.

    Parameters:
    - df_path (Dask DataFrame): The loaded protein annotation data. The
      parameter keeps its original name, but the callers in this notebook
      pass the DataFrame itself rather than a file path.

    Returns:
    - top_10_interpro_features (pandas Series): The 10 most frequent values of
      column '11' with their counts, most frequent first.
    """

    # Count the annotation column ('11'), the same column used for the
    # distinct-annotation query; column '1' is the per-protein grouping key.
    # NOTE(review): assumes '-' marks rows without an InterPro annotation
    # (InterProScan TSV convention) -- confirm against the input file.
    features = df_path['11']
    features = features[features != '-']

    # Compute the top 10 most common InterPRO features
    top_10_interpro_features = features.value_counts().nlargest(10).compute(num_workers=16)

    return top_10_interpro_features

In [None]:
# Ten most frequent InterPRO features over the whole data set
top_10_features = compute_top_10_interpro_features(df_path=protien_data)
print("Top 10 most common InterPRO features:")
print(top_10_features)

In [None]:
def compute_top_10_similar_size_features(df_path, similar_size_threshold=0.9):
    """
    Compute the top 10 most common InterPRO features with similar size to the protein from the protein annotation data.

    A feature counts as "similar in size" when its size (column '7' minus
    column '6') is at least `similar_size_threshold` times the protein size
    (column '2').

    Parameters:
    - df_path (Dask DataFrame): The loaded protein annotation data. The
      parameter keeps its original name, but the callers in this notebook
      pass the DataFrame itself rather than a file path.
    - similar_size_threshold (float, optional): Minimum ratio of feature size
      to protein size. Default is 0.9.

    Returns:
    - top_10_similar_size_features (pandas Series): The 10 most frequent values
      of column '11' among the selected rows, with their counts.
    """

    # Calculate the protein size and the feature size from the raw columns;
    # no pre-computed 'FeatureSize' column is required.
    protein_size = df_path['2'].astype(int)
    feature_size = df_path['7'].astype(int) - df_path['6'].astype(int)

    # Keep features covering at least the requested fraction of the protein.
    # Comparing against a product avoids dividing by the protein size.
    similar_size_features = df_path[feature_size >= protein_size * similar_size_threshold]

    # Count the annotation column ('11').
    # NOTE(review): assumes '-' marks rows without an InterPro annotation
    # (InterProScan TSV convention) -- confirm against the input file.
    features = similar_size_features['11']
    features = features[features != '-']

    # Compute the top 10 most common InterPRO features with similar size
    top_10_similar_size_features = features.value_counts().nlargest(10).compute(num_workers=16)

    return top_10_similar_size_features

In [None]:
# Ten most frequent features among those close to the protein's own size
top_10_similar_size_features = compute_top_10_similar_size_features(df_path=protien_data)
print("Top 10 most common InterPRO features with similar size:")
print(top_10_similar_size_features)

In [None]:

def compute_top_10_common_and_least_common_words(df_path):
    """
    Compute the top 10 most common and least common words found in the textual annotations of the protein annotation data.

    Parameters:
    - df_path (Dask DataFrame): The loaded protein annotation data. The
      parameter keeps its original name, but the callers in this notebook
      pass the DataFrame itself rather than a file path.

    Returns:
    - top_10_most_common_words (list): A list of (word, count) tuples for the 10 most common words.
    - top_10_least_common_words (list): A list of (word, count) tuples for the 10 least common words.
    """

    text_columns = ['3', '4', '5', '11', '12']

    # Concatenate the textual columns with a single space between every pair,
    # treating missing values as empty text so one gap does not blank the row.
    text_annotations = df_path[text_columns[0]].fillna('')
    for column in text_columns[1:]:
        text_annotations = text_annotations + ' ' + df_path[column].fillna('')

    # Lower-case and strip punctuation. regex=True is required because recent
    # pandas versions treat the pattern as a literal string by default.
    text_annotations = text_annotations.str.lower().str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)

    # Split on whitespace and count the words inside Dask, so only the
    # per-word totals are brought back to the notebook process.
    words = text_annotations.str.split().explode().dropna()
    word_counts = words.value_counts().compute(num_workers=16)

    # Get the top 10 most common words
    top_10_most_common_words = [
        (word, int(count)) for word, count in word_counts.nlargest(10).items()
    ]

    # Get the top 10 least common words
    top_10_least_common_words = [
        (word, int(count)) for word, count in word_counts.nsmallest(10).items()
    ]

    return top_10_most_common_words, top_10_least_common_words

In [None]:
# Word frequencies over the textual annotation columns
top_10_most_common, top_10_least_common = compute_top_10_common_and_least_common_words(
    df_path=protien_data
)

# Most frequent words, one "word count" pair per line
print("Top 10 most common words:")
for word, count in top_10_most_common:
    print(word, count)

# Rarest words, in the same layout
print("\nTop 10 least common words:")
for word, count in top_10_least_common:
    print(word, count)

In [None]:
def compute_top_10_common_words_similar_size(df_path, similar_size_threshold=0.9):
    """
    Compute the top 10 most common words in the textual annotations of InterPRO features with similar size to the protein.

    A feature counts as "similar in size" when its size (column '7' minus
    column '6') is at least `similar_size_threshold` times the protein size
    (column '2').

    Parameters:
    - df_path (Dask DataFrame): The loaded protein annotation data. The
      parameter keeps its original name, but the callers in this notebook
      pass the DataFrame itself rather than a file path.
    - similar_size_threshold (float, optional): Minimum ratio of feature size
      to protein size. Default is 0.9.

    Returns:
    - top_10_words (list): A list of (word, count) tuples for the 10 most common words.
    """

    # Calculate the protein size and the feature size from the raw columns;
    # no pre-computed 'FeatureSize' column is required.
    protein_size = df_path['2'].astype(int)
    feature_size = df_path['7'].astype(int) - df_path['6'].astype(int)

    # Keep features covering at least the requested fraction of the protein.
    # Comparing against a product avoids dividing by the protein size.
    similar_size_features = df_path[feature_size >= protein_size * similar_size_threshold]

    text_columns = ['3', '4', '5', '11', '12']

    # Concatenate the textual columns with a single space between every pair,
    # treating missing values as empty text so one gap does not blank the row.
    text_annotations = similar_size_features[text_columns[0]].fillna('')
    for column in text_columns[1:]:
        text_annotations = text_annotations + ' ' + similar_size_features[column].fillna('')

    # Lower-case and strip punctuation. regex=True is required because recent
    # pandas versions treat the pattern as a literal string by default.
    text_annotations = text_annotations.str.lower().str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)

    # Split on whitespace and count the words inside Dask, so only the
    # per-word totals are brought back to the notebook process.
    words = text_annotations.str.split().explode().dropna()
    word_counts = words.value_counts().compute(num_workers=16)

    # Get the top 10 most common words
    top_10_words = [
        (word, int(count)) for word, count in word_counts.nlargest(10).items()
    ]

    return top_10_words

In [None]:
# Word frequencies restricted to features close to the protein's own size
top_10_common_words_similar_size = compute_top_10_common_words_similar_size(
    df_path=protien_data
)

# One "word count" pair per line
for word, count in top_10_common_words_similar_size:
    print(word, count)

In [None]:
def compute_coefficient_of_correlation(df_path):
    """
    Compute the coefficient of correlation between protein size and the number of features.

    Rows are grouped per protein (column '1', the same key used for the
    per-protein annotation average); each protein contributes one
    (protein size, number of feature rows) pair to the correlation.

    Parameters:
    - df_path (Dask DataFrame): The loaded protein annotation data. The
      parameter keeps its original name, but the callers in this notebook
      pass the DataFrame itself rather than a file path.

    Returns:
    - coefficient_of_correlation_result (float): The coefficient of correlation between protein size and the number of features.
    """

    # One row per protein: its size (column '2' as int) and how many
    # feature rows it has (non-missing entries of column '0').
    per_protein = (
        df_path.assign(protein_size=df_path['2'].astype(int))
        .groupby('1')
        .agg({'protein_size': 'max', '0': 'count'})
    )

    # Calculate the coefficient of correlation between the two per-protein columns
    coefficient_of_correlation = per_protein['protein_size'].corr(per_protein['0'])

    # Compute the result
    coefficient_of_correlation_result = coefficient_of_correlation.compute(num_workers=16)

    return coefficient_of_correlation_result

In [None]:
# Correlation between protein size and the per-protein feature statistic
correlation_result = compute_coefficient_of_correlation(df_path=protien_data)
print("Coefficient of correlation:", correlation_result)