Read the InterProScan output file.


In [None]:
import dask.dataframe as dd
import dask.array as da
import numpy as np


def load_dataframe(file_path):
    """Load a tab-separated InterProScan result file as a Dask DataFrame.

    The file is read without a header row, every value is kept as a string,
    and the columns are named "0" through "14".

    Parameters
    ----------
    file_path : str
        Path of the tab-separated file to read.

    Returns
    -------
    dask.dataframe.DataFrame
        Frame with 15 string-typed columns named "0" .. "14".
    """
    # Build one label per column. Note that list(str(range(15))) would split
    # the text "range(0, 15)" into single characters instead of producing
    # the labels "0".."14", yielding too few (and duplicated) column names.
    column_names = [str(index) for index in range(15)]
    return dd.read_csv(
        file_path,
        delimiter="\t",
        dtype=str,
        header=None,
        names=column_names,
    )



In [None]:

# Number of workers used by the .compute() calls in later cells.
num_wk = 16

# Read the InterProScan TSV into a Dask DataFrame via the helper defined above.
file_path = '/data/dataprocessing/interproscan/all_bacilli.tsv'
df = load_dataframe(file_path)

In [None]:
## How many distinct protein annotations are found in the dataset?


In [None]:

# Count the distinct values in column '11'; nothing is evaluated until .compute().
# NOTE(review): if the file marks missing annotations with a '-' placeholder,
# that placeholder is counted as one distinct value here — confirm against the data.
annotation_column = df['11']
distinct_protein_annotations = annotation_column.nunique().compute(num_workers=num_wk)
print("Number of distinct protein annotations:", distinct_protein_annotations)


In [None]:

## How many annotations does a protein have on average?


In [None]:

# Group the rows by protein (column '0'), count the annotation entries in each
# group, then average those per-protein counts.
# The frame's columns are labelled '0'..'14'; an accession such as 'IPR022291'
# is a cell value, not a column label, so it cannot be selected after groupby.
# Column '11' is the annotation column used for the distinct-annotation count.
# NOTE(review): count() skips only missing values; a '-' placeholder, if the
# file uses one, is still counted as an annotation — confirm against the data.
annotations_per_protein = df.groupby('0')['11'].count()
average_annotations_for_protein = annotations_per_protein.mean().compute(num_workers=num_wk)
print("Average number of annotations per protein:", average_annotations_for_protein)


In [None]:

## What is the most common GO Term found?


In [None]:

# GO terms are stored in column '13' of the InterProScan TSV as a single
# '|'-separated string per row (column '0' holds the protein accession).
# Split each string into a list and explode it to one GO term per row.
# Series.explode() takes no worker argument; parallelism is set in .compute().
go_terms = df['13'].str.split('|').explode()

# A '-' entry marks "no GO term" and must not win the frequency count.
# Missing values pass this filter but are ignored by value_counts().
go_terms = go_terms[go_terms != '-']

most_frequent_go_term = go_terms.value_counts().nlargest(1).compute(num_workers=num_wk).index[0]
print("Most common GO Term:", most_frequent_go_term)


In [None]:

## What is the average size of an InterPRO feature found in the dataset?


In [None]:

# Feature size per row = column '7' minus column '6' (the stop and start
# positions in the InterProScan TSV layout), both parsed as integers.
# The result is stored on the frame as 'FeatureSize' for reuse in later cells.
feature_start = df['6'].astype(int)
feature_stop = df['7'].astype(int)
df['FeatureSize'] = feature_stop - feature_start

feature_sizes = df['FeatureSize']
average_size_of_interpro_feature = feature_sizes.mean().compute(num_workers=num_wk)
print("Average size of InterPRO feature:", average_size_of_interpro_feature)


In [None]:

## What are the top 10 most common InterPRO features?


In [None]:

# InterPro accessions are in column '11' of the InterProScan TSV; column '1'
# holds the sequence MD5 digest, which identifies proteins rather than features.
interpro_accessions = df['11']

# Rows without an InterPro annotation carry a '-' placeholder; exclude it so
# it cannot appear in the ranking.
interpro_accessions = interpro_accessions[interpro_accessions != '-']

top_10_frequent_interpro_features = interpro_accessions.value_counts().nlargest(10).compute(num_workers=num_wk)
print("Top 10 most common InterPRO features:", top_10_frequent_interpro_features)


In [None]:

## If you select InterPRO features that are almost the same size (within 90-100%) as the protein itself, what is the top 10 then?


In [None]:

# Requires the 'FeatureSize' column added by the feature-size cell.
protein_length = df['2'].astype(int)

# Fraction of the protein covered by the feature. "Almost the same size"
# means this fraction lies between 0.9 and 1.0 inclusive. Testing the relative
# difference |size - length| / length against 0.9 would instead keep every
# feature covering at least 10% of the protein.
coverage = df['FeatureSize'] / protein_length
selected_features = df[(coverage >= 0.9) & (coverage <= 1.0)]

# Rank by InterPro accession (column '11'), ignoring the '-' placeholder used
# for rows without an InterPro annotation. Column '1' is the sequence MD5.
selected_accessions = selected_features['11']
selected_accessions = selected_accessions[selected_accessions != '-']

top_10_frequent_selected_features = selected_accessions.value_counts().nlargest(10).compute(num_workers=num_wk)
print("Top 10 most common selected InterPRO features:", top_10_frequent_selected_features)


In [None]:

## If you look at those features which also have textual annotation, what are the top 10 most common words found in that annotation?


In [None]:

# The textual annotation is read from column '5'. Keep only rows where that
# column actually holds text: drop missing values and the '-' placeholder,
# which would otherwise be counted as a word.
descriptions = df['5']
features_with_text = descriptions[descriptions.notnull() & (descriptions != '-')]

# Split each description on whitespace and flatten to one word per row.
annotation_words = features_with_text.str.split().explode()

top_10_common_words_in_text = annotation_words.value_counts().nlargest(10).compute(num_workers=num_wk)
print("Top 10 most common words in annotation:", top_10_common_words_in_text)


In [None]:

## And the top 10 least common?


In [None]:

# The textual annotation is read from column '5'. Keep only rows where that
# column actually holds text: drop missing values and the '-' placeholder.
descriptions = df['5']
features_with_text = descriptions[descriptions.notnull() & (descriptions != '-')]

# Split each description on whitespace and flatten to one word per row.
annotation_words = features_with_text.str.split().explode()

# nsmallest() picks the lowest counts explicitly instead of relying on the
# ordering of value_counts(), and .compute() honours the shared worker count.
# Many words may tie at the lowest count, so which ten are shown is arbitrary.
top_10_least_common_words_in_text = annotation_words.value_counts().nsmallest(10).compute(num_workers=num_wk)
print("Top 10 least common words in annotation:", top_10_least_common_words_in_text)


In [None]:

## What is the coefficient of correlation between the size of the protein and the number of features found? 


In [None]:

# Build one row per protein (column '0') holding:
#   - protein_size: the protein length from column '2' (identical on every row
#     of a protein, so the first value is taken), and
#   - '11': the number of feature entries recorded for that protein.
# Correlating these two answers "protein size vs. number of features";
# correlating length with (stop - start) would measure feature *size* instead.
# NOTE(review): count() skips only missing values; a '-' placeholder, if the
# file uses one, is still counted as a feature — confirm against the data.
per_protein = (
    df.assign(protein_size=df['2'].astype(int))
    .groupby('0')
    .agg({'protein_size': 'first', '11': 'count'})
)

coefficient_of_correlation = per_protein['protein_size'].corr(per_protein['11'])
# The scheduler keyword is num_workers; a misspelt keyword is not applied.
protein_feature_correlation = coefficient_of_correlation.compute(num_workers=num_wk)
print("Correlation coefficient between protein size and number of features:", protein_feature_correlation)
