In [None]:
'''According to the following paper:
https://pubs.rsna.org/doi/10.1148/radiol.2020200370

The common pattern in the COVID progression over time is:

Day 0-4: Ground glass opacities
Day 5-8: Crazy Paving & Consolidation
Day 9-13: Consolidation
Day>= 14: Residual Ground glass opacities & parenchymal bands

In this code, we search the dataset for these distinctive COVID characteristics to see
which combinations of characteristics appear in the limited number of radiologist reports currently
available in the COVID meta-data file.

We also visually contrast these characteristics by sampling their corresponding CT scan images.
'''

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import cv2
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the path of every directory found
# (sub-directory and file lists are not used).
for input_dir, _subdirs, _files in os.walk('/kaggle/input'):
    print(input_dir)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the COVID meta-data table, decoding the CSV as Windows-1252.
df = pd.read_csv(
    '/kaggle/input/large-covid19-ct-slice-dataset/meta_data_covid.csv',
    encoding='windows-1252',
)

In [None]:
from collections import Counter

# Lower-case substrings to look for in every cell of the meta-data file.
search_terms = ['consolidation', 'ground glass', 'diffuse', 'bilateral', 'paving', 'parenchymal']

# One entry per matching cell: the name of the column that cell belongs to.
col_list = []
for column_name in df.columns:
    # Cells are stringified first, so missing values become 'nan' and never match.
    hit_count = sum(
        any(term in cell_text.lower() for term in search_terms)
        for cell_text in df[column_name].astype('str')
    )
    col_list.extend([column_name] * hit_count)

# NOTE: this figure counts matching cells, so a row that matches in two
# columns contributes twice.
print("Number of rows containing the search_terms: ",len(col_list))
print("The columns containing the search_terms: ",set(col_list))

# dict.fromkeys de-duplicates while keeping first-seen order, so the column
# order is the same on every run (a set of strings is not ordered reproducibly).
relavant_columns = list(dict.fromkeys(col_list))

# Last expression of the cell: number of matching cells per column.
Counter(col_list)

In [None]:
def reformatting(x):
    """Return ``x`` as a string, replacing missing values with ``'-'``.

    Parameters
    ----------
    x : object
        A single cell value.

    Returns
    -------
    str
        ``'-'`` when ``x`` is ``None`` or a float NaN, otherwise ``str(x)``.
    """
    # NaN is the only float that is not equal to itself. Without this check
    # str(NaN) succeeds and yields 'nan', so the '-' placeholder was never used.
    if x is None or (isinstance(x, float) and x != x):
        return '-'
    try:
        return str(x)
    except Exception:
        # Best effort: a value whose __str__ fails is treated as missing.
        return '-'

def filtered_columns(column_name,terms):
    """Return the rows of ``df`` whose ``column_name`` text matches ``terms``.

    Side effects: overwrites ``df[column_name]`` with its ``reformatting``-ed
    (string) version and prints the column name with the number of matches.

    Parameters
    ----------
    column_name : str
        Column of the global ``df`` to search.
    terms : str
        Pattern handed to ``Series.str.contains``; matched against the
        lower-cased column text.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df``.
    """
    df[column_name] = df[column_name].apply(reformatting)
    lowered_text = df[column_name].str.lower()
    matching_rows = df.loc[lowered_text.str.contains(terms)]
    print(column_name,len(matching_rows))
    return matching_rows
    
# Columns of the meta-data file that are searched for the lesion terms.
text_columns = ['finding', 'Presentation', 'Case Discussion', 'Severity', 'Other diseases']

# Stringify every searched column BEFORE taking any snapshot of df.
# filtered_columns rewrites one column per call, so without this step each
# returned frame would hold a different mix of raw (NaN) and stringified
# values, and a later merge on all columns would treat the same row as
# different rows.
for text_column in text_columns:
    df[text_column] = df[text_column].apply(reformatting)

# Any of the search terms, as a single alternation pattern.
search_pattern = '|'.join(search_terms)

finding = filtered_columns('finding', search_pattern)
Presentation = filtered_columns('Presentation', search_pattern)
Discussion = filtered_columns('Case Discussion', search_pattern)
Severity = filtered_columns('Severity', search_pattern)
other = filtered_columns('Other diseases', search_pattern)

In [None]:
# All the rows that mention a search term in at least one searched column.
# pd.merge without `on` joins on every shared column, so chaining outer
# merges accumulates the rows of all five filtered frames.
merged = finding
for matching_rows in (Presentation, Discussion, Severity, other):
    merged = pd.merge(merged, matching_rows, how="outer")

# Show only the columns in which a search term was found.
merged[relavant_columns]

### We want to count how many cases exist for each lesion type and for each combination of lesion types.
### To that end, we first encode the presence of each lesion type found in the case report in a new column.

In [None]:
def new_column(x,i):
    """Flag ground-glass opacity (GGO) rows in the global ``merged`` frame.

    If ``x`` mentions both "ground" and "glass" (in any form, e.g.
    "ground-glass") or the abbreviation "ggo", every row of ``merged`` whose
    column ``i`` equals ``x`` gets ``'GGO, '`` written to its ``'GGO'`` column.

    Parameters
    ----------
    x : str
        Text of one cell.
    i : str
        Name of the column the cell came from.
    """
    text = x.lower()
    if ('ground' in text and 'glass' in text) or ('ggo' in text):
        merged.loc[merged[i]==x, 'GGO'] = 'GGO, '


# Every distinct cell text is handed to new_column, which applies its own
# ground-glass test. No pre-filter on search_terms is used here: it would hide
# cells that say "ground-glass" or "GGO" without containing another term.
# The assignment covers all rows equal to the text, so each distinct value
# only needs to be visited once.
for i in merged.columns:
    for cell_text in merged[i].astype('str').unique():
        new_column(cell_text, i)

In [None]:
# Lesion descriptors that each get their own indicator column in `merged`.
search_terms = ['consolidation', 'diffuse', 'opacification', 'bilateral', 'paving', 'parenchymal']


def new_column(x,i):
    """Flag every search term mentioned in ``x`` in the global ``merged`` frame.

    For each term of ``search_terms`` contained in the lower-cased ``x``, all
    rows of ``merged`` whose column ``i`` equals ``x`` get ``'<Term>, '``
    written to the column named after the capitalised term.

    Note: this definition replaces the GGO-specific ``new_column`` defined
    in the previous cell.

    Parameters
    ----------
    x : str
        Text of one cell.
    i : str
        Name of the column the cell came from.
    """
    lowered = x.lower()
    for term in search_terms:
        if term not in lowered:
            continue
        label = term.capitalize()
        merged.loc[merged[i]==x, label] = label + ', '


# new_column ignores text without any search term, so every cell can be
# passed to it directly.
for column_name in merged.columns:
    for cell_text in merged[column_name].astype('str'):
        new_column(cell_text, column_name)


### Percentage of cases with each lesion type (among the 796 existing reports)

In [None]:
import plotly.express as px

# Indicator columns written to `merged` by the two `new_column` passes.
LESION_COLUMNS = ['GGO', 'Consolidation', 'Diffuse', 'Opacification', 'Bilateral', 'Paving', 'Parenchymal']
# Number of case reports used as the denominator (hard-coded; TODO confirm
# against the data set).
TOTAL_REPORTS = 796

# Count flagged rows per lesion column, selecting the columns by name rather
# than by position: a positional slice silently picks up the wrong columns
# when a term has no hits or after further columns are appended to `merged`.
# Empty strings are treated like missing values so the count stays correct
# even after the NaNs in `merged` have been replaced by ''.
lesion_counts = pd.Series({
    column: (
        int((merged[column].notnull() & (merged[column] != '')).sum())
        if column in merged.columns else 0
    )
    for column in LESION_COLUMNS
})

terms_count = (
    (lesion_counts / TOTAL_REPORTS * 100)
    .to_frame('Proportion')
    .sort_values('Proportion', ascending=True)
    .rename(index={'Bilateral':'Bilateral Involvement','Parenchymal':'Parenchymal Bands','Paving':'Crazy Paving','Diffuse':'Diffuse Distribution'})
    .rename_axis('CT Characteristic')
    .reset_index()
)

# Horizontal bar chart of the percentage of reports mentioning each characteristic.
fig = px.bar(terms_count,orientation="h", x='Proportion',y = 'CT Characteristic',width=500, height=400,color = "Proportion", color_continuous_scale = "Matter", template="simple_white")
fig.update_yaxes(ticks="",showgrid=False, zeroline=False)
# Last expression of the cell: returns the figure so the notebook renders it.
fig.update_xaxes(ticks="",showgrid=False, zeroline=False)


### Now, we concatenate the new columns and count the number of rows for each unique combination

In [None]:
from collections import Counter

# Indicator columns, in the order they appear in `merged`. Selecting them by
# name (not by position from the end) keeps this cell re-runnable: once the
# 'combinations' column exists it would otherwise be folded into itself.
lesion_columns = [
    column for column in merged.columns
    if column in ('GGO', 'Consolidation', 'Diffuse', 'Opacification', 'Bilateral', 'Paving', 'Parenchymal')
]

# Unflagged cells are NaN; turn them into '' so the labels can be concatenated.
merged = merged.fillna('')

# Concatenate from the right-most indicator column to the left-most.
combined_labels = pd.Series('', index=merged.index)
for column in reversed(lesion_columns):
    combined_labels = combined_labels + merged[column]

# Every label ends with ', '; trim that trailing separator.
merged['combinations'] = combined_labels.apply(lambda label: label.lstrip().rstrip(', '))

# [combination, count] pairs, most frequent first (ties: reverse alphabetical).
combination_counts = Counter(merged['combinations'])
summary = [
    [combination, count]
    for combination, count in sorted(
        combination_counts.items(),
        key=lambda item: (item[1], item[0]),
        reverse=True,
    )
]

In [None]:
from tabulate import tabulate

# Turn the [combination, count] pairs into a two-column table.
summary = pd.DataFrame(data=summary, columns=['Combination', 'Count'])

# Print each combination of terms with the number of rows it appears in.
summary_table = tabulate(summary, headers='keys', tablefmt='psql')
print(summary_table)

In [None]:
import plotly.express as px

# Bar chart of how often each combination occurs, ordered by ascending count.
summary_by_count = summary.sort_values(by='Count')
px.bar(summary_by_count, x='Count', y='Combination')


In [None]:
# One representative CT slice per combination: the last row of `merged` that
# carries the combination. Sorting by label keeps the table (and any figure
# built from it) in the same order on every run, which iterating over a set
# of strings does not guarantee.
sample_file_names = (
    merged
    .drop_duplicates(subset='combinations', keep='last')
    .sort_values('combinations')
    .loc[:, ['combinations', 'File name']]
    .rename(columns={'combinations': 'Combinations', 'File name': 'File Name'})
    .reset_index(drop=True)  # positions 0..n-1 so rows can be addressed with .loc[i]
)

# Distinct combination labels, in the same order as the table.
unique_comnibations = sample_file_names['Combinations'].tolist()

sample_file_names

#merged.loc[0,'combinations']
#merged['combinations']

In [None]:
# Folder holding the CT slices referenced by the 'File Name' column.
COVID_SLICE_DIR = '/kaggle/input/large-covid19-ct-slice-dataset/curated_data/curated_data/2COVID/'
GRID_COLUMNS = 5

sample_count = len(sample_file_names)
# Keep the 6x5 layout, adding rows only when there are more than 30 samples
# (a fixed 6x5 grid cannot hold a 31st subplot).
grid_rows = max(6, -(-sample_count // GRID_COLUMNS))

fig = plt.figure(figsize=(80,80))
for i in range(sample_count):
    file_name = sample_file_names.loc[i,'File Name']
    img = cv2.imread(COVID_SLICE_DIR + file_name)
    if img is None:
        # cv2.imread returns None instead of raising when a file cannot be read.
        print('Could not read image:', file_name)
        continue
    # Create the subplot BEFORE drawing, so each image lands on its own axes
    # and not on the axes left over from the previous iteration.
    ax = fig.add_subplot(grid_rows, GRID_COLUMNS, i+1)
    # OpenCV loads colour images in BGR channel order; matplotlib expects RGB.
    ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    ax.set_title(sample_file_names.loc[i,'Combinations'],fontsize= 35)
    