# 1、Read data

In [10]:
import pandas as pd

raw_data = pd.read_excel('dataset/1_Predicted_results_5921.xlsx')

### The data outside the temperature range [-20, 130] is corrected, and the same data is de-processed. The culture temperature of the organism in the statistical bar and the difference between the organism and OGT are summarized in the same table to obtain the pre-processed data set

### Data outside the range

In [11]:
Over_130_data = raw_data[raw_data.OGT >= 130]
Below_minus_20_data = raw_data[raw_data.OGT <= -20]
out_range_data = pd.concat([Below_minus_20_data, Over_130_data])
out_range_data.to_csv('dataset/Out_range_data.csv', index = False)

### Corrected normal data within the range

In [14]:
Temperature_interval_data = pd.read_excel('dataset/Temperature_interval_data_5907.xlsx')

### The same data were removed, and the culture temperature and its difference with OGT were counted

In [18]:
df_Deduplication = Temperature_interval_data.drop_duplicates(subset=['taxonomy_id', 'context'])

def load_OGT_cultivation():
    OGT_cultivation = pd.read_csv('dataset/OGT_cultivation.tsv', sep = '\t')
    OGT_cultivation = OGT_cultivation[['taxid', 'organism', 'temperature']]
    OGT_cultivation = OGT_cultivation.rename(columns  = {'organism':'scientific_name',
                                                   'taxid':'taxonomy_id',
                                                   'temperature':'OGT_cultivation'})
    
    return OGT_cultivation

OGT_cultivation = load_OGT_cultivation()

def merge_ogt(OGT_predict, OGT_cultivation):
    taxid_common = []
    OGT_predict['OGT_cultivation'] = None
    for x in OGT_predict.values:
        for y in OGT_cultivation.values:
            if int(x[0]) == int(y[0]):
                x[6] = y[1]
                taxid_common.append(x)
                break

    OGT_merge = pd.DataFrame(taxid_common, columns = ['taxonomy_id', 'scientific_name', 'context', 'name_in_context', 'doi', 
                                                      'OGT_journal', 'OGT_cultivation'])
    return OGT_merge

OGT_predict = df_Deduplication[['taxonomy_id', 'scientific_name', 'context', 'name_in_context', 'doi', 'OGT']]
OGT_cultivation = OGT_cultivation.groupby('taxonomy_id')['OGT_cultivation'].mean().reset_index()
OGT_dataset = merge_ogt(OGT_predict, OGT_cultivation)
print(OGT_dataset.shape)
print(OGT_dataset.columns)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  OGT_predict['OGT_cultivation'] = None


(4644, 7)
Index(['taxonomy_id', 'scientific_name', 'context', 'name_in_context', 'doi',
       'OGT_journal', 'OGT_cultivation'],
      dtype='object')


In [21]:
OGT_dataset['deviation'] = abs(OGT_dataset['OGT_journal'] - OGT_dataset['OGT_cultivation'])
OGT_dataset = OGT_dataset.sort_values(by="deviation", ascending = False)

In [23]:
OGT_dataset.to_excel(f'dataset/2_Preprocess_{len(OGT_dataset)}.xlsx', index = False)

# 2、Manually verify and correct the data set

In [1]:
import pandas as pd
Remediate_data = pd.read_excel('dataset/3_Corrections_4673.xlsx')

### Interval correct rate calculation

In [9]:
def Statistical_accuracy(num1, num2, data):
    data_acc = data[(num1 <= data.deviation) & (data.deviation < num2)]
    
    true_values = []
    for each in data_acc.values:
        if str(each[-1]) == "nan":
            true_values.append(each)
                
    return len(true_values), len(data_acc), len(true_values)/len(data_acc)

num1 = 3
num2 = float('inf')    #float('inf')
count1, count2, result = Statistical_accuracy(num1, num2, Remediate_data)
print("Accuracy of deviation: {} ({}//{})".format(result, count1, count2))

Accuracy of deviation: 0.798420458819105 (2123//2659)


### Data set correction

In [70]:
Remediate_data.columns

Index(['id', 'taxonomy_id', 'OGT_journal', 'scientific_name', 'context',
       'name_in_context', 'doi', 'OGT_cultivation', 'deviation', 'Correction'],
      dtype='object')

In [77]:
def revise_data(Remediate_data):
    Remediate_data['old_OGT'] = Remediate_data['OGT_journal'] 
    Remediate_data['old_deviation'] = None
    Remediate_data['new_OGT'] = Remediate_data['OGT_journal'] 
    Remediate_data['new_deviation'] = None
    
    # Remove data that does not contain the correct OGT
    Remediate_data = Remediate_data[Remediate_data.Correction != -1] 
    
    # All OGT_journal processes into correct results
    Remediate_data.loc[Remediate_data['Correction'].notna(), 'new_OGT'] = Remediate_data['Correction'] 
    
    # Re-count deviation values to ensure that all data is updated
    Remediate_data['old_deviation'] = abs(Remediate_data['OGT_cultivation'] - Remediate_data['old_OGT']) 
    Remediate_data['new_deviation'] = abs(Remediate_data['OGT_cultivation'] - Remediate_data['new_OGT']) 
    Remediate_data.drop(['OGT_journal', 'deviation'], axis=1, inplace=True)
    

    return Remediate_data
    
OGT_revise = revise_data(Remediate_data)
OGT_revise.to_excel(f'dataset/Remediate_data_{len(OGT_revise)}.xlsx', index = False)
OGT_revise.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Remediate_data['old_deviation'] = abs(Remediate_data['OGT_cultivation'] - Remediate_data['old_OGT'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Remediate_data['new_deviation'] = abs(Remediate_data['OGT_cultivation'] - Remediate_data['new_OGT'])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Remediate_data.drop(['OGT_journal', 'deviation'], axis=1, in

Unnamed: 0,id,taxonomy_id,scientific_name,context,name_in_context,doi,OGT_cultivation,Correction,old_OGT,new_OGT,old_deviation,new_deviation
1,2,105351,Aspergillus awamori,For the two isolates RCM30 and RCC20 and tempe...,A. awamori,10.1016/j.ijfoodmicro.2007.08.027,25.0,30.0,97.0,30.0,72.0,5.0
2,3,40993,Aspergillus carbonarius,For the two isolates RCM30 and RCC20 and tempe...,A. carbonarius,10.1016/j.ijfoodmicro.2007.08.027,27.0,27.5,97.0,27.5,70.0,0.5
3,4,40993,Aspergillus carbonarius,The isolate RCM15 showed a similar behaviour a...,A. carbonarius,10.1016/j.ijfoodmicro.2007.08.027,27.0,27.5,97.0,27.5,70.0,0.5
5,6,2285,Sulfolobus acidocaldarius,We examined the thermoacidophilic archaebacter...,"Sulfolobus acidocaldarius, S. acidocaldarius",10.1016/0014-5793(85)80084-3,71.0,75.0,2.5,75.0,68.5,4.0
6,7,408,Methylorubrum extorquens,The organisms with their growth optimum temper...,Methylobacterium extorquens,10.1016/S0969-2126(00)80059-3,29.0,30.0,95.0,30.0,66.0,1.0


# 3、Merge OGT_QA to build a new dataset

In [78]:
import pandas as pd
    
def merge_QA(OGT_journal):
    OGT_QA = pd.read_excel('dataset/OGT_QA.xlsx')
    OGT_QA = OGT_QA[['taxonomy_id', 'scientific_name', "context", 'OGT']]

    def isFloat(x):
        try:
            float(x)
            return True
        except:
            return False

    def data_standardization(OGT_journal):
        ogt_list = []
        for each in OGT_journal.values:
            temp = each[-1]
            if isFloat(temp):
                each[-1] = float(temp)
                ogt_list.append(each)
                continue

            if '-' in temp:
                min = temp.split('-')[0]
                max = temp.split('-')[-1]
                mean = (float(min) + float(max))/2
                each[-1] = float(mean)
                ogt_list.append(each)
                continue

            if '±' in temp:
                mean = float(temp.split('±')[0])
                each[-1] = float(mean)
                ogt_list.append(each)
                continue

            if 'K' in temp:
                mean = float(temp.split('K')[0]) - 272.15
                each[-1] = float(mean)
                ogt_list.append(each)
                continue

        return pd.DataFrame(ogt_list, columns = ['taxonomy_id', 'scientific_name', 'context', 'OGT_journal'])

    OGT_QA = data_standardization(OGT_QA)
    OGT_QA['doi'] = None
    OGT_merge = pd.concat([OGT_QA, OGT_journal])

    return OGT_merge

In [85]:
OGT_journal = pd.read_excel('dataset/Remediate_data_4238.xlsx')
OGT_journal = OGT_journal[['taxonomy_id', 'scientific_name', 'context', 'new_OGT', 'doi']]
OGT_journal = OGT_journal.rename(columns={'new_OGT':"OGT_journal"})
OGT_merge = merge_QA(OGT_journal)
OGT_merge.to_excel(f'dataset/4_merge_OGT_QA_{len(OGT_merge)}.xlsx', index = False)

# 4、Weight is removed by context similarity

In [86]:
def half_drop(data):  
    import difflib
    import pandas as pd
    
    def calculate_similarity(str1, str2):
        matcher = difflib.SequenceMatcher(None, str1, str2)
        return matcher.ratio()

    result = pd.DataFrame(columns=data.columns)

    for group_name, group_data in data.groupby(['taxonomy_id', 'OGT_journal']):
        if len(group_data) > 1:
            similar_rows = []
            dissimilar_rows = []

            for i, row1 in group_data.iterrows():
                is_dissimilar = True
                for j, row2 in group_data.iterrows():
                    if i != j:
                        similarity = calculate_similarity(row1['context'], row2['context'])
                        
                        if similarity >= 0.5:
                            similar_rows.append((similarity, row1))
                            is_dissimilar = False
                            
                if is_dissimilar:
                    dissimilar_rows.append(row1)
                    
            if similar_rows:
                max_similarity_row = max(similar_rows, key=lambda x: (x[0], -len(x[1]['context'])))
                result = result.append(max_similarity_row[1])
            else:
                for row in dissimilar_rows:
                    result = result.append(row)
            
        else:
            result = result.append(group_data)

    return result

In [87]:
OGT_merge = pd.read_excel('dataset/4_merge_OGT_QA_5014.xlsx')
result_dp = half_drop(OGT_merge)
result_dp = result_dp.sort_values(by="taxonomy_id", ascending = True)
result_dp.to_excel('dataset/5_Context_deduplication_{}.xlsx'.format(len(result_dp)), index = False)

  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = resul

  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(group_data)
  resul

  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(

  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_simil

  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result 

  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarit

  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_

  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.app

  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_ro

  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.

  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(ma

  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = 

  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_da

  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = resu

  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  r

  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_simila

  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = 

  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row

  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)

  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  

  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(

  result = result.append(row)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_s

  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  

  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(max_similarity_r

  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.appe

  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_sim

  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  re

  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = 

  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group

  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_dat

  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.appen

  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  r

  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(gr

  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  resu

  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max

  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])


  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result 

  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  re

  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row

  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(ro

  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(g

  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = re

  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = res

  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = resul

  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result =

  result = result.append(max_similarity_row[1])
  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = r

  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  resul

  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_

  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result

  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(row)
  result = result.append(row)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(row)
  result = result.append(row)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append

  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(max_similarity_row[1])
  result = result.append(group_data)
  result = result.append(group_data)
  result = result.append(max_similarity_ro

# 5、The data in the intersection of OGT data set and culture temperature data set were analyzed

In [91]:
def load_OGT_cultivation():
    OGT_cultivation = pd.read_csv('dataset/OGT_cultivation.tsv', sep = '\t')
    OGT_cultivation = OGT_cultivation[['taxid', 'organism', 'temperature']]
    OGT_cultivation = OGT_cultivation.rename(columns  = {'organism':'scientific_name',
                                                   'taxid':'taxonomy_id',
                                                   'temperature':'OGT_cultivation'})
    
    return OGT_cultivation

OGT_cultivation = load_OGT_cultivation()

def merge_ogt(OGT_journal, OGT_cultivation):
    merged_data = OGT_journal.merge(OGT_cultivation[['taxonomy_id', 'OGT_cultivation']], on='taxonomy_id', how='inner')

    return merged_data

OGT_journal = result_dp[['taxonomy_id', 'context', 'scientific_name', 'OGT_journal']]
OGT_cultivation = load_OGT_cultivation()
OGT_cultivation = OGT_cultivation.groupby('taxonomy_id')['OGT_cultivation'].mean().reset_index()
OGT_final = merge_ogt(OGT_journal, OGT_cultivation)
OGT_final['Deviation'] = abs(OGT_final['OGT_cultivation'] - OGT_final['OGT_journal']) 
OGT_final = OGT_final.sort_values(by="Deviation", ascending = True)
OGT_final.to_excel('dataset/6_Deviation_{}.xlsx'.format(len(OGT_final)), index = False)