In [15]:
import transformers

In [33]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd

In [17]:
model = transformers.BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### very basic and similar to https://arxiv.org/pdf/2005.09207.pdf

In [18]:
def encode_table_query(model, table, query):
    tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
    inputs = tokenizer.encode_plus(table, query, add_special_tokens=True, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1)

In [19]:
def retrieve_relevant_info(table_representation, query_representation):
    scores = torch.nn.functional.cosine_similarity(table_representation, query_representation)
    relevant_rows = scores.topk(k=1, dim=0).indices.tolist()
    return relevant_rows

In [20]:
table = "Name | Capital | Population\n------------------------------\nFrance| Paris  | 66 million\n------------------------------\nGermany| Berlin | 83 million\n------------------------------\nItaly | Rome   | 60 million"
query = "Capital and population of Cities?"

table_representation = encode_table_query(model, table, "")
query_representation = encode_table_query(model, "", query)
relevant_rows = retrieve_relevant_info(table_representation, query_representation)

print("Relevant rows:", relevant_rows)

Relevant rows: [0]


In [21]:
def encode_table(model, table, caption, reference):
    input_text = caption + " " + reference + " " + table
    input_ids = transformers.BertTokenizer.from_pretrained('bert-base-uncased').encode(input_text, return_tensors='pt')
    with torch.no_grad():
        outputs = model(input_ids)
    return outputs[0].mean(dim=1)

In [22]:
class RelevancePredictor(nn.Module):
    def __init__(self, hidden_size, num_classes):
        super(RelevancePredictor, self).__init__()
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        x = self.fc(x)
        return x

In [23]:
# Example usage
table = "Name | Capital | Population\n------------------------------\nFrance| Paris  | 66 million\n------------------------------\nGermany| Berlin | 83 million\n------------------------------\nItaly | Rome   | 60 million"
caption = "Information about countries"
reference = "Source: Wikipedia"

table_representation = encode_table(model, table, caption, reference)
model = RelevancePredictor(hidden_size=table_representation.size(-1), num_classes=1)
predictions = model(table_representation)

print("Relevance predictions:", predictions)

Relevance predictions: tensor([[-0.2970]], grad_fn=<AddmmBackward0>)


### Rows and columns with maximum number of non-zero values

In [28]:
# define the table as a numpy array
table = np.array([
    [ 'Country', 'Population', 'Area (km²)', 'GDP per capita (USD)'],
    [ 'USA', '331,449,281', '9,826,630', '62,794'],
    [ 'Canada', '38,048,738', '9,984,670', '48,265'],
    [ 'Mexico', '129,166,028', '1,964,375', '9,747'],
    [ 'Argentina', '45,197,956', '2,780,400', '11,866'],
    [ 'Brazil', '212,559,417', '8,515,767', '9,126']
])

# find rows with the maximum number of non-zero values
non_zero_counts = np.count_nonzero(table[1:], axis=1)
max_non_zero_count = np.max(non_zero_counts)
max_non_zero_rows = table[1:][non_zero_counts == max_non_zero_count]

print("Rows with maximum number of non-zero values:")
for row in max_non_zero_rows:
    print(row)

# find columns with the maximum number of non-zero values
non_zero_counts = np.count_nonzero(table[1:], axis=0)
max_non_zero_count = np.max(non_zero_counts)
max_non_zero_cols = table[:,non_zero_counts == max_non_zero_count]

print("Columns with maximum number of non-zero values:")
for col in max_non_zero_cols.T:
    print(col)

Rows with maximum number of non-zero values:
['USA' '331,449,281' '9,826,630' '62,794']
['Canada' '38,048,738' '9,984,670' '48,265']
['Mexico' '129,166,028' '1,964,375' '9,747']
['Argentina' '45,197,956' '2,780,400' '11,866']
['Brazil' '212,559,417' '8,515,767' '9,126']
Columns with maximum number of non-zero values:
['Country' 'USA' 'Canada' 'Mexico' 'Argentina' 'Brazil']
['Population' '331,449,281' '38,048,738' '129,166,028' '45,197,956'
 '212,559,417']
['Area (km²)' '9,826,630' '9,984,670' '1,964,375' '2,780,400' '8,515,767']
['GDP per capita (USD)' '62,794' '48,265' '9,747' '11,866' '9,126']


### Rows with the most frequent values as relevant

In [29]:
data = {
    "Country": ["USA", "Canada", "Mexico", "Argentina", "Brazil"],
    "Population": [331449281, 38048738, 129166028, 45197956, 212559417],
    "Area (km²)": [9826630, 9984670, 1964375, 2780400, 8515767],
    "GDP per capita (USD)": [62794, 48265, 9747, 11866, 9126]
}
table = pd.DataFrame(data)

most_frequent_row = table.mode().iloc[0]

relevant_rows = table[table.eq(most_frequent_row).any(axis=1)]

print(relevant_rows)

     Country  Population  Area (km²)  GDP per capita (USD)
1     Canada    38048738     9984670                 48265
2     Mexico   129166028     1964375                  9747
3  Argentina    45197956     2780400                 11866
4     Brazil   212559417     8515767                  9126


### Columns with the highest standard deviation as relevant

In [30]:
data = {
    "Country": ["USA", "Canada", "Mexico", "Argentina", "Brazil"],
    "Population": [331449281, 38048738, 129166028, 45197956, 212559417],
    "Area (km²)": [9826630, 9984670, 1964375, 2780400, 8515767],
    "GDP per capita (USD)": [62794, 48265, 9747, 11866, 9126]
}
table = pd.DataFrame(data)

highest_std_col = table.std().idxmax()

relevant_cols = table.loc[:, [highest_std_col]]

print(relevant_cols)

   Population
0   331449281
1    38048738
2   129166028
3    45197956
4   212559417


  highest_std_col = table.std().idxmax()


### Columns with a high percentage of non-empty cells

In [31]:
data = {
    "Country": ["USA", "Canada", "Mexico", "Argentina", "Brazil"],
    "Population": [331449281, 38048738, 129166028, 45197956, 212559417],
    "Area (km²)": [9826630, 9984670, 1964375, 2780400, 8515767],
    "GDP per capita (USD)": [62794, 48265, 9747, 11866, 9126]
}
table = pd.DataFrame(data)

non_empty_percent = table.count() / table.shape[0] * 100

relevant_cols = table.loc[:, non_empty_percent > 50]

print(relevant_cols)

     Country  Population  Area (km²)  GDP per capita (USD)
0        USA   331449281     9826630                 62794
1     Canada    38048738     9984670                 48265
2     Mexico   129166028     1964375                  9747
3  Argentina    45197956     2780400                 11866
4     Brazil   212559417     8515767                  9126


### Rows that contain aggregate data such as sums, averages, etc. can also be considered important

In [34]:
data = {'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'],
        'Population': [8336817, 3979576, 2693976, 2320268, 1680992],
        'Median Age': [36.4, 35.2, 34.1, 33.8, 33.6],
        'Average Income (USD)': [77000, 65000, 63000, 59000, 55000],
        'Average Rent (USD)': [2900, 2300, 1800, 1200, 1100]}

df = pd.DataFrame(data)

agg_cols = ['Population', 'Median Age', 'Average Income (USD)', 'Average Rent (USD)']

df['Mean Population'] = df[agg_cols].mean(axis=1)
df['Sum Avg Income and Rent'] = df['Average Income (USD)'] + df['Average Rent (USD)']

sorted_df = df.sort_values(by=['Mean Population', 'Sum Avg Income and Rent'], ascending=[False, False])

print(sorted_df)

          City  Population  Median Age  Average Income (USD)  \
0     New York     8336817        36.4                 77000   
1  Los Angeles     3979576        35.2                 65000   
2      Chicago     2693976        34.1                 63000   
3      Houston     2320268        33.8                 59000   
4      Phoenix     1680992        33.6                 55000   

   Average Rent (USD)  Mean Population  Sum Avg Income and Rent  
0                2900      2104188.350                    79900  
1                2300      1011727.800                    67300  
2                1800       689702.525                    64800  
3                1200       595125.450                    60200  
4                1100       434281.400                    56100  


### Correlation: Relevant rows and columns can be selected based on the correlation of the data in a column with the data in other columns.

| City      | Population | Area (km²) | Average temperature (°C) | Elevation (m) |
|-----------|------------|------------|--------------------------|---------------|
| Tokyo     | 13,515,271 | 2,187      | 16.2                     | 44            |
| New York  | 8,175,133  | 468.9      | 12.9                     | 10            |
| London    | 8,982,000  | 1,572      | 9.8                      | 35            |
| Paris     | 2,148,271  | 105.4      | 11.5                     | 35            |
| Istanbul  | 15,029,231 | 5,461      | 13.9                     | 110           |


In [35]:
data = {'City': ['Tokyo', 'New York', 'London', 'Paris', 'Istanbul'],
        'Population': [13515271, 8175133, 8982000, 2148271, 15029231],
        'Area (km²)': [2187, 468.9, 1572, 105.4, 5461],
        'Average temperature (°C)': [16.2, 12.9, 9.8, 11.5, 13.9],
        'Elevation (m)': [44, 10, 35, 35, 110]}
df = pd.DataFrame(data)

corr_matrix = df.corr()

relevant_columns = []
threshold = 0.8
for col in corr_matrix.columns:
    if max(corr_matrix[col]) > threshold:
        relevant_columns.append(col)

df_relevant = df[relevant_columns]

print(df_relevant)

   Population  Area (km²)  Average temperature (°C)  Elevation (m)
0    13515271      2187.0                      16.2             44
1     8175133       468.9                      12.9             10
2     8982000      1572.0                       9.8             35
3     2148271       105.4                      11.5             35
4    15029231      5461.0                      13.9            110


  corr_matrix = df.corr()
