In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split

In [14]:
liar_train_fp = '../../train.tsv'
liar_val_fp = '../../valid.tsv'
liar_test_fp = '../../test.tsv'

liar_train_out_fp = '../../Clean_Liar_Train.csv'
liar_val_out_fp = '../../Clean_Liar_Val.csv'
liar_test_out_fp = '../../Clean_Liar_Test.csv'

In [3]:
col_names = ['file', 'politifact_label', 'text', 'topic', 'speaker', 'job', 'home_state',
             'political_party', 'pants_on_fire_hist_count', 'false_hist_count',
             'barely_true_hist_count', 'half_true_hist_count',
             'mostly_true_hist_count', 'source']
use_cols = ['politifact_label', 'text', 'topic', 'speaker', 'job', 'home_state',
            'political_party', 'source']
train = pd.read_csv(liar_train_fp, sep='\t', header=None, names=col_names, usecols=use_cols)
val = pd.read_csv(liar_val_fp, sep='\t', header=None, names=col_names, usecols=use_cols)
test = pd.read_csv(liar_test_fp, sep='\t', header=None, names=col_names, usecols=use_cols)

In [4]:
print(f'train: {train.shape}')
print(f'val: {val.shape}')
print(f'test: {test.shape}')

train: (10240, 8)
val: (1284, 8)
test: (1267, 8)


In [5]:
df = pd.concat([train, val, test])
print(f'full data: {df.shape}')

full data: (12791, 8)


In [6]:
df.politifact_label.value_counts(normalize=True)

half-true      0.205379
false          0.195997
mostly-true    0.191854
barely-true    0.164412
true           0.160503
pants-fire     0.081854
Name: politifact_label, dtype: float64

In [8]:
def preprocess_data(df, min_words=6, out_fp=None):
    # create binary label
    real_labels = ['true', 'mostly-true']
    fake_labels = ['false', 'pants-fire']
    conditions = [df['politifact_label'].isin(real_labels), df['politifact_label'].isin(fake_labels)]
    choices = [1, 0]
    df.loc[:, 'label'] = np.select(conditions, choices, default=np.nan)
    
    # remove rows not included in binary label
    rows_to_remove = df[df.label.isna()].shape[0]
    print(f'removing {rows_to_remove} rows ({rows_to_remove / df.shape[0]*100:0.2f}%) outside of binary categories')
    df = df[df.label.isna() == False]
    
    # remove rows with fewer than min_words
    print(f'removing {df[df.text.str.split().str.len() >= min_words].shape[0]} rows with fewer than {min_words} words')
    df = df[df.text.str.split().str.len() >= min_words]
    if out_fp is not None:
        df.to_csv(out_fp, index=False)
    return df

In [9]:
clean = preprocess_data(df)
clean.shape

removing 4730 rows (36.98%) outside of binary categories
removing 7947 rows with fewer than 6 words


(7947, 9)

In [10]:
test_prop = 0.2
clean_train, clean_test = train_test_split(clean, test_size=test_prop)
clean_train, clean_val = train_test_split(clean_train, test_size=test_prop)

In [11]:
print(f'train size: {clean_train.shape}')
print(f'val size: {clean_val.shape}')
print(f'test size: {clean_test.shape}')

train size: (5085, 9)
val size: (1272, 9)
test size: (1590, 9)


In [15]:
clean_train.to_csv(liar_train_out_fp, index=False)
clean_val.to_csv(liar_val_out_fp, index=False)
clean_test.to_csv(liar_test_out_fp, index=False)

In [None]:
clean_train.head(3)

In [None]:
# look at random samples
rand_idxs = np.random.randint(0, df.index.max(), 5)
for text in df.loc[rand_idxs,'text']:
    print(text, '\n')

In [None]:
df.text.str.split().str.len().describe()

In [None]:
min_words = 6
short_texts = df[df.text.str.split().str.len() < min_words]
print(f'{short_texts.shape[0]} rows with fewer than {min_words} words')

In [None]:
for text in short_texts.sample(n=5)['text']:
    print(text, '\n')

In [None]:
train.text.str.split().str.len().hist()
plt.show()

In [None]:
# real_labels = ['true', 'mostly-true']
# fake_labels = ['false', 'pants-fire']
# conditions = [train.politifact_label.isin(real_labels), train.politifact_label.isin(fake_labels)]
# choices = [1, 0]
    
# train.loc[:, 'label'] = np.select(conditions, choices, default=np.nan)
# train.loc[:,['politifact_label', 'text', 'label']].head(10)

In [None]:
# what % will be removed?
train[train.label.isna()].shape[0] / train.shape[0]

In [None]:
def calc_params(n, m, c_in, c_out):
    return (n * m * c_in + 1) * c_out

In [None]:
n = 5
m = 5
c_in = 128
c_out = 128
calc_params(n, m, c_in, c_out)

In [None]:
import torchtext

In [None]:
fake_fp = '../data/Fake.csv'
real_fp = '../data/True.csv'

In [None]:
fake = pd.read_csv(fake_fp)
fake.head()

In [None]:
fake.shape

In [None]:
real = pd.read_csv(real_fp)
real.head()

In [None]:
real.shape

In [None]:
# add labels before concatenating
fake.loc[:,'label'] = 0
real.loc[:,'label'] = 1

In [None]:
df = pd.concat([fake, real], ignore_index=True)
df.head()

In [None]:
df.shape

In [None]:
df.label.value_counts(normalize=True)

In [None]:
df = df[['text', 'label']]
df.shape

In [None]:
text_lengths = df.text.str.len()
print(f'average text length (chars): {text_lengths.mean():0.2f}')

In [None]:
text_lengths.describe()

In [None]:
from sklearn.model_selection import train_test_split
X = df['text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
train = pd.concat([X_train, y_train], axis=1)
train.head()

In [None]:
test = pd.concat([X_test, y_test], axis=1)
test.head()

In [None]:
train.label.value_counts(normalize=True)

In [None]:
test.label.value_counts(normalize=True)

In [None]:
# train_fp = '../data/train.csv'
# test_fp = '../data/test.csv'
# train.to_csv(train_fp)
# test.to_csv(test_fp)

In [None]:
batch = 16
max_seq_len = 100
emb_dim = 768
n_filters = [128, 128, 128]
filter_sizes = [3, 4, 5]

X = torch.randn((batch, max_seq_len, emb_dim))
X.shape

In [None]:
conv_list = [nn.Conv1d(max_seq_len, n_filters[i], filter_sizes[i]) for i in range(len(n_filters))]
conv_list

In [None]:
conv_output = [conv(X) for conv in conv_list]
conv_output[0].shape
for conv in conv_output:
    print(f'output_dim: {conv.shape}')

In [None]:
pooled_output = [nn.MaxPool1d(5)(X) for X in conv_output]
for pooled in pooled_output:
    print(f'output_dim: {pooled.shape}')

In [None]:
torch.cat([X for X in pooled_output], dim=2).shape

In [None]:
x_embed = torch.randn((128,768))
x_embed.shape

In [None]:
x_embed.permute(1, 0).unsqueeze(0).shape

In [None]:
x_embed.unsqueeze(0).permute(0,2,1).shape

In [None]:
X.shape

In [None]:
loss = nn.CrossEntropyLoss()
inp = torch.randn(3, 5, requires_grad=True)
target = torch.empty(3, dtype=torch.long).random_(5)
# output = loss(inp, target)
# output.backward()

In [None]:
inp.shape

In [None]:
target.shape

In [None]:
logits = torch.randn((128,128,2))
logits.shape

In [None]:
logits.squeeze(0).shape