In [4]:
def read_dataframe(tsv_file: str) -> pd.DataFrame:
    
    # creates a "dataframe" or "df" for short. This is similar to a 2-D python dict.
    df = pd.read_csv(tsv_file, delimiter='\t', dtype=object)
    
    # replaces all "null" or "NaN" values with an empty string
    df.fillna("", inplace=True)
    
    # labels the columns in the dataset using the data dictionary described in the README
    df.columns = [
        'id',                # Column 1: the ID of the statement ([ID].json).
        'label',             # Column 2: the label.
        'statement',         # Column 3: the statement.
        'subjects',          # Column 4: the subject(s).
        'speaker',           # Column 5: the speaker.
        'speaker_job_title', # Column 6: the speaker's job title.
        'state_info',        # Column 7: the state info.
        'party_affiliation', # Column 8: the party affiliation.
        
        # Column 9-13: the total credit history count, including the current statement.
        'count_1', # barely true counts.
        'count_2', # false counts.
        'count_3', # half true counts.
        'count_4', # mostly true counts.
        'count_5', # pants on fire counts.
        
        'context' # Column 14: the context (venue / location of the speech or statement).
    ]
    
    return df

In [7]:
train_liar = read_dataframe(r"C:\Users\thevi\Downloads\Project\liar_dataset\train.tsv")
# Drop all columns except 'label' and 'statement'
columns_to_keep = ['label', 'statement']
train_liar = train_liar[columns_to_keep]

print(train_liar.head())

         label                                          statement
0    half-true  When did the decline of coal start? It started...
1  mostly-true  Hillary Clinton agrees with John McCain "by vo...
2        false  Health care reform legislation is likely to ma...
3    half-true  The economic turnaround started at the end of ...
4         true  The Chicago Bears have had more starting quart...


In [8]:
# Get the counts of each label
label_counts = train_liar['label'].value_counts()

print(label_counts)

label
half-true      2114
false          1994
mostly-true    1962
true           1676
barely-true    1654
pants-fire      839
Name: count, dtype: int64


In [10]:
# Function to categorize labels as 'Fake' or 'Real'
def categorize_label(label):
    if label in ['false', 'barely-true', 'pants-fire']:
        return 'Fake'
    elif label in ['half-true', 'mostly-true', 'true']:
        return 'Real'
    else:
        return label  # For any label that doesn't match the above

# Apply the function to the 'label' column
train_liar['label'] = train_liar['label'].apply(categorize_label)

# Get the counts of each category
label_counts = train_liar['label'].value_counts()

print(label_counts)
# Save the combined dataset to a new CSV file
train_liar.to_csv("C:\\Users\\thevi\\Downloads\\Project\\archive\\train_liar.csv", index=False)

label
Real    5752
Fake    4487
Name: count, dtype: int64


In [16]:
valid_liar = read_dataframe(r"C:\Users\thevi\Downloads\Project\liar_dataset\valid.tsv")
# Drop all columns except 'label' and 'statement'
columns_to_keep = ['label', 'statement']
valid_liar = valid_liar[columns_to_keep]
# Apply the function to the 'label' column
valid_liar['label'] = valid_liar['label'].apply(categorize_label)
# Save the combined dataset to a new CSV file
valid_liar.to_csv("C:\\Users\\thevi\\Downloads\\Project\\archive\\valid_liar.csv", index=False)
print(valid_liar.head())

  label                                          statement
0  Fake  When Obama was sworn into office, he DID NOT u...
1  Fake  Says Having organizations parading as being so...
2  Real     Says nearly half of Oregons children are poor.
3  Real  On attacks by Republicans that various program...
4  Fake  Says when armed civilians stop mass shootings ...


In [17]:
test_liar = read_dataframe(r"C:\Users\thevi\Downloads\Project\liar_dataset\test.tsv")
# Drop all columns except 'label' and 'statement'
columns_to_keep = ['label', 'statement']
test_liar = test_liar[columns_to_keep]
# Apply the function to the 'label' column
test_liar['label'] = test_liar['label'].apply(categorize_label)
# Save the combined dataset to a new CSV file
test_liar.to_csv("C:\\Users\\thevi\\Downloads\\Project\\archive\\test_liar.csv", index=False)
print(test_liar.head())

  label                                          statement
0  Fake  Wisconsin is on pace to double the number of l...
1  Fake  Says John McCain has done nothing to help the ...
2  Real  Suzanne Bonamici supports a plan that will cut...
3  Fake  When asked by a reporter whether hes at the ce...
4  Real  Over the past five years the federal governmen...
