In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

column_names = [
    'id',
    'label',
    'statement',
    'subject',
    'speaker',
    'job_title',
    'state_info',
    'party_affiliation',
    'barely_true_counts',
    'false_counts',
    'half_true_counts',
    'mostly_true_counts',
    'pants_on_fire_counts',
    'context'
]

#Load the datasets
train_df = pd.read_csv('data/raw/train.tsv', sep='\t', names=column_names)
valid_df = pd.read_csv('data/raw/valid.tsv', sep='\t', names=column_names)
test_df = pd.read_csv('data/raw/test.tsv', sep='\t', names=column_names)

print(f"Training set shape: {train_df.shape}")


Training set shape: (10240, 13)


In [8]:
def create_binary_labels(df):
    #define mapping
    label_map = {
        'true': 1,
        'mostly-true': 1,
        'false' : 0,
        'pants-fire': 0,
        'barely-true': 0,
    }

    #create a copy to avoid SettingWithCopyWarning
    df_copy = df.copy()
    #apply mapping
    df_copy['label_binary'] = df_copy['label'].map(label_map)
    # drop rows where label is not in map
    df_copy.dropna(subset=['label_binary'], inplace=True)
    #convert new label column to int
    df_copy['label_binary'] = df_copy['label_binary'].astype(int)
    df_copy['statement'].fillna('', inplace=True) #handle potential missing statements
    return df_copy

# apply the function to each dataframe
train_binary_df = create_binary_labels(train_df)
valid_binary_df = create_binary_labels(valid_df)
test_binary_df = create_binary_labels(test_df)

print("-- Data shapes after converting to binary labels --")
print(f"Training set shape: {train_binary_df.shape}")
print(f"Validation set shape: {valid_binary_df.shape}")
print(f"Test set shape: {test_binary_df.shape}")    

print("\n-- Label distribution in training set --")
print(train_binary_df['label_binary'].value_counts())

-- Data shapes after converting to binary labels --
Training set shape: (0, 14)
Validation set shape: (0, 14)
Test set shape: (0, 14)

-- Label distribution in training set --
Series([], Name: count, dtype: int64)


In [9]:
# define the feature and target variables
x_train = train_binary_df['statement']
y_train = train_binary_df['label_binary']

x_valid = valid_binary_df['statement']
y_valid = valid_binary_df['label_binary']

#create pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1,2))),
    ('clf', LogisticRegression(solver='liblinear', random_state=42))
])

#rain the model
print("\n-- Training the model --")
pipeline.fit(x_train, y_train)
print("Model training completed.")

#make predictions on validation set
y_pred = pipeline.predict(x_valid)

#print classification report
print("\n-- Classification Report on Validation Set --")
print(classification_report(y_valid, y_pred, target_names=['False', 'True']))


-- Training the model --


ValueError: empty vocabulary; perhaps the documents only contain stop words