# Bag of Words Baseline
We train and evaluate a Bag-of-Words (BoW) logistic regression model to serve as a baseline against which our final model is benchmarked.

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import nltk
from nltk.corpus import stopwords
import string

# Make sure every NLTK resource referenced by this notebook is available locally
for resource in ('wordnet', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Read the training split and the dev split; the dev split is kept under the
# name `test_df` and is what the model is evaluated on below
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/dev.csv')
)

# Preprocessing function
stop_words = set(stopwords.words('english'))
# Individual punctuation characters as a set. Testing membership against the
# raw `string.punctuation` *string* is a substring test, which lets
# multi-character punctuation tokens such as "``", "''" or "..." slip through.
_punctuation_chars = frozenset(string.punctuation)


def preprocess_text(text):
    """Lowercase, tokenize and filter a single document.

    Parameters
    ----------
    text : str
        Raw document text. Missing values (``None`` / ``NaN``, e.g. an empty
        CSV cell) are treated as an empty document.

    Returns
    -------
    str
        The remaining tokens joined by single spaces. English stopwords and
        tokens made up solely of punctuation characters are removed.
    """
    # An empty CSV cell arrives as NaN (a float), which has no .lower()
    if pd.isna(text):
        return ''

    # Tokenize and convert to lowercase
    tokens = nltk.word_tokenize(text.lower())

    # Remove stopwords and punctuation-only tokens (of any length)
    kept = [
        word for word in tokens
        if word not in stop_words
        and not all(ch in _punctuation_chars for ch in word)
    ]
    return ' '.join(kept)

# Add a cleaned-text column to each split (training split first, then evaluation split)
for frame in (train_df, test_df):
    frame['processed_text'] = frame['text'].apply(preprocess_text)

# Features (cleaned text) and targets (labels) for each split
X_train, y_train = train_df['processed_text'], train_df['label']
X_test, y_test = test_df['processed_text'], test_df['label']

# Bag-of-Words counts: the vectorizer is fitted on the training text only and
# then reused, unchanged, to transform the evaluation text
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Fit the Logistic Regression baseline (fit() returns the fitted estimator)
clf = LogisticRegression().fit(X_train_bow, y_train)

# Predict labels for the evaluation split
y_pred = clf.predict(X_test_bow)

# Per-class precision / recall / F1 on the evaluation split
print(classification_report(y_test, y_pred))


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


              precision    recall  f1-score   support

           0       0.92      0.98      0.95      1895
           1       0.43      0.18      0.26       199

    accuracy                           0.90      2094
   macro avg       0.68      0.58      0.60      2094
weighted avg       0.87      0.90      0.88      2094



In [5]:
# Identify misclassified samples
misclassified_indices = [i for i in range(len(y_test)) if y_test.iloc[i] != y_pred[i]]
misclassified_df = pd.DataFrame({
    'text': test_df['text'].iloc[misclassified_indices],
    'true_label': y_test.iloc[misclassified_indices],
    'predicted_label': y_pred[misclassified_indices]
})

# Display or save the misclassified samples
print("\nMisclassified Samples:")
misclassified_df



Misclassified Samples:


Unnamed: 0,text,true_label,predicted_label
0,"His present "" chambers "" may be quite humble ,...",1,0
1,Krueger recently harnessed that creativity to ...,1,0
2,10:41am - Parents of children who died must ge...,1,0
4,We are alarmed to learn of your recently circu...,1,0
9,He depicts demonstrations by refugees at the b...,1,0
...,...,...,...
1862,BRITAIN 'S protracted campaign of budget cutti...,0,1
1898,Rather sad . Good set of pictures that tells a...,1,0
1912,""" This new project will see an active engageme...",0,1
1913,Veterans left on scrapheap : The homeless plig...,1,0


In [7]:
# Lift the column-width cap so each passage is shown in full instead of being
# cut off with "..."; note this changes the display option for the whole session
pd.options.display.max_colwidth = None
misclassified_df.head(5)

Unnamed: 0,text,true_label,predicted_label
0,"His present "" chambers "" may be quite humble , but Shiyani has the tiny space very neatly organized and clean . Many people pass him by but do not manage to see him , because the space is partially hidden behind trees , which gives him a relative privacy . "" There are many homeless sleeping around the station , "" Captain Xoli Mbele , from the nearby Johannesburg Central Police station said .",1,0
1,"Krueger recently harnessed that creativity to self-publish a book featuring the poems , artwork , photography and short stories of 16 ill or disabled artists from around the world . She hopes the book , which contains some of her own work as well , will show how talented disabled people can be .",1,0
2,"10:41am - Parents of children who died must get compensation , free medicine must be provided to poor families across UP : Ram Gopal Yadav",1,0
4,"We are alarmed to learn of your recently circulated proposals that would eviscerate the Lifeline program and leave many of the most vulnerable people in the country without access to affordable communications . As you are well aware , the Lifeline program provides a modest monthly subsidy of $9.25 to connect low-income Americans to phone and internet services . As broadband prices continue to soar , and affordability continues to suffer , adoption gaps remain . The Lifeline program has proven critical for poor families and people of color who are caught on the wrong side of the digital divide .",1,0
9,"He depicts demonstrations by refugees at the border post , their catastrophic living conditions and the desperate attempt of several hundred to cross a river a few kilometres from the camp to get into Macedonia on 14 March 2016 .",1,0
