In [1]:
# Import necessary libraries for data manipulation, regular expressions, and machine learning
import pandas as pd
import re

# Import scikit-learn utilities and classes
from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

In [2]:
# Load the training dataset
train = pd.read_csv('train.csv')

# Print the columns, shape (rows and columns), and the length (number of rows) of the training set
print("Training Set: Columns: {}, Shape: {}, Length: {}".format(train.columns, train.shape, len(train)))

# Load the test dataset
test = pd.read_csv('test.csv')

# Print the columns, shape (rows and columns), and the length (number of rows) of the test set
print("Test Set: Columns: {}, Shape: {}, Length: {}".format(test.columns, test.shape, len(test)))

Training Set: Columns: Index(['id', 'label', 'tweet'], dtype='object'), Shape: (31962, 3), Length: 31962
Test Set: Columns: Index(['id', 'tweet'], dtype='object'), Shape: (17197, 2), Length: 17197


In [3]:
# Define a function to clean text data in a DataFrame
def clean_text(df, text_field):
    # Convert text to lowercase
    df[text_field] = df[text_field].str.lower()
    # Apply regex to remove unwanted characters and patterns
    df[text_field] = df[text_field].apply(lambda elem: re.sub(
        r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))
    return df

# Clean the text data in the 'tweet' column of both the training and test datasets
test_clean = clean_text(test, "tweet")
train_clean = clean_text(train, "tweet")

In [4]:
# Split the training data into majority and minority classes
train_majority = train_clean[train_clean.label == 0]
train_minority = train_clean[train_clean.label == 1]

# Upsample the minority class to match the number of samples in the majority class
train_minority_upsampled = resample(
    train_minority, 
    replace=True,
    n_samples=len(train_majority),
    random_state=123
)

# Combine the majority class with the upsampled minority class
train_upsampled = pd.concat([train_minority_upsampled, train_majority])

# Check the distribution of the labels to ensure balanced classes
label_counts = train_upsampled['label'].value_counts()
print(label_counts)

label
1    29720
0    29720
Name: count, dtype: int64


In [5]:
# Define the pipeline with CountVectorizer, TfidfTransformer, and SGDClassifier
pipeline_sgd = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('sgd', SGDClassifier(random_state=123))
])

In [6]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    train_upsampled['tweet'], 
    train_upsampled['label'], 
    test_size=0.2, 
    random_state=123)

In [7]:
# Train the model
pipeline_sgd.fit(X_train, y_train)

# Make predictions
y_pred = pipeline_sgd.predict(X_val)

# Evaluate the model
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)

# Print evaluation metrics
print(f'Accuracy: {accuracy:.5f}')
print(f'Precision: {precision:.5f}')
print(f'Recall: {recall:.5f}')
print(f'F1 Score: {f1:.5f}')

Accuracy: 0.96846
Precision: 0.95224
Recall: 0.98679
F1 Score: 0.96921
