<h1>
    Imports
</h1>

In [1]:
import re
import pandas as pd
import numpy as np
#from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.utils import resample
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.svm import SVC


<h1>Getting data and EDA</h1>

In [3]:
df = pd.read_csv("data/AStar_Maritime_Events_Data.csv")
df.head()

Unnamed: 0,idxEventList,Date,Headline,Description,Severity,PortCode,PortState,Country,Latitude,Longitude,Avg Affected Stay Duration by Event
0,67,20230721,Rail service resumes to and from the Port of H...,Updated local media sources report on July 28 ...,Severe,CAHAL,HALIFAX,Canada,44.628117,-63.562271,0.488889
1,70,20230724,Port of Mundra experiencing cargo backlog due ...,Maritime media sources report on July 24 that ...,Severe,INMUN,MUNDRA,India,22.74256,69.71233,8.871528
2,72,20230724,Port workers at Le Havre extend strike action ...,Updated intelligence received by Everstream An...,Severe,FRLEH,LE HAVRE,France,49.473257,0.152048,3.693287
3,78,20230719,Truck congestion reported at Port of Durban fo...,"On July 20, maritime sources revealed that the...",Moderate,ZADUR,DURBAN,South Africa,-29.882656,31.020382,20.145833
4,85,20230709,Truckers from Intergremial de Transporte Profe...,Media sources on July 9 indicate that the Inte...,Minor,UYMVD,MONTEVIDEO,Uruguay,-34.904917,-56.208573,41.102778


In [4]:
# Define stop words for English
stop_words = set(stopwords.words('english'))

In [5]:
# Function to clean and tokenize the text
def clean_text(text):
    # Remove non-alphabetic characters
    text = re.sub(r'\W', ' ', text)
    # Lowercase the text
    text = text.lower()
    # Tokenize and remove stopwords
    tokens = [word for word in text.split() if word not in stop_words]
    return ' '.join(tokens)


In [6]:
# Clean the 'content' column (assuming 'content' contains the article text and 'search_query' is the category)
df['cleaned_content'] = df['Headline'].fillna("").apply(clean_text)

In [7]:
# Split the data into features (content) and labels (categories)
X = df['cleaned_content']
y = df['Severity']

In [8]:
# Split into training and test data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


<h1>Multinomial Naïve Bayes Model, Vectorizer: IfidfVecotorizer</h1>

In [9]:
# Use CountVectorizer to convert the text into numerical data (word counts)
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [10]:
# Create and train the Naive Bayes model
# we use multinomial NB because its categorical data
nb_model = MultinomialNB()
nb_model.fit(X_train_vec, y_train)

In [11]:
# Predict on the test set
y_pred = nb_model.predict(X_test_vec)

In [12]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate precision
precision = precision_score(y_test, y_pred, average='weighted')  # Use 'binary' for binary classification or 'macro'/'micro' for multiclass

# Calculate recall
recall = recall_score(y_test, y_pred, average='weighted')  # Same here, use 'binary' or 'macro'/'micro' for multiclass


# Calculate F1 score
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the results
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Accuracy: 0.56
Precision: 0.47
Recall: 0.56
F1 Score: 0.47


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


<h1>Multinomial Naïve Bayes Model, Vectorizer: CountVectorizer</h1>

In [13]:
# Use CountVectorizer to convert the text into numerical data (word counts)
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [14]:
# Create and train the Naive Bayes model
# we use multinomial NB because its categorical data
nb_model = MultinomialNB()
nb_model.fit(X_train_vec, y_train)

In [15]:
# Predict on the test set
y_pred = nb_model.predict(X_test_vec)

In [16]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate precision
precision = precision_score(y_test, y_pred, average='weighted')  # Use 'binary' for binary classification or 'macro'/'micro' for multiclass

# Calculate recall
recall = recall_score(y_test, y_pred, average='weighted')  # Same here, use 'binary' or 'macro'/'micro' for multiclass


# Calculate F1 score
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the results
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Accuracy: 0.59
Precision: 0.57
Recall: 0.59
F1 Score: 0.55


<h1>New Test Case</h1>

In [17]:
# Function to classify new articles
def classify_new_article(article_text):
    cleaned_text = clean_text(article_text)
    vectorized_text = vectorizer.transform([cleaned_text])
    predicted_category = nb_model.predict(vectorized_text)[0]
    return predicted_category

In [18]:
# Example usage: Classify a new article
new_article = "trucks"
predicted_category = classify_new_article(new_article)
print(f"Predicted Category: {predicted_category}")

Predicted Category: Moderate


<h1>SVM Model, Tokenizer: Word2Vec</h1>

In [19]:
# Use CountVectorizer to convert the text into numerical data (word counts)
X_train_tokenized = [sentence.split() for sentence in X_train]
X_test_tokenized = [sentence.split() for sentence in X_test]
word2vec_model = Word2Vec(sentences=X_train_tokenized, vector_size=500, window=1, min_count=1, workers=5)

In [20]:
def get_avg_word2vec_vector(words, model, vector_size):
    vector = np.zeros(vector_size)
    count = 0
    
    for word in words:
        if word in model.wv:
            vector += model.wv[word]
            count += 1
    
    if count > 0:
        vector /= count
    
    return vector

In [21]:
# Vector size is the same as what you set in the Word2Vec model
vector_size = word2vec_model.vector_size

# Convert training and test data
X_train_vec = np.array([get_avg_word2vec_vector(sentence, word2vec_model, vector_size) for sentence in X_train_tokenized])
X_test_vec = np.array([get_avg_word2vec_vector(sentence, word2vec_model, vector_size) for sentence in X_test_tokenized])

In [22]:
# Create and train the Naive Bayes model
# we use multinomial NB because its categorical data
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf', 'poly']}
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=2)
grid.fit(X_train_vec, y_train)
print("Best Parameters:", grid.best_params_)
y_pred = grid.predict(X_test_vec)
svm_model = SVC(kernel='poly', C=10)
svm_model.fit(X_train_vec, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] END ...............................C=0.1, kernel=linear; total time=   0.0s
[CV] END ...............................C=0.1, kernel=linear; total time=   0.0s
[CV] END ...............................C=0.1, kernel=linear; total time=   0.0s
[CV] END ...............................C=0.1, kernel=linear; total time=   0.0s
[CV] END ...............................C=0.1, kernel=linear; total time=   0.0s
[CV] END ..................................C=0.1, kernel=rbf; total time=   0.0s
[CV] END ..................................C=0.1, kernel=rbf; total time=   0.0s
[CV] END ..................................C=0.1, kernel=rbf; total time=   0.0s
[CV] END ..................................C=0.1, kernel=rbf; total time=   0.0s
[CV] END ..................................C=0.1, kernel=rbf; total time=   0.0s
[CV] END .................................C=0.1, kernel=poly; total time=   0.0s
[CV] END .................................C=0.1, 

In [23]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate precision
precision = precision_score(y_test, y_pred, average='weighted')  # Use 'binary' for binary classification or 'macro'/'micro' for multiclass

# Calculate recall
recall = recall_score(y_test, y_pred, average='weighted')  # Same here, use 'binary' or 'macro'/'micro' for multiclass


# Calculate F1 score
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the results
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Accuracy: 0.56
Precision: 0.59
Recall: 0.56
F1 Score: 0.54


In [24]:
nb_model = GaussianNB()
nb_model.fit(X_train_vec, y_train)
y_pred_nb = nb_model.predict(X_test_vec)


In [25]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_nb)

# Calculate precision
precision = precision_score(y_test, y_pred_nb, average='weighted')  # Use 'binary' for binary classification or 'macro'/'micro' for multiclass

# Calculate recall
recall = recall_score(y_test, y_pred_nb, average='weighted')  # Same here, use 'binary' or 'macro'/'micro' for multiclass


# Calculate F1 score
f1 = f1_score(y_test, y_pred_nb, average='weighted')

# Print the results
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Accuracy: 0.48
Precision: 0.57
Recall: 0.48
F1 Score: 0.48


<h1>Trying without text cleaning</h1>

In [26]:
df.head()
# Function to clean and tokenize the text
def clean_text(text):
    # Remove non-alphabetic characters
    # text = re.sub(r'\W', ' ', text)
    # Lowercase the text
    text = text.lower()
    # Tokenize and remove stopwords
    tokens = [word for word in text.split() if word not in stop_words]
    return ' '.join(tokens)
    
# Clean the 'content' column (assuming 'content' contains the article text and 'search_query' is the category)
df['cleaned_content'] = df['Headline'].fillna("").apply(clean_text)

X = df['cleaned_content']
y = df['Severity']

# Split into training and test data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:

# we use multinomial NB because its categorical data
nb_model = MultinomialNB()
nb_model.fit(X_train_vec, y_train)
# Predict on the test set
y_pred = nb_model.predict(X_test_vec)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate precision
precision = precision_score(y_test, y_pred, average='weighted')  # Use 'binary' for binary classification or 'macro'/'micro' for multiclass

# Calculate recall
recall = recall_score(y_test, y_pred, average='weighted')  # Same here, use 'binary' or 'macro'/'micro' for multiclass


# Calculate F1 score
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the results
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

ValueError: Negative values in data passed to MultinomialNB (input X)

### We found that without cleaning we got roughly similar scores to the model

<h1>Logistic Regression, Vectorizer: Word2Vec
Vectorizer</h1>

In [56]:
# Function to clean and tokenize the text
def clean_text(text):
    # Remove non-alphabetic characters
    text = re.sub(r'\W', ' ', text)
    # Lowercase the text
    text = text.lower()
    # Tokenize and remove stopwords
    tokens = [word for word in text.split() if word not in stop_words]
    return tokens

# Clean and tokenize the 'Headline' column
df['tokens'] = df['Headline'].fillna("").apply(clean_text)

# Train Word2Vec model on the tokens
word2vec_model = Word2Vec(sentences=df['tokens'], vector_size=100, window=5, min_count=1, workers=4)

# Function to get the average Word2Vec vector for a document
def get_avg_word2vec(tokens, model, vector_size):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(vector_size)

# Apply the function to create averaged Word2Vec embeddings
df['avg_word2vec'] = df['tokens'].apply(lambda x: get_avg_word2vec(x, word2vec_model, 100))

# Define features and target variable
X = np.vstack(df['avg_word2vec'].values)
y = df['Severity']

# Split into training and test data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the Logistic Regression model for multiclass classification
logreg_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
logreg_model.fit(X_train, y_train)

# Predict on the test set
y_pred = logreg_model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the results
print("Logistic Regression with Word2Vec Performance:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Logistic Regression with Word2Vec Performance:
Accuracy: 0.53
Precision: 0.59
Recall: 0.53
F1 Score: 0.37


  _warn_prf(average, modifier, msg_start, len(result))


<h1>SVM, Vectorizer: Word2Vec Vectorizer</h1>

In [59]:
# Build the SVM model for multiclass classification
svm_model = LinearSVC()
svm_model.fit(X_train, y_train)

# Predict on the test set
y_pred = svm_model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the results
print("SVM with Word2Vec Performance:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

SVM with Word2Vec Performance:
Accuracy: 0.53
Precision: 0.59
Recall: 0.53
F1 Score: 0.37


  _warn_prf(average, modifier, msg_start, len(result))


<h1>Logistic Regression, Vectorizer: tf-idf Vectorizer</h1>

In [60]:
# Function to clean and tokenize the text
def clean_text(text):
    # Remove non-alphabetic characters
    text = re.sub(r'\W', ' ', text)
    # Lowercase the text
    text = text.lower()
    # Tokenize and remove stopwords
    tokens = [word for word in text.split() if word not in stop_words]
    return ' '.join(tokens)

# Clean the 'Headline' column (assuming 'Headline' contains the article text)
df['cleaned_content'] = df['Headline'].fillna("").apply(clean_text)

# Define features and target variable
X = df['cleaned_content']
y = df['Severity']

# Split into training and test data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data using TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Build the Logistic Regression model for multiclass classification
logreg_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
logreg_model.fit(X_train_vec, y_train)

# Predict on the test set
y_pred = logreg_model.predict(X_test_vec)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')  # 'weighted' for multiclass
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the results
print("Logistic Regression Model Performance:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Logistic Regression Model Performance:
Accuracy: 0.56
Precision: 0.47
Recall: 0.56
F1 Score: 0.49


  _warn_prf(average, modifier, msg_start, len(result))


<h1> SVM , Vectorizer: tf-idf Vectorizer</h1>

In [29]:
# Function to clean and tokenize the text
def clean_text(text):
    # Remove non-alphabetic characters
    text = re.sub(r'\W', ' ', text)
    # Lowercase the text
    text = text.lower()
    # Tokenize and remove stopwords
    tokens = [word for word in text.split() if word not in stop_words]
    return ' '.join(tokens)

# Clean the 'Headline' column (assuming 'Headline' contains the article text)
df['cleaned_content'] = df['Description'].fillna("").apply(clean_text)

# Define features and target variable
X = df['cleaned_content']
y = df['Severity']

# Split into training and test data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data using TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Build the SVM model for multiclass classification
svm_model = LinearSVC()
svm_model.fit(X_train_vec, y_train)

# Predict on the test set
y_pred = svm_model.predict(X_test_vec)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')  # 'weighted' for multiclass
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the results
print("SVM Model Performance:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

SVM Model Performance:
Accuracy: 0.56
Precision: 0.58
Recall: 0.56
F1 Score: 0.52


<h1>Logistic Regression, Vectorizer: CountVectorizer</h1>

In [30]:
# Function to clean and tokenize the text
def clean_text(text):
    # Remove non-alphabetic characters
    text = re.sub(r'\W', ' ', text)
    # Lowercase the text
    text = text.lower()
    # Tokenize and remove stopwords
    tokens = [word for word in text.split() if word not in stop_words]
    return ' '.join(tokens)

# Clean the 'Headline' column (assuming 'Headline' contains the article text)
df['cleaned_content'] = df['Description'].fillna("").apply(clean_text)

# Define features and target variable
X = df['cleaned_content']
y = df['Severity']

# Split into training and test data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data using TF-IDF Vectorizer
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Build the Logistic Regression model for multiclass classification
logreg_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
logreg_model.fit(X_train_vec, y_train)

# Predict on the test set
y_pred = logreg_model.predict(X_test_vec)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')  # 'weighted' for multiclass
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the results
print("Logistic Regression Model Performance:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Logistic Regression Model Performance:
Accuracy: 0.53
Precision: 0.54
Recall: 0.53
F1 Score: 0.50




<h1> SVM , Vectorizer: CountVectorizer</h1>

In [28]:
# Function to clean and tokenize the text
def clean_text(text):
    # Remove non-alphabetic characters
    text = re.sub(r'\W', ' ', text)
    # Lowercase the text
    text = text.lower()
    # Tokenize and remove stopwords
    tokens = [word for word in text.split() if word not in stop_words]
    return ' '.join(tokens)

# Clean the 'Headline' column (assuming 'Headline' contains the article text)
df['cleaned_content'] = df['Description'].fillna("").apply(clean_text)

# Define features and target variable
X = df['cleaned_content']
y = df['Severity']

# Split into training and test data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data using TF-IDF Vectorizer
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Build the SVM model for multiclass classification
svm_model = LinearSVC()
svm_model.fit(X_train_vec, y_train)

# Predict on the test set
y_pred = svm_model.predict(X_test_vec)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')  # 'weighted' for multiclass
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the results
print("SVM Model Performance:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

SVM Model Performance:
Accuracy: 0.51
Precision: 0.50
Recall: 0.51
F1 Score: 0.49




<h1>Bianry split for the Logisitic Regression instead of a many way split</h1>

In [66]:
y.value_counts()

Severity
Moderate    486
Minor       277
Severe       84
Name: count, dtype: int64

In [69]:
# Function to clean and tokenize the text
def clean_text(text):
    # Remove non-alphabetic characters
    text = re.sub(r'\W', ' ', text)
    # Lowercase the text
    text = text.lower()
    # Tokenize and remove stopwords
    tokens = [word for word in text.split() if word not in stop_words]
    return ' '.join(tokens)

# Clean the 'Headline' column
df['cleaned_content'] = df['Headline'].fillna("").apply(clean_text)

# Convert 'Severity' to a binary target: 1 for 'Severe', 0 for others
df['is_severe'] = df['Severity'].apply(lambda x: 1 if x == 'Severe' else 0)

# Define features and target variable
X = df['cleaned_content']
y = df['is_severe']

# Split into training and test data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Vectorize the text data using CountVectorizer
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Build the Logistic Regression model for binary classification
logreg_model = LogisticRegression()
logreg_model.fit(X_train_vec, y_train)

# Predict on the test set
y_pred = logreg_model.predict(X_test_vec)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the results
print("Binary Logistic Regression (Severe vs. Non-Severe) Performance:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Binary Logistic Regression (Severe vs. Non-Severe) Performance:
Accuracy: 0.91
Precision: 1.00
Recall: 0.06
F1 Score: 0.11


<h1>Random Forrest with Class Weighting</h1>

In [71]:
# Function to clean and tokenize the text
def clean_text(text):
    # Remove non-alphabetic characters
    text = re.sub(r'\W', ' ', text)
    # Lowercase the text
    text = text.lower()
    # Tokenize and remove stopwords
    tokens = [word for word in text.split() if word not in stop_words]
    return ' '.join(tokens)

# Clean the 'Headline' column
df['cleaned_content'] = df['Headline'].fillna("").apply(clean_text)

# Convert 'Severity' to a binary target: 1 for 'Severe', 0 for others
df['is_severe'] = df['Severity'].apply(lambda x: 1 if x == 'Severe' else 0)

# Define features and target variable
X = df['cleaned_content']
y = df['is_severe']

# Split into training and test data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data using CountVectorizer
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Build the Random Forest model with class weighting
rf_model = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_model.fit(X_train_vec, y_train)

# Predict on the test set
y_pred = rf_model.predict(X_test_vec)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the results
print("Random Forest with Class Weighting (Severe vs. Non-Severe) Performance:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Random Forest with Class Weighting (Severe vs. Non-Severe) Performance:
Accuracy: 0.85
Precision: 0.60
Recall: 0.11
F1 Score: 0.19


In [75]:
# Function to clean and tokenize the text
def clean_text(text):
    # Remove non-alphabetic characters
    text = re.sub(r'\W', ' ', text)
    # Lowercase the text
    text = text.lower()
    # Tokenize and remove stopwords
    tokens = [word for word in text.split() if word not in stop_words]
    return ' '.join(tokens)

# Clean the 'Headline' column
df['cleaned_content'] = df['Headline'].fillna("").apply(clean_text)

# Convert 'Severity' to a binary target: 1 for 'Severe', 0 for others
df['is_severe'] = df['Severity'].apply(lambda x: 1 if x == 'Severe' else 0)

# Separate the majority and minority classes
df_majority = df[df['is_severe'] == 0]
df_minority = df[df['is_severe'] == 1]

# Downsample the majority class to match the minority class size
df_majority_downsampled = resample(df_majority, 
                                   replace=False,    # sample without replacement
                                   n_samples=len(df_minority),  # match minority class count
                                   random_state=42)

# Combine the downsampled majority class with the minority class
df_balanced = pd.concat([df_majority_downsampled, df_minority])

# Define features and target variable for the balanced dataset
X = df_balanced['cleaned_content']
y = df_balanced['is_severe']

# Split into training and test data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data using CountVectorizer
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Build the Gradient Boosting model
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train_vec, y_train)

# Predict on the test set
y_pred = gb_model.predict(X_test_vec)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the results
print("Gradient Boosting (Severe vs. Non-Severe) Performance:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Gradient Boosting (Severe vs. Non-Severe) Performance:
Accuracy: 0.76
Precision: 0.71
Recall: 0.71
F1 Score: 0.71
