In [1]:
import pandas as pd

# Load train and test data
train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('test_data.csv')

In [2]:
train_data.head()

Unnamed: 0,Date,URL,Title,Source,Country,Label
0,20240815T010000Z,https://borneobulletin.com.bn/explosions-repor...,Explosions reported near two ships off Yemen :...,borneobulletin.com.bn,Brunei,2
1,20240716T194500Z,https://www.hindustantimes.com/india-news/crew...,"Crew , including 13 Indians , still missing af...",hindustantimes.com,India,2
2,20240809T100000Z,https://www.yahoo.com/news/multiple-attacks-ta...,Multiple attacks target merchant ship off Yeme...,yahoo.com,United States,3
3,20240717T041500Z,https://timesofoman.com/article/147862-oil-tan...,Oil tanker with 13 Indians on board sinks off ...,timesofoman.com,Oman,2
4,20240812T201500Z,https://menafn.com/1108546043/Multiple-Attacks...,Multiple Attacks Target Merchant Ship Off Yemen,menafn.com,Qatar,3


In [3]:
test_data.head()

Unnamed: 0,Date,URL,Title,Source,Country
0,20221207T020000Z,https://www.rnz.co.nz/news/national/480280/eng...,Engineer fined over huge fire at Napier Port,rnz.co.nz,
1,20221221T150000Z,https://www.ship-technology.com/news/ictsi-lea...,ICTSI reaches 30 - year lease extension for Ba...,ship-technology.com,United States
2,20221018T084500Z,https://www.malaymail.com/news/money/mediaoutr...,DHL : Ocean freight rate moving towards manage...,malaymail.com,United States
3,20221028T151500Z,https://focustaiwan.tw/society/202210280021,Indonesians stuck on vessel in Kaohsiung set t...,focustaiwan.tw,Taiwan
4,20221018T104500Z,https://bdnews24.com/bangladesh/0ggpvbnije,Body found in container sent from Chattogram t...,bdnews24.com,Bangladesh


In [4]:
# Import necessary libraries for text processing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Download necessary resources for NLTK
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean text (remove stop words and lemmatize)
def clean_text(text):
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    
    # Remove punctuation and stop words, then apply lemmatization
    cleaned_tokens = [lemmatizer.lemmatize(word.lower()) for word in tokens 
                      if word.lower() not in stop_words and word not in string.punctuation]
    
    # Join tokens back to a single string
    cleaned_text = ' '.join(cleaned_tokens)
    return cleaned_text

# Apply the cleaning function to 'title' columns in train and test data
train_data['cleaned_title'] = train_data['Title'].apply(clean_text)
test_data['cleaned_title'] = test_data['Title'].apply(clean_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Regin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Regin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Regin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
train_data.head()

Unnamed: 0,Date,URL,Title,Source,Country,Label,cleaned_title
0,20240815T010000Z,https://borneobulletin.com.bn/explosions-repor...,Explosions reported near two ships off Yemen :...,borneobulletin.com.bn,Brunei,2,explosion reported near two ship yemen securit...
1,20240716T194500Z,https://www.hindustantimes.com/india-news/crew...,"Crew , including 13 Indians , still missing af...",hindustantimes.com,India,2,crew including 13 indian still missing oil tan...
2,20240809T100000Z,https://www.yahoo.com/news/multiple-attacks-ta...,Multiple attacks target merchant ship off Yeme...,yahoo.com,United States,3,multiple attack target merchant ship yemen uni...
3,20240717T041500Z,https://timesofoman.com/article/147862-oil-tan...,Oil tanker with 13 Indians on board sinks off ...,timesofoman.com,Oman,2,oil tanker 13 indian board sink oman coast
4,20240812T201500Z,https://menafn.com/1108546043/Multiple-Attacks...,Multiple Attacks Target Merchant Ship Off Yemen,menafn.com,Qatar,3,multiple attack target merchant ship yemen


In [6]:
test_data.head()

Unnamed: 0,Date,URL,Title,Source,Country,cleaned_title
0,20221207T020000Z,https://www.rnz.co.nz/news/national/480280/eng...,Engineer fined over huge fire at Napier Port,rnz.co.nz,,engineer fined huge fire napier port
1,20221221T150000Z,https://www.ship-technology.com/news/ictsi-lea...,ICTSI reaches 30 - year lease extension for Ba...,ship-technology.com,United States,ictsi reach 30 year lease extension baltic con...
2,20221018T084500Z,https://www.malaymail.com/news/money/mediaoutr...,DHL : Ocean freight rate moving towards manage...,malaymail.com,United States,dhl ocean freight rate moving towards manageab...
3,20221028T151500Z,https://focustaiwan.tw/society/202210280021,Indonesians stuck on vessel in Kaohsiung set t...,focustaiwan.tw,Taiwan,indonesian stuck vessel kaohsiung set return h...
4,20221018T104500Z,https://bdnews24.com/bangladesh/0ggpvbnije,Body found in container sent from Chattogram t...,bdnews24.com,Bangladesh,body found container sent chattogram malaysia


In [11]:
# Support Vector Machine (SVM) Algorithm

# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report


# Ensure the 'Cleaned_Title' and 'LABEL' columns are present in the dataset
assert 'Cleaned_Title' in df.columns and 'LABEL' in df.columns, "The dataset must have 'Cleaned_Title' and 'LABEL' columns."

# Convert the 'LABEL' column to string format if necessary
df['LABEL'] = df['LABEL'].astype(str)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['Cleaned_Title'], df['LABEL'], test_size=0.3, random_state=42)

# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train an SVM classifier
svm_model = SVC(kernel='linear', C=1.0, random_state=42, probability=True)
svm_model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = svm_model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Add predictions to the test data
X_test = X_test.reset_index(drop=True)  # Reset index to align with predictions
test_data_with_predictions = pd.DataFrame(X_test, columns=['Cleaned_Title'])
test_data_with_predictions['Predicted_Label'] = y_pred

# Save the entire test data with predictions to a CSV file
test_data_with_predictions.to_csv('svm_predictions.csv', index=False)

# Display the first few rows to verify
print(test_data_with_predictions.head())

# Function for classifying new titles using the trained model
def classify_new_title(title):
    title_tfidf = vectorizer.transform([title])
    prediction = svm_model.predict(title_tfidf)
    return prediction[0]

# Example prediction
example_title = "Tanker in Red Sea Targeted by Speedboat Gunfire"
predicted_category = classify_new_title(example_title)
print("Predicted Category:", predicted_category)


AssertionError: The dataset must have 'Cleaned_Title' and 'LABEL' columns.