In [16]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import joblib

# Load the labelled articles, keep only the three columns used below, and
# drop every row that has a missing value in any of them.
# Written as a single non-mutating chain instead of dropna(inplace=True) on a
# column subset: the cell is idempotent on re-run and avoids the
# SettingWithCopyWarning that in-place edits of a subset can emit on older
# pandas versions.
train_df = (
    pd.read_csv("/content/articles.csv", encoding='latin1')
    .loc[:, ['Heading', 'Full_Article', 'Article_Type']]
    .dropna()
)



In [17]:
# Turn the article bodies into TF-IDF features.
# The vocabulary is capped at 1000 terms and English stop words are removed.
# The fitted `vectorizer` is kept because later cells reuse it on scraped text.
tfidf_options = {'max_features': 1000, 'stop_words': 'english'}
vectorizer = TfidfVectorizer(**tfidf_options)

# Labels come straight from the Article_Type column.
y_train = train_df['Article_Type']
# Learn the vocabulary and produce the training matrix in one pass.
X_train = vectorizer.fit_transform(train_df['Full_Article'])



In [18]:
# Train the classifier (an SVM with scikit-learn's default settings) on the
# TF-IDF features.
clf = SVC()
clf.fit(X_train, y_train)

# Persist the fitted vectorizer alongside the model: the classifier only
# accepts features produced by this exact vectorizer, so a saved model without
# it cannot be used for prediction in a fresh session.
joblib.dump(vectorizer, 'your_vectorizer.pkl')

# Save the trained model to disk (kept as the last expression so the cell
# still displays the model's file name).
joblib.dump(clf, 'your_model.pkl')



['your_model.pkl']

In [21]:
# Read the list of unlabelled article URLs; only the first 10 rows are loaded.
unknown_df = pd.read_csv(
    "/content/unknown_articles.csv",
    nrows=10,
)

# Container that will receive one prediction per URL.
article_predictions = list()





In [22]:
# Scrape every URL, extract its paragraph text, and classify it with the
# fitted vectorizer and classifier. Exactly one entry is appended per URL
# (None on any failure), so the list lines up row-for-row with unknown_df.

# Reset the results here so that re-running this cell on its own cannot
# accumulate stale predictions and break the length match with unknown_df.
article_predictions = []

for url in unknown_df['Article.URL']:
    try:
        # A timeout keeps one unresponsive host from hanging the whole run.
        response = requests.get(url, timeout=10)
        # Fail on 4xx/5xx instead of classifying the text of an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Adjust the code to extract the relevant content based on the website structure
        # For example, if the article text is in <p> tags, you can extract it as follows:
        article_text = ' '.join(p.get_text() for p in soup.find_all('p'))

        # A page without paragraph text would be vectorized to all zeros and
        # still receive a label; report it as a failure instead.
        if not article_text.strip():
            raise ValueError("no paragraph text found on page")

        # Vectorize the article content using the same vectorizer
        X_unknown = vectorizer.transform([article_text])

        # Predict the Article_type
        prediction = clf.predict(X_unknown)
        article_predictions.append(prediction[0])
    except Exception as e:
        # Best-effort scraping: log the problem and keep the row with no label.
        print(f"Error processing {url}: {e}")
        article_predictions.append(None)

In [24]:
# Attach the per-URL predictions as a new column of the URL table.
unknown_df['Predicted_Article_type'] = article_predictions

# Show each URL next to its predicted article type.
report_columns = ['Article.URL', 'Predicted_Article_type']
print(unknown_df[report_columns])

                                         Article.URL Predicted_Article_type
0  http://australianaviation.com.au/2018/10/a-com...               Military
1  http://australianaviation.com.au/2018/10/victo...             Commercial
2  http://australianaviation.com.au/2018/10/army-...               Military
3  https://attain.news/community/special-sea-king...               Military
4  https://m.ariva.de/amp/ad-hoc-airbus-board-of-...             Executives
5  http://m.ariva.de/amp/u-s-army-pilots-fly-auto...             Commercial
6  https://www.arabianaerospace.aero/kuwait-h225-...             Commercial
7  https://www.atlasinfo.fr/Marrakech-Air-Show-20...             Commercial
8  https://www.atlasinfo.fr/Des-shows-aeriens-en-...             Commercial
9  https://www.airmedandrescue.com/story/113203/l...             Commercial
