In [17]:
import pandas as pd          # Data manipulation
import nltk                 # Natural Language Processing (NLP)
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split   # For splitting the data
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer   # Feature extraction
from sklearn.naive_bayes import MultinomialNB          # Naive Bayes classifier
from sklearn.metrics import classification_report, accuracy_score   # Model evaluation

In [18]:
import pandas as pd

# Read the CSV file into a DataFrame, skipping lines with parsing errors
data = pd.read_csv('/Users/joannestremy/Documents/Global/newsCorpora.csv', on_bad_lines='skip', header=None, usecols=[0])

# Now you can work with the 'data' DataFrame


In [19]:
# Display the first few rows
print(data.head())

# Check for missing values
print(data.isnull().sum())

# Examine data types
print(data.dtypes)


                                                   0
0   1\tFed official says weak data caused by weather
1  2\tFed's Charles Plosser sees high bar for cha...
2  3\tUS open: Stocks fall after Fed official hin...
3            4\tFed risks falling 'behind the curve'
4  5\tFed's Plosser: Nasty Weather Has Curbed Job...
0    0
dtype: int64
0    object
dtype: object


In [20]:
# Removing Duplicates
data.drop_duplicates(inplace=True)

# Converting to Lowercase
data[0] = data[0].str.lower()

# Handling Missing Values
data[0].fillna("", inplace=True)


In [21]:
import nltk
nltk.download('punkt')

# Tokenize the headlines
data['tokens'] = data[0].apply(nltk.word_tokenize)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/joannestremy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [22]:
from nltk.corpus import stopwords

# Download stopwords data (you only need to do this once)
import nltk
nltk.download('stopwords')

# Remove stopwords from each token list
stop_words = set(stopwords.words('english'))
data['tokens'] = data['tokens'].apply(lambda tokens: [word for word in tokens if word not in stop_words])


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joannestremy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
from nltk.stem import PorterStemmer

# Create a stemmer
stemmer = PorterStemmer()

# Apply stemming to each token list
data['tokens'] = data['tokens'].apply(lambda tokens: [stemmer.stem(word) for word in tokens])


In [24]:
import re

# Define a function to remove special characters from a list of tokens
def remove_special_characters(tokens):
    cleaned_tokens = [re.sub(r'[^a-zA-Z0-9\s]', '', token) for token in tokens]
    return cleaned_tokens

# Apply the function to the 'tokens' column
data['tokens'] = data['tokens'].apply(remove_special_characters)


In [25]:
# Display the first few rows
print(data.head())


                                                   0  \
0   1\tfed official says weak data caused by weather   
1  2\tfed's charles plosser sees high bar for cha...   
2  3\tus open: stocks fall after fed official hin...   
3            4\tfed risks falling 'behind the curve'   
4  5\tfed's plosser: nasty weather has curbed job...   

                                              tokens  
0   [1, fed, offici, say, weak, data, caus, weather]  
1  [2, fed, s, charl, plosser, see, high, bar, ch...  
2  [3, us, open, , stock, fall, fed, offici, hint...  
3               [4, fed, risk, fall, behind, curv, ]  
4  [5, fed, s, plosser, , nasti, weather, curb, j...  


In [26]:
# Define the categories and their associated keywords
category_keywords = {
    'politics': ['politics', 'government', 'election', 'president', 'congress', 'senate'],
    'sports': ['sports', 'football', 'basketball', 'baseball', 'soccer', 'athlete'],
    'technology': ['technology', 'innovation', 'software', 'AI', 'cybersecurity', 'internet'],
    'business': ['business', 'economy', 'finance', 'stocks', 'market', 'entrepreneur'],
    'health': ['health', 'medicine', 'wellness', 'pandemic', 'vaccine', 'doctor'],
    'entertainment': ['entertainment', 'celebrity', 'movies', 'music', 'Hollywood', 'film'],
}


In [27]:
# Function for categorization
def categorize_tokens(tokens):
    category_counts = {}
    for category, keywords in category_keywords.items():
        count = sum(1 for token in tokens if token in keywords)
        category_counts[category] = count
    return max(category_counts, key=category_counts.get)

# Apply categorization function and create a new column
data['category'] = data['tokens'].apply(categorize_tokens)


In [28]:
# View the first few rows of the DataFrame
print(data.head())


                                                   0  \
0   1\tfed official says weak data caused by weather   
1  2\tfed's charles plosser sees high bar for cha...   
2  3\tus open: stocks fall after fed official hin...   
3            4\tfed risks falling 'behind the curve'   
4  5\tfed's plosser: nasty weather has curbed job...   

                                              tokens  category  
0   [1, fed, offici, say, weak, data, caus, weather]  politics  
1  [2, fed, s, charl, plosser, see, high, bar, ch...  politics  
2  [3, us, open, , stock, fall, fed, offici, hint...  politics  
3               [4, fed, risk, fall, behind, curv, ]  politics  
4  [5, fed, s, plosser, , nasti, weather, curb, j...  politics  


In [29]:
# Save result as csv
data.to_csv('output.csv', index=False)

In [30]:
from sklearn.preprocessing import LabelEncoder

# Extract the category labels
train_labels = data['category']

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Convert string labels to numerical labels
train_labels_encoded = label_encoder.fit_transform(train_labels)

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib

# Split the data
train_data, test_data, train_labels_encoded, test_labels = train_test_split(
    data['tokens'], train_labels_encoded, test_size=0.2, random_state=42
)

# Convert the token lists to strings
train_data = train_data.apply(' '.join)
test_data = test_data.apply(' '.join)

# Convert text data to numerical features (TF-IDF representation)
vectorizer = TfidfVectorizer()
train_data_tfidf = vectorizer.fit_transform(train_data)
test_data_tfidf = vectorizer.transform(test_data)

# Create a Naive Bayes classifier
clf = MultinomialNB()

# Train the model
clf.fit(train_data_tfidf, train_labels_encoded)

# Save the trained model to a file
model_filename = 'news_categorization_model.pkl'
joblib.dump(clf, model_filename)

# Make predictions
predictions_encoded = clf.predict(test_data_tfidf)

from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Convert string labels to numerical labels
test_labels_encoded = label_encoder.fit_transform(test_labels)

# Convert numerical predictions to string labels
predictions = label_encoder.inverse_transform(predictions_encoded)

# Evaluate the model
report = classification_report(test_labels, predictions, zero_division=1)
print(report)


              precision    recall  f1-score   support

           0       1.00      0.00      0.00      1002
           1       1.00      0.00      0.01      1539
           2       1.00      0.00      0.00      1028
           3       0.95      1.00      0.98     79082
           4       1.00      0.00      0.00        28
           5       1.00      0.00      0.00       443

    accuracy                           0.95     83122
   macro avg       0.99      0.17      0.16     83122
weighted avg       0.95      0.95      0.93     83122



In [38]:
import pandas as pd          # Data manipulation
import nltk                 # Natural Language Processing (NLP)
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split   # For splitting the data
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer   # Feature extraction
from sklearn.naive_bayes import MultinomialNB          # Naive Bayes classifier
from sklearn.metrics import classification_report, accuracy_score   # Model evaluation

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
import joblib

# Load the trained model from the saved file
model_filename = 'news_categorization_model.pkl'
clf = joblib.load(model_filename)

# Read the new news articles CSV file
data = pd.read_csv('2pageSessions.csv', on_bad_lines='skip', header=None, usecols=[0])

# Removing Duplicates
data.drop_duplicates(inplace=True)

# Converting to Lowercase
data[0] = data[0].str.lower()

# Handling Missing Values
data[0].fillna("", inplace=True)

# Tokenize the headlines
data['tokens'] = data[0].apply(nltk.word_tokenize)

from nltk.stem import PorterStemmer
import nltk
from nltk.corpus import stopwords


# Remove stopwords from each token list
stop_words = set(stopwords.words('english'))
data['tokens'] = data['tokens'].apply(lambda tokens: [word for word in tokens if word not in stop_words])

# Create a stemmer
stemmer = PorterStemmer()

# Apply stemming to each token list
data['tokens'] = data['tokens'].apply(lambda tokens: [stemmer.stem(word) for word in tokens])

import re

# Define a function to remove special characters from a list of tokens
def remove_special_characters(tokens):
    cleaned_tokens = [re.sub(r'[^a-zA-Z0-9\s]', '', token) for token in tokens]
    return cleaned_tokens

# Apply the function to the 'tokens' column
data['tokens'] = data['tokens'].apply(remove_special_characters)

# Convert token lists to strings for feature extraction
data['processed_text'] = data['tokens'].apply(' '.join)

# Convert text data to numerical features (TF-IDF representation)
vectorizer = TfidfVectorizer()
train_data_tfidf = vectorizer.fit_transform(train_data)
test_data_tfidf = vectorizer.transform(test_data)

# Make predictions using the loaded model
predictions_encoded = clf.predict(test_data_tfidf)

# Convert numerical predictions back to original labels
predictions = label_encoder.inverse_transform(predictions_encoded)

# Print the predictions
print(predictions)

print("Accuracy:", accuracy_score(predictions_encoded, predictions))
print("Classification Report:\n", classification_report(predictions_encoded, predictions))


[3 3 3 ... 3 3 3]
Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         1
           3       1.00      1.00      1.00     83116

    accuracy                           1.00     83122
   macro avg       1.00      1.00      1.00     83122
weighted avg       1.00      1.00      1.00     83122

