In [24]:
import nltk
import string
import pandas as pd
import numpy as np

from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import mutual_info_classif, chi2, SelectKBest
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from scipy.stats import pearsonr
from scipy.sparse import csr_matrix

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

tokenizer = TreebankWordTokenizer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
count_vectorizer = CountVectorizer()
label_encoder = LabelEncoder()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/chamilkaudugoda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chamilkaudugoda/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/chamilkaudugoda/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [25]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import uniform, randint
from sklearn.metrics import classification_report, accuracy_score, make_scorer, f1_score, balanced_accuracy_score
import joblib
from scipy.sparse import csr_matrix

In [34]:
df = pd.read_csv('/Users/chamilkaudugoda/Documents/Master of Business Analytics/Module 3/Text Analytics for Business/Project/labelled_newscatcher_dataset.csv', delimiter=';')
corpus = list(df['title'])
labels = list(df['topic'])
y = label_encoder.fit_transform(labels)


# Data Pre-processing

In [35]:
def preprocess_text(text):
    tokens = tokenizer.tokenize(text)
    tokens = [word.lower() for word in tokens if word not in string.punctuation]
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

corpus_norm = [preprocess_text(title) for title in corpus]

X_train, X_test, y_train, y_test = train_test_split(corpus_norm, y, train_size=0.7, random_state=42)
print(f'Unique labels: {np.unique(y_train)}')

Unique labels: [0 1 2 3 4 5 6 7]


# Feature Extraction

In [16]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

len(tfidf_vectorizer.vocabulary_)

409007

In [17]:
X_train_tfidf.shape, X_test_tfidf.shape

((76141, 409007), (32633, 409007))

# Feauture Selection

In [18]:
selector = SelectKBest(chi2, k=1000)
X_train_reduc = selector.fit_transform(X_train_tfidf, y_train)
X_test_reduc = selector.transform(X_test_tfidf)

In [19]:
def calculate_correlations_optimized(X, threshold=0.8):
    X_dense = X.toarray()
    corr_matrix = np.corrcoef(X_dense, rowvar=False)
    to_remove = set()
    for i in range(corr_matrix.shape[0]):
        for j in range(i + 1, corr_matrix.shape[1]):
            if corr_matrix[i, j] > threshold:
                to_remove.add(j)
    return to_remove

# Identify highly correlated features
features_to_remove = calculate_correlations_optimized(X_train_reduc)

# Remove highly correlated features from TF-IDF matrices
X_train_tfidf_selected = csr_matrix(np.delete(X_train_reduc.toarray(), list(features_to_remove), axis=1))
X_test_tfidf_selected = csr_matrix(np.delete(X_test_reduc.toarray(), list(features_to_remove), axis=1))


In [20]:
print(f'Shape of X_train_tfidf_selected: {X_train_tfidf_selected.shape}')
print(f'Shape of X_test_tfidf_selected: {X_test_tfidf_selected.shape}')

Shape of X_train_tfidf_selected: (76141, 852)
Shape of X_test_tfidf_selected: (32633, 852)


# Random Forest

In [21]:
parameters = {
    'n_estimators': randint(50, 400),
    'max_depth': randint(1, 20),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 10),
    'max_samples': uniform(0.1, 0.9),
    'max_features': uniform(0.1, 0.9)
}

In [22]:
rf_clf = RandomForestClassifier()

In [26]:
random_search2 =RandomizedSearchCV(rf_clf, parameters, n_jobs = -1, cv = 3, n_iter = 20, scoring = make_scorer(balanced_accuracy_score))
random_search2.fit(X_train_tfidf_selected, y_train)

In [27]:
print(random_search2.best_params_)
print(random_search2.best_score_)

{'max_depth': 19, 'max_features': 0.10863713979606356, 'max_samples': 0.32967000949464453, 'min_samples_leaf': 7, 'min_samples_split': 2, 'n_estimators': 123}
0.38610232953508516


In [28]:
random_search_best2 = random_search2.best_estimator_
accur2 = random_search_best2.score(X_test_tfidf_selected, y_test)
print(accur2)

0.4057242668464438


In [29]:
cv = StratifiedKFold(n_splits=10)
cv_results = cross_val_score(random_search_best2, X_train_tfidf_selected, y_train, cv=cv, scoring=make_scorer(balanced_accuracy_score))
print(f'10-fold cross-validation Balanced Accuracy-score: {np.mean(cv_results)}')

10-fold cross-validation Balanced Accuracy-score: 0.3867378749318577


In [36]:
test_predictions = random_search_best2.predict(X_test_tfidf_selected)
bal_acc=balanced_accuracy_score(y_test, test_predictions)
report = classification_report(y_test, test_predictions, target_names=label_encoder.classes_)
print(report)
print("Balanced Accuracy: ", bal_acc)

               precision    recall  f1-score   support

     BUSINESS       0.86      0.31      0.46      4419
ENTERTAINMENT       0.21      0.93      0.34      4488
       HEALTH       0.54      0.66      0.59      4466
       NATION       0.69      0.04      0.08      4412
      SCIENCE       0.92      0.19      0.32      1144
       SPORTS       0.90      0.37      0.52      4551
   TECHNOLOGY       0.83      0.39      0.53      4558
        WORLD       0.76      0.20      0.31      4595

     accuracy                           0.41     32633
    macro avg       0.71      0.39      0.39     32633
 weighted avg       0.69      0.41      0.40     32633

Balanced Accuracy:  0.3859320950736679
