In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [34]:
# load the baseline feature table from CSV (path is relative to the working directory)
baseline = pd.read_csv('../data/baseline_features.csv')

In [35]:
# preview the first rows to sanity-check the loaded columns
baseline.head()

Unnamed: 0,source,headline,headline_processed,headline_length,unique_ratio,frequency_ratio,sensational
0,reuters,Germany's Kuehne examines offer for Signa's Ha...,germani kuehn examin offer signa hamburg skysc...,10,1.0,0.0,0
1,reuters,Shoppers click 'buy' as retailers slash prices...,shopper click buy retail slash price ahead cyb...,11,1.0,0.0,0
2,reuters,US Black Friday sales rise 2.5% -Mastercard Sp...,u black friday sale rise mastercard spendingpuls,8,1.0,0.0,0
3,reuters,X may lose up to $75M by year-end on advertise...,x may lose yearend advertis exodu,11,1.0,0.0,0
4,reuters,Sri Lanka to OK Sinopec's $4.5 bln refinery pr...,sri lanka ok sinopec bln refineri propos monda...,12,1.0,0.0,0


In [36]:
# target: the 'sensational' label column
y = baseline.loc[:, 'sensational']

# features: everything left after removing the text/source columns and the target
X = baseline.drop(
    columns=[
        'headline',
        'headline_processed',
        'source',
        'sensational',
    ]
)

In [37]:
# hold out 20% of the rows for evaluation; fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=18,
)

In [38]:
# confirm the split sizes: (rows, columns) of the train and test feature matrices
X_train.shape, X_test.shape

((238, 3), (60, 3))

In [39]:
# random-forest classifier with 100 trees and a fixed seed for reproducibility
rfc = RandomForestClassifier(
    random_state=18,
    n_estimators=100,
)

In [40]:
# train the classifier on the training split
rfc.fit(X_train, y_train)

In [41]:
# predict labels for the held-out test split
y_pred = rfc.predict(X_test)

In [42]:
# rank the features by the fitted forest's importance scores, highest first
importance_by_feature = pd.Series(
    data=rfc.feature_importances_,
    index=X_train.columns,
)
feature_scores = importance_by_feature.sort_values(ascending=False)
feature_scores

frequency_ratio    0.524179
headline_length    0.368064
unique_ratio       0.107757
dtype: float64

In [43]:
# evaluate the test-set predictions: overall accuracy, confusion matrix
# (rows = true class, columns = predicted class) and per-class metrics
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
# sep="" stops print() from inserting a space right after the newline, which
# would push the first matrix row / report header out of column alignment
print("Confusion Matrix:\n", conf_matrix, sep="")
print("Classification Report:\n", classification_rep, sep="")

Accuracy: 0.6166666666666667
Confusion Matrix:
 [[ 9 13]
 [10 28]]
Classification Report:
               precision    recall  f1-score   support

           0       0.47      0.41      0.44        22
           1       0.68      0.74      0.71        38

    accuracy                           0.62        60
   macro avg       0.58      0.57      0.57        60
weighted avg       0.61      0.62      0.61        60



In [44]:
# load the news headline feature table from CSV and preview the first rows
news = pd.read_csv('../data/news_features.csv')
news.head()

Unnamed: 0,headline,headline_processed,headline_length,unique_ratio,frequency_ratio,source
0,superstar chef yannick alléno brings refined f...,superstar chef yannick alléno bring refin fren...,11,1.0,0.0,Forbes
1,nice claim top spot in ligue 1 with late win a...,nice claim top spot ligu late win clermont,12,1.0,0.0,CNA
2,amphibians are the world’s most vulnerable spe...,amphibian world vulner speci threat increas,11,0.909091,0.0,Time
3,image: rusty red waters in madagascar,imag rusti red water madagascar,6,1.0,0.0,Phys.Org
4,everything leaving max (formerly hbo max) in n...,everyth leav max formerli hbo max novemb,9,1.0,0.0,Digital Trends


In [45]:
# Select exactly the columns the model was trained on, in training order.
# Dropping the non-feature columns instead is fragile: a later cell adds a
# 'class' column to `news`, so re-running this cell would leak that column
# into the feature matrix and make rfc.predict fail. A missing feature column
# raises a KeyError here instead of surfacing later inside predict().
X_unlabeled = news[list(X_train.columns)]

In [46]:
# label every news headline with the trained model, then move the new
# 'class' column to the front of the frame
predictions = rfc.predict(X_unlabeled)
news['class'] = predictions
news = news[['class', *news.columns.drop('class')]]

In [60]:
# count headlines for each (source, predicted class) pair
news.groupby(['source','class']).size()

source                        class
ABC News                      0          345
                              1         1279
Al Jazeera English            0          444
                              1          911
AllAfrica - Top Africa News   0            2
                              1           12
Android Central               0           96
                              1          310
BBC News                      0          626
                              1          947
Boing Boing                   0          152
                              1          482
Business Insider              0          168
                              1         1606
CNA                           0          192
                              1          451
CNN                           0           39
                              1          179
Deadline                      0          142
                              1          598
Digital Trends                0          161
                   