In [5]:
import pandas as pd


df=pd.read_csv("fake.csv")
cols=['title','text','domain_rank','country','type']
df_new=df[cols].copy()
df_new['domain_rank'] = df_new['domain_rank'].fillna(df_new['domain_rank'].median())
df_new['country'] = df_new['country'].fillna("unknown")
df_new = df_new.dropna(subset=['title','text'])


In [6]:
#clean text

import re #delete num,symbols
import string #provide the list of punctuation symbols
import nltk 
from nltk.corpus import stopwords #common words: the,is,at..
from nltk.stem import WordNetLemmatizer  #delete all the words with the same meaninng


nltk.download('stopwords')
stop_words=set(stopwords.words('english'))

nltk.download('wordnet')
lem=WordNetLemmatizer()

def clean_text(text):
    #if text is empty return blank
    if pd.isnull(text):
        return ""
    
    text=text.lower()
    #delete digits
    text=re.sub(r'\d+','',text)

    #delete puntuation
    text=text.translate(str.maketrans('','',string.punctuation))

    #delete stopwords
    text = " ".join(filter(lambda word: word not in stop_words, text.split()))
    
    
    #lemmatize
    text=" ".join([lem.lemmatize(word) for word in text.split()])


    #delete useless blanks
    text=text.strip()

    return text

df_new['title_clean']=df_new['title'].apply(clean_text)
df_new['text_clean']=df_new['text'].apply(clean_text)

print("Before:", df_new['title'].iloc[0])
print("After:", df_new['title_clean'].iloc[0])

[nltk_data] Downloading package stopwords to C:\Users\Ben Miss
[nltk_data]     i\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Ben Miss
[nltk_data]     i\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Before: Muslims BUSTED: They Stole Millions In Gov’t Benefits
After: muslim busted stole million gov’t benefit


In [7]:
#transform text+title from string to numerical
from sklearn.feature_extraction.text import TfidfVectorizer

df_trans=df_new['title_clean']+" "+df_new['text_clean']

vect=TfidfVectorizer(max_features=5000,ngram_range=(1,2),min_df=5,max_df=0.8)

tf_idf=vect.fit_transform(df_trans)

print(tf_idf.shape)

(12273, 5000)


In [8]:
#encode country

from sklearn.preprocessing import OneHotEncoder

encoder=OneHotEncoder()

country_encode=encoder.fit_transform(df_new[['country']])

country_df=pd.DataFrame(country_encode.toarray(),columns=encoder.get_feature_names_out(['country']))

df_new = df_new.reset_index(drop=True)
country_df = country_df.reset_index(drop=True)

df_encode=pd.concat([df_new,country_df],axis=1)
print(df_encode.head())
print(country_df.sum())

                                               title  \
0  Muslims BUSTED: They Stole Millions In Gov’t B...   
1  Re: Why Did Attorney General Loretta Lynch Ple...   
2  BREAKING: Weiner Cooperating With FBI On Hilla...   
3  PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...   
4  FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...   

                                                text  domain_rank country  \
0  Print They should pay all the back all the mon...      25689.0      US   
1  Why Did Attorney General Loretta Lynch Plead T...      25689.0      US   
2  Red State : \nFox News Sunday reported this mo...      25689.0      US   
3  Email Kayla Mueller was a prisoner and torture...      25689.0      US   
4  Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...      25689.0      US   

   type                                        title_clean  \
0  bias          muslim busted stole million gov’t benefit   
1  bias         attorney general loretta lynch plead fifth   
2  bias  break

In [9]:
#tf-idf+hotone

from scipy.sparse import hstack

features_new=country_df.values
x=hstack([tf_idf,features_new])
y=df_new['type'].values

print(x.shape)
print(y.shape)




(12273, 5025)
(12273,)


In [10]:
#normalizing numerical data
from sklearn.preprocessing import MinMaxScaler


scaler=MinMaxScaler()

df_new['domain_rank_scaled']=scaler.fit_transform(df_new[['domain_rank']])

#tf-idf+hotone+normalization

x_final=hstack([x,df_new[['domain_rank_scaled']].values])
y=df_new['type'].values

print(x_final.shape)
print(y.shape)
print(df_new[['text_clean','domain_rank_scaled']].head())





(12273, 5026)
(12273,)
                                          text_clean  domain_rank_scaled
0  print pay back money plus interest entire fami...            0.256668
1  attorney general loretta lynch plead fifth bar...            0.256668
2  red state fox news sunday reported morning ant...            0.256668
3  email kayla mueller prisoner tortured isi chan...            0.256668
4  email healthcare reform make america great sin...            0.256668


In [34]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

from sklearn.model_selection import train_test_split

# تقسيم البيانات
x_train, x_test, y_train, y_test = train_test_split(
    x_final, y, test_size=0.3, random_state=42, stratify=y
)

# labels: bs vs. others
y_binary_train = np.where(y_train == "bs", "bs", "others")
y_binary_test  = np.where(y_test == "bs", "bs", "others")


rf_binary = RandomForestClassifier(class_weight="balanced", random_state=42)
rf_binary.fit(x_train, y_binary_train)

y_pred_binary = rf_binary.predict(x_test)
print(classification_report(y_binary_test, y_pred_binary))


mask_train = (y_train != "bs")
mask_test = (y_test != "bs")

x_train_others, y_train_others = x_train[mask_train], y_train[mask_train]
x_test_others, y_test_others   = x_test[mask_test], y_test[mask_test]

rf_multi = RandomForestClassifier(class_weight="balanced", random_state=42)
rf_multi.fit(x_train_others, y_train_others)

y_pred_multi = rf_multi.predict(x_test_others)
print("Accuracy:", accuracy_score(y_test_others, y_pred_multi))
print("Confusion Matrix:\n", confusion_matrix(y_test_others, y_pred_multi))
print(classification_report(y_test_others, y_pred_multi))



              precision    recall  f1-score   support

          bs       0.91      1.00      0.95      3276
      others       0.98      0.23      0.37       406

    accuracy                           0.91      3682
   macro avg       0.95      0.61      0.66      3682
weighted avg       0.92      0.91      0.89      3682

Accuracy: 0.8669950738916257
Confusion Matrix:
 [[102   0   0   3   1   0   0]
 [  6 113   0   3   2   0   0]
 [  0   6   0   0   0   0   0]
 [  3  16   0  54   1   0   0]
 [  3   1   0   1  25   0   0]
 [  3   2   0   1   0  24   0]
 [  0   2   0   0   0   0  34]]
              precision    recall  f1-score   support

        bias       0.87      0.96      0.91       106
  conspiracy       0.81      0.91      0.86       124
        fake       0.00      0.00      0.00         6
        hate       0.87      0.73      0.79        74
     junksci       0.86      0.83      0.85        30
      satire       1.00      0.80      0.89        30
       state       1.00     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [35]:
from sklearn.ensemble import GradientBoostingClassifier

gb_model=GradientBoostingClassifier(n_estimators=100,random_state=42)
gb_model.fit(x_train,y_train)

y_pred_gb_train=gb_model.predict(x_train)
y_pred_gb=gb_model.predict(x_test)

print(accuracy_score(y_train,y_pred_gb_train))
print(accuracy_score(y_test,y_pred_gb))
print(confusion_matrix(y_test,y_pred_gb))
print(classification_report(y_test,y_pred_gb))

1.0
0.9633351439435089
[[  98    8    0    0    0    0    0    0]
 [   1 3231    5   11    9    0   13    6]
 [   0    7  115    1    0    0    1    0]
 [   0    6    0    0    0    0    0    0]
 [   0   40    0    0   34    0    0    0]
 [   0    0    0    0    0   30    0    0]
 [   0   20    0    0    0    0   10    0]
 [   0    7    0    0    0    0    0   29]]
              precision    recall  f1-score   support

        bias       0.99      0.92      0.96       106
          bs       0.97      0.99      0.98      3276
  conspiracy       0.96      0.93      0.94       124
        fake       0.00      0.00      0.00         6
        hate       0.79      0.46      0.58        74
     junksci       1.00      1.00      1.00        30
      satire       0.42      0.33      0.37        30
       state       0.83      0.81      0.82        36

    accuracy                           0.96      3682
   macro avg       0.74      0.68      0.71      3682
weighted avg       0.96      0.96   