In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re  # Import regex library
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Download stopwords (run once)
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [15]:
import gensim.downloader as api

# Load pretrained Word2Vec model (Google News)
word2vec_model = api.load("word2vec-google-news-300")



In [96]:
df = pd.read_csv('fake_and_real_news.csv')

In [17]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [97]:
df.head()

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


In [98]:
df.drop_duplicates(inplace=True)

In [99]:
df.dropna(inplace=True)


In [100]:
df.sample(5)

Unnamed: 0,Text,label
3363,Kentucky lawmaker a 'probable suicide' amid se...,Real
5109,A Fed Up Reporter Just Stood Up To Sarah Huck...,Fake
5142,"Senator Johnson: Trump argued for repealing, r...",Real
4241,Donald Trump Responds To Mockery Over Fake Sw...,Fake
7794,CFTC commissioner nominees pledge to complete ...,Real


In [101]:
df['Text'][0]

' Top Trump Surrogate BRUTALLY Stabs Him In The Back: ‘He’s Pathetic’ (VIDEO) It s looking as though Republican presidential candidate Donald Trump is losing support even from within his own ranks. You know things are getting bad when even your top surrogates start turning against you, which is exactly what just happened on Fox News when Newt Gingrich called Trump  pathetic. Gingrich knows that Trump needs to keep his focus on Hillary Clinton if he even remotely wants to have a chance at defeating her. However, Trump has hurt feelings because many Republicans don t support his sexual assault against women have turned against him, including House Speaker Paul Ryan (R-WI). So, that has made Trump lash out as his own party.Gingrich said on Fox News: Look, first of all, let me just say about Trump, who I admire and I ve tried to help as much as I can. There s a big Trump and a little Trump. The little Trump is frankly pathetic. I mean, he s mad over not getting a phone call? Trump s referr

In [102]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Fake,5000
Real,4865


In [103]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9865 entries, 0 to 9899
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    9865 non-null   object
 1   label   9865 non-null   object
dtypes: object(2)
memory usage: 489.2+ KB


In [104]:
def preprocess_text(text):
    # Remove URLs, numbers, and special characters
    text = re.sub(r'http\S+', '', text)  # URLs
    text = re.sub(r'\d+', '', text)       # Numbers
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', text)
    # Convert to lowercase and split into words
    words = text.lower().split()

    # Remove stopwords and short words (<3 letters)
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words ]

    # Use Lemmatization instead of Stemming (more accurate)
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]

    return ' '.join(lemmatized_words)

In [105]:
def get_word2vec_vector(text, model):
    """
    Convert a text into a vector by averaging the Word2Vec vectors of its words.
    """
    vectors = []
    for word in text:
        if word in model:  # Check if the word is in the Word2Vec vocabulary
            vectors.append(model[word])
    if vectors:  # If at least one word has a vector
        return np.mean(vectors, axis=0)
    else:  # If no words are in the vocabulary
        return np.zeros(model.vector_size)

# Apply preprocessing and vectorization to both columns
df['Text_vectors'] = df['Text'].apply(preprocess_text).apply(lambda x: get_word2vec_vector(x, word2vec_model))

In [106]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])

In [107]:
df['label_encoded'].value_counts()

Unnamed: 0_level_0,count
label_encoded,Unnamed: 1_level_1
0,5000
1,4865


In [108]:
from sklearn.model_selection import train_test_split

# Convert combined vectors to a numpy array
X = np.stack(df['Text_vectors'].values)
y = df['label_encoded']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [114]:
from sklearn.ensemble import RandomForestClassifier

# Train Random Forest model
rf_model = RandomForestClassifier(n_estimators=200,class_weight='balanced', random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate the model
train_score = rf_model.score(X_train, y_train)
test_score = rf_model.score(X_test, y_test)
print(f"Training Accuracy: {train_score:.4f}")
print(f"Test Accuracy: {test_score:.4f}")

Training Accuracy: 1.0000
Test Accuracy: 0.8576


In [115]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = rf_model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.83      0.85       977
           1       0.84      0.88      0.86       996

    accuracy                           0.86      1973
   macro avg       0.86      0.86      0.86      1973
weighted avg       0.86      0.86      0.86      1973

Confusion Matrix:
 [[815 162]
 [119 877]]


In [116]:
def predict_label(Text_vectors,  model, word2vec_model, label_encoder=None):

    # Preprocess the input texts
    Text_tokens = preprocess_text(Text_vectors)


    # Convert tokens to Word2Vec vectors
    new_text = get_word2vec_vector(Text_tokens, word2vec_model)



    # Reshape the combined vector to match the model's input format
    new_text = new_text.reshape(1, -1)

    # Predict the label
    predicted_label = model.predict(new_text)[0]

    # Decode the label (if LabelEncoder was used)
    if label_encoder:
        predicted_label = label_encoder.inverse_transform([predicted_label])[0]

    return predicted_label

In [117]:
# Example input texts
Text_vectors = "USA is a country"


# Predict the label
predicted_label = predict_label(Text_vectors,  rf_model, word2vec_model, label_encoder)
print("Predicted Label:", predicted_label)

Predicted Label: Real


In [91]:
import joblib

# Save the trained Random Forest model
joblib.dump(rf_model, 'random_forest_model.joblib')

# Save the LabelEncoder (if you used it)
joblib.dump(label_encoder, 'label_encoder.joblib')

print("Model, vectorizer, and label encoder saved successfully!")

Model, vectorizer, and label encoder saved successfully!
