In [13]:

# prompt: Project Overview
# In the file dataset/data.csv, you will find a dataset containing news articles with the following columns:
# label: 0 if the news is fake, 1 if the news is real.
# title: The headline of the news article.
# text: The full content of the article.
# subject: The category or topic of the news.
# date: The publication date of the article.
# Your goal is to build a classifier that is able to distinguish between the two.
# Once you have a classifier built, then use it to predict the labels for dataset/validation_data.csv. Generate a new file where the label 2 has been replaced by 0 (fake) or 1 (real) according to your model. Please respect the original file format, do not include extra columns, and respect the column separator.
# Please ensure to split the data.csv into training and test datasets before using it for model training or evaluation.
# Guidance
# Like in a real life scenario, you are able to make your own choices and text treatment. Use the techniques you have learned and the common packages to process this data and classify the text.



## Import libraries

In [14]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# 1- Load and explore the data

In [46]:
# Load the dataset
df = pd.read_csv('/content/data.csv')

In [16]:
# Display dataset information
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39942 entries, 0 to 39941
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    39942 non-null  int64 
 1   title    39942 non-null  object
 2   text     39942 non-null  object
 3   subject  39942 non-null  object
 4   date     39942 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.5+ MB
None


In [17]:
# Check for missing values
print(df.isnull().sum())

label      0
title      0
text       0
subject    0
date       0
dtype: int64


In [18]:
# Check the distribution of labels (real vs. fake news)
print(df['label'].value_counts())

label
1    19999
0    19943
Name: count, dtype: int64


In [19]:
df.head()

Unnamed: 0,label,title,text,subject,date
0,1,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,1,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,1,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,1,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


###

## Select a Representative Sample


In [20]:
#A. Random Small Sample
sample_df = df.sample(n=5000, random_state=42,)  # Select 5000 random records

#B. Balanced Sample (Equal Number of Fake and Real News)
fake_count = min(2500, len(df[df['label'] == 0]))
real_count = min(2500, len(df[df['label'] == 1]))

fake_news = df[df['label'] == 0].sample(n=fake_count, random_state=42)
real_news = df[df['label'] == 1].sample(n=real_count, random_state=42)
balanced_sample_df = pd.concat([fake_news, real_news])

In [21]:
# Analyze the Selected Sample
print(sample_df['label'].value_counts())  # Check class distribution
print(sample_df.describe())  # General statistics
print(sample_df['title'].apply(lambda x: len(str(x).split())).describe())  # Word count in titles

label
0    2516
1    2484
Name: count, dtype: int64
            label
count  5000.00000
mean      0.49680
std       0.50004
min       0.00000
25%       0.00000
50%       0.00000
75%       1.00000
max       1.00000
count    5000.00000
mean       12.38620
std         3.96381
min         1.00000
25%        10.00000
50%        11.00000
75%        14.00000
max        42.00000
Name: title, dtype: float64


In [22]:
# First Create the directory
os.makedirs('dataset', exist_ok=True)

# Save the Sample
sample_df.to_csv('dataset/sample_data.csv', index=False)

print("Sample dataset saved successfully!")

Sample dataset saved successfully!


# 2- Build a Classical NLP Model

---



##1. Preprocessing the Data


In [23]:

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')

# Function to lemmatize text
lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    tokens = word_tokenize(text.lower())  # Tokenize and convert to lowercase
    return " ".join([lemmatizer.lemmatize(word) for word in tokens])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


## 2. Preprocessing the Text


In [25]:
# Preprocess text: Lemmatization and remove rare words
X = sample_df['title'].apply(lemmatize_text)

# Convert text data into TF-IDF features (with n-grams and rare word removal)
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), min_df=5, max_features=5000)
X_tfidf = vectorizer.fit_transform(X)


In [26]:
# Preprocess text: Lemmatization and remove rare words
X = sample_df['title'].apply(lemmatize_text)

# Print a sample of the lemmatized titles
print("Sample of lemmatized titles:")
print(X.sample(5))

# Convert text data into TF-IDF features (with n-grams and rare word removal)
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), min_df=5, max_features=5000)
X_tfidf = vectorizer.fit_transform(X)

# Print the shape of the resulting TF-IDF matrix
print(f"Shape of the TF-IDF matrix: {X_tfidf.shape}")

# Optionally, print a sample of the feature names (words/features) created
print("Sample of feature names:")
print(vectorizer.get_feature_names_out()[:20])  # Show the first 20 features


Sample of lemmatized titles:
22818    conway : mexico won ’ t pay for the wall becau...
34460    why would obama allow green beret to be discha...
4738     factbox : a look at u.s. healthcare spending a...
3715     top house democrat pelosi say independent russ...
36950    chicago thug watched 9 yr old play on swing be...
Name: title, dtype: object
Shape of the TF-IDF matrix: (5000, 2216)
Sample of feature names:
['000' '10' '100' '11' '12' '13' '14' '15' '16' '18' '19' '20' '2015'
 '2016' '2018' '21' '25' '30' '400' '4th']


##3. Train the Classifier


In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, sample_df['label'], test_size=0.2, random_state=42, stratify=sample_df['label'])

# Train Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


## 4. Evaluate the Model


In [28]:
# Make predictions
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy: {accuracy:.2f}')


Model Accuracy: 0.91


## 5. Save the Best Model

In [29]:
import joblib

# Save the model and the vectorizer
joblib.dump(model, 'fake_news_classifier_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

print("Model and vectorizer saved successfully!")


Model and vectorizer saved successfully!


In [30]:
from sklearn.metrics import confusion_matrix, classification_report

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))


Confusion Matrix:
[[441  62]
 [ 26 471]]
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.88      0.91       503
           1       0.88      0.95      0.91       497

    accuracy                           0.91      1000
   macro avg       0.91      0.91      0.91      1000
weighted avg       0.91      0.91      0.91      1000



# 3- Build a Word2Vec-based classifier

---



In [31]:
!pip install gensim


Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m65.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scipy, gensim
  Attempting uninstall: scipy
    Found existing installation: scipy 1.14.1
    Uninstalling scipy-1.14.1:
      Successfully 

In [32]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [33]:
from google.colab import drive
drive.mount('/content/drive')
%cd  /content/drive/MyDrive
%cd /content/drive/MyDrive/project SDA


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive
[Errno 2] No such file or directory: '/content/drive/MyDrive/project SDA'
/content/drive/MyDrive


In [34]:
ls

[0m[01;34m'Colab Notebooks'[0m/
 dl-image-classification-final.ipynb
 efficientnet_model.ipynb
 exercises-checkpoint.ipynb
[01;34m'intro to python'[0m/
[01;34m'LAB | AI Server Detective Lab'[0m/
 lab-decision-tree.ipynb
 [01;34mlab-functions-en-master[0m/
 lab-logistic-regression-with-python.ipynb
 [01;34mlab-neural-networks-master[0m/
 lab-regex_in_python.ipynb
 main.ipynb
 main_s.ipynb
 Prework_exercises.ipynb
'Project_1_Deep_Learning_Image_Classification_with_CNN (1).ipynb'
'Project_1_Deep_Learning_Image_Classification_with_CNN (2).ipynb'
 Project_1_Deep_Learning_Image_Classification_with_CNN.ipynb
[01;34m'Project 1: Deep Learning-Image Classification with CNN Week 3  Project 1: Deep Learning-Image Classification with CNN'[0m[K/
 Project_1_week3.ipynb
 [01;34mProject2_NLP_Challenge[0m/
 Transfer_learning.ipynb
 Updated_Project_1_Deep_Learning_Image_Classification_with_CNN.ipynb


In [35]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
from gensim.models import KeyedVectors
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from sklearn.model_selection import train_test_split

In [36]:
# Define variables
MAX_NUM_WORDS = 20000  # Maximum number of unique words
MAX_SEQUENCE_LENGTH = 500  # Standardizing sequence length
EMBEDDING_DIM = 300  # Word2Vec embedding dimension
# Load data
data = pd.read_csv("/content/data.csv")
data_validation = pd.read_csv("/content/validation_data.csv")

In [37]:
print(data.head())
print(data_validation.head())

   label                                              title  \
0      1  As U.S. budget fight looms, Republicans flip t...   
1      1  U.S. military to accept transgender recruits o...   
2      1  Senior U.S. Republican senator: 'Let Mr. Muell...   
3      1  FBI Russia probe helped by Australian diplomat...   
4      1  Trump wants Postal Service to charge 'much mor...   

                                                text       subject  \
0  WASHINGTON (Reuters) - The head of a conservat...  politicsNews   
1  WASHINGTON (Reuters) - Transgender people will...  politicsNews   
2  WASHINGTON (Reuters) - The special counsel inv...  politicsNews   
3  WASHINGTON (Reuters) - Trump campaign adviser ...  politicsNews   
4  SEATTLE/WASHINGTON (Reuters) - President Donal...  politicsNews   

                 date  
0  December 31, 2017   
1  December 29, 2017   
2  December 31, 2017   
3  December 30, 2017   
4  December 29, 2017   
   label                                              ti

In [38]:
# Text cleaning
def clean_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'\W+', ' ', text)
        words = text.split()[:MAX_SEQUENCE_LENGTH]  # Truncate long texts
        return " ".join(words)
    return ""

data['clean_text'] = data['text'].apply(clean_text)
data_validation['clean_text'] = data_validation['text'].apply(clean_text)

# Check if 'clean_text' column exists
if 'clean_text' not in data.columns or 'clean_text' not in data_validation.columns:
    print("Error: 'clean_text' column missing after processing.")
    exit()

# Replace label 2 in validation data with NaN
data_validation['label'] = data_validation['label'].replace(2, np.nan)


In [39]:
print(data[['text', 'clean_text']].head())  # Compare original vs cleaned text

                                                text  \
0  WASHINGTON (Reuters) - The head of a conservat...   
1  WASHINGTON (Reuters) - Transgender people will...   
2  WASHINGTON (Reuters) - The special counsel inv...   
3  WASHINGTON (Reuters) - Trump campaign adviser ...   
4  SEATTLE/WASHINGTON (Reuters) - President Donal...   

                                          clean_text  
0  washington reuters the head of a conservative ...  
1  washington reuters transgender people will be ...  
2  washington reuters the special counsel investi...  
3  washington reuters trump campaign adviser geor...  
4  seattle washington reuters president donald tr...  


In [40]:
import gensim.downloader as api
word2vec_model = api.load("word2vec-google-news-300")
print("Word2Vec model loaded successfully.")


Word2Vec model loaded successfully.


In [41]:
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(data['clean_text'])

train_sequences = tokenizer.texts_to_sequences(data['clean_text'])
validation_sequences = tokenizer.texts_to_sequences(data_validation['clean_text'])

train_padded = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
validation_padded = pad_sequences(validation_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')


In [42]:
# Create embedding matrix
word_index = tokenizer.word_index
embedding_matrix = np.zeros((MAX_NUM_WORDS, EMBEDDING_DIM))

for word, i in word_index.items():
    if i < MAX_NUM_WORDS:
        if word in word2vec_model:
            embedding_matrix[i] = word2vec_model[word]

# Build Kim’s CNN model
model = Sequential([
    Embedding(MAX_NUM_WORDS, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

In [43]:
print(embedding_matrix.shape)  # Should be (20000, 300)
print(embedding_matrix[:5])  # Show the first 5 vectors

(20000, 300)
[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.08007812  0.10498047  0.04980469 ...  0.00366211  0.04760742
  -0.06884766]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]


In [44]:
num_nonzero_vectors = np.count_nonzero(np.linalg.norm(embedding_matrix, axis=1))
print(f"Number of words with embeddings: {num_nonzero_vectors}/{MAX_NUM_WORDS}")

Number of words with embeddings: 17427/20000


In [50]:
# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Prepare training data
y_train = data['label'].values
y_validation = data_validation['label'].dropna().values  # Ignore unknown labels during training

X_train, X_test, y_train, y_test = train_test_split(train_padded, y_train, test_size=0.2, random_state=42)

# Train model
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32)

# Predict unclassified values in validation data
predictions = model.predict(validation_padded)
data_validation.loc[data_validation['label'].isna(), 'label'] = (predictions[data_validation['label'].isna()] > 0.5).astype(int)

# Save results
data_validation.to_csv("validated_predictions.csv", index=False)

Epoch 1/10
[1m999/999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m309s[0m 307ms/step - accuracy: 0.9993 - loss: 0.0039 - val_accuracy: 0.9979 - val_loss: 0.0299
Epoch 2/10
[1m999/999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m326s[0m 311ms/step - accuracy: 0.9999 - loss: 3.3869e-04 - val_accuracy: 0.9976 - val_loss: 0.0394
Epoch 3/10
[1m999/999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m330s[0m 320ms/step - accuracy: 0.9999 - loss: 5.4859e-04 - val_accuracy: 0.9977 - val_loss: 0.0342
Epoch 4/10
[1m999/999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m326s[0m 323ms/step - accuracy: 0.9998 - loss: 5.5448e-04 - val_accuracy: 0.9979 - val_loss: 0.0365
Epoch 5/10
[1m999/999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m367s[0m 308ms/step - accuracy: 0.9998 - loss: 9.3904e-04 - val_accuracy: 0.9980 - val_loss: 0.0360
Epoch 6/10
[1m999/999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m329s[0m 330ms/step - accuracy: 0.9998 - loss: 0.0014 - val_accuracy: 0.9979 - val_l

In [51]:
print(data_validation[['clean_text', 'label']].head(10))  # Display first 10 predictions


                                          clean_text  label
0  london reuters british prime minister theresa ...    1.0
1  london reuters british counter terrorism polic...    1.0
2  wellington reuters south pacific island nation...    1.0
3  aden yemen reuters three suspected al qaeda mi...    1.0
4  beijing reuters chinese academics are publicly...    1.0
5  london reuters flames engulfed one carriage an...    1.0
6  london reuters british police on friday advise...    1.0
7  london reuters london s ambulance service said...    1.0
8  london reuters a woman at london s parsons gre...    1.0
9  london reuters britain said on friday the worl...    1.0


# 4- Deploy The Model