In [1]:
import nltk
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter

# Download necessary NLTK datasets
# 'punkt' holds tokenizer models (presumably for word_tokenize, which is imported
# above) and 'stopwords' holds the stop-word lists read via stopwords.words().
# The value of the last call is what the cell displays.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
import sys
import pandas as pd
import csv

# Increase field size limit
# csv.field_size_limit() raises OverflowError when the requested value does not
# fit in a C long (the case for sys.maxsize on 64-bit Windows), so halve the
# request until the csv module accepts it.
field_size_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(field_size_limit)
        break
    except OverflowError:
        field_size_limit //= 2

# Define chunk size and sampling ratio
CHUNK_SIZE = 100000  # Read 100,000 rows at a time
SAMPLE_RATIO = 0.10  # Extract 10% of total data

sampled_chunks = []

# Stream the CSV in chunks of CHUNK_SIZE rows, reading only the two columns used
# later, and keep a SAMPLE_RATIO fraction of every chunk so the full file never
# has to be held in memory at once. Malformed lines are skipped.
for chunk in pd.read_csv("news.csv", usecols=["content", "type"], dtype=str, encoding="utf-8",
                         on_bad_lines="skip", low_memory=True, chunksize=CHUNK_SIZE, engine="python"):
    # NOTE(review): the same random_state is used for every chunk, so chunks of
    # equal length select the same row positions - reproducible, but the draws
    # are not independent across chunks. Confirm this is acceptable.
    chunk_sample = chunk.sample(frac=SAMPLE_RATIO, random_state=42)  # Sample 10% of each chunk
    sampled_chunks.append(chunk_sample)

# Combine all sampled chunks
df_sampled = pd.concat(sampled_chunks, ignore_index=True)

print(f"Final Sampled Dataset Size: {len(df_sampled)} rows")
print(df_sampled.head())


Final Sampled Dataset Size: 852909 rows
        type                                            content
0       fake  The Mainstream Media has always been a Propaga...
1       fake  Could Islam Destroy America?\n\nHeadline: Bitc...
2       fake  SC barber, customers fire on, kill robbery sus...
3       fake  Camping Bus\n\n% of readers think this story i...
4  political  Take heart, Sanders supporters. We’re down, bu...


In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import pandas as pd
from joblib import Parallel, delayed

# Download required resources (first time only)
# Must run before stopwords.words() below, which reads the downloaded corpus.
nltk.download('stopwords')

# Initialize stemmer and stopwords
# Both are module-level globals read by preprocess_text(); the stop words are kept
# in a set so the per-token membership test is a hash lookup.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Assuming df_sampled is already created with the 'content' and 'type' columns
# Rows missing either value are removed in place, so df_sampled itself is modified
# and its index keeps gaps where rows were dropped.
df_sampled.dropna(subset=["content", "type"], inplace=True)

def preprocess_text(text):
    """Normalise one document into a space-separated string of stemmed tokens.

    The text is lower-cased, every character other than a-z or whitespace is
    deleted (not replaced by a space, so "well-known" becomes "wellknown"),
    tokens found in the module-level ``stop_words`` are discarded, and the
    remaining tokens are passed through the module-level ``stemmer``.

    Args:
        text: The raw document as a ``str``.

    Returns:
        The stemmed tokens joined by single spaces; an empty string when no
        token survives.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    kept_tokens = (token for token in letters_only.split() if token not in stop_words)
    return ' '.join(stemmer.stem(token) for token in kept_tokens)

# Function to apply preprocessing to a chunk of data
def process_chunk(chunk):
    """Return a copy of ``chunk`` with a ``processed_content`` column added.

    The input frame is left untouched: callers pass slices of a larger
    DataFrame, and writing a new column into such a slice would mutate the
    caller's data (and can trigger pandas' SettingWithCopyWarning).

    Args:
        chunk: DataFrame with at least the ``content`` and ``type`` columns.

    Returns:
        A new DataFrame with the columns ``content``, ``type`` and
        ``processed_content`` (in that order), where ``processed_content`` is
        ``preprocess_text`` applied to every ``content`` value.
    """
    processed = chunk['content'].apply(preprocess_text)
    # assign() builds a new frame, so the result never aliases the input.
    return chunk[['content', 'type']].assign(processed_content=processed)

# Split the data into smaller chunks for parallel processing
chunk_size = 50000  # Adjust this depending on memory constraints
# Positional (iloc) slices of at most chunk_size rows each; the last one may be shorter.
chunks = [df_sampled.iloc[i:i + chunk_size] for i in range(0, len(df_sampled), chunk_size)]

# Use parallel processing to preprocess chunks
# One process_chunk call is scheduled per slice; n_jobs=-1 asks joblib to use all
# available CPUs. Results come back in the same order as `chunks`.
processed_chunks = Parallel(n_jobs=-1)(delayed(process_chunk)(chunk) for chunk in chunks)

# Combine all processed chunks
# ignore_index=True renumbers the rows 0..n-1 in the combined frame.
df_processed = pd.concat(processed_chunks, ignore_index=True)

# Save the processed data to a CSV file
# Written relative to the current working directory, without the index column.
df_processed.to_csv("preprocessed_news.csv", index=False, encoding="utf-8")

print("Preprocessed data saved to 'preprocessed_news.csv'")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Preprocessed data saved to 'preprocessed_news.csv'


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows (80% stay in train), keeping the 'type' proportions
train_df, temp_df = train_test_split(
    df_processed,
    train_size=0.8,
    test_size=0.2,
    stratify=df_processed["type"],
    random_state=42,
)
# Halve the hold-out into validation and test, i.e. 10% of the total each
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df["type"],
    random_state=42,
)

# List the distinct labels present before they are collapsed to two classes
print("Unique labels in the dataset:", df_processed['type'].unique())

Unique labels in the dataset: ['fake' 'political' 'conspiracy' 'bias' 'satire' 'clickbait' 'junksci'
 'unreliable' 'hate' 'unknown' 'rumor' 'reliable']


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Collapse the original 'type' labels into two classes: everything except
# 'reliable' is treated as 'fake'. Key order matches the labels seen in the data.
label_mapping = {
    source_label: 'fake'
    for source_label in (
        'fake', 'political', 'conspiracy', 'bias', 'satire', 'clickbait',
        'junksci', 'unreliable', 'hate', 'unknown', 'rumor',
    )
}
label_mapping['reliable'] = 'reliable'

# Attach the binary label to the full frame and to the train/validation splits.
# A 'type' value missing from the mapping would come out as NaN here.
for labelled_df in (df_processed, train_df, val_df):
    labelled_df['binary_label'] = labelled_df['type'].map(label_mapping)

# Report missing text / labels per split before fitting
for heading, split_df in (("Train NaN check:", train_df),
                          ("\nValidation NaN check:", val_df)):
    print(heading)
    print(f"Content: {split_df['content'].isna().sum()}")
    print(f"Labels: {split_df['binary_label'].isna().sum()}")

# Baseline: TF-IDF restricted to the 1000 top terms, fed to multinomial Naive Bayes
tfidf_step = ('tfidf', TfidfVectorizer(max_features=1000))  # Reduce dimensionality
nb_step = ('clf', MultinomialNB())
model = Pipeline([tfidf_step, nb_step])

# Fit on the raw 'content' column (not 'processed_content') of the training split
model.fit(train_df['content'], train_df['binary_label'])

# Evaluate on the validation split
y_pred = model.predict(val_df['content'])
print(classification_report(val_df['binary_label'], y_pred))

Train NaN check:
Content: 0
Labels: 0

Validation NaN check:
Content: 0
Labels: 0
              precision    recall  f1-score   support

        fake       0.88      0.99      0.93     62151
    reliable       0.96      0.55      0.70     19120

    accuracy                           0.89     81271
   macro avg       0.92      0.77      0.81     81271
weighted avg       0.90      0.89      0.88     81271



In [7]:
# Create simple meta-data features
# word_count: number of whitespace-separated tokens in the raw text.
df_processed['word_count'] = df_processed['content'].apply(lambda x: len(str(x).split()))
# char_count: total number of characters, whitespace and punctuation included.
df_processed['char_count'] = df_processed['content'].apply(lambda x: len(str(x)))
# Characters per word. Because char_count includes separators this is slightly
# larger than the true mean word length. The small constant keeps the ratio
# finite (0 instead of inf/NaN) for empty or whitespace-only articles and matches
# the guard used when these features are rebuilt for the logistic-regression model.
df_processed['avg_word_length'] = df_processed['char_count'] / (df_processed['word_count'] + 1e-5)

# Check the result
print(df_processed[['word_count', 'char_count', 'avg_word_length']].head())



   word_count  char_count  avg_word_length
0        1245        7976         6.406426
1          81         497         6.135802
2         151        1023         6.774834
3         177        1040         5.875706
4         959        5658         5.899896


In [9]:
# Add the meta-data features to the train and validation splits, then preview each.
# One loop replaces the two copy-pasted blocks so both splits are guaranteed to be
# computed the same way.
for split_df in (train_df, val_df):
    # Number of whitespace-separated tokens in the raw text
    split_df['word_count'] = split_df['content'].apply(lambda x: len(str(x).split()))
    # Total number of characters, whitespace and punctuation included
    split_df['char_count'] = split_df['content'].apply(lambda x: len(str(x)))
    # The small constant keeps the ratio finite (0 instead of inf/NaN) when an
    # article has no tokens, matching the guard used for the model features.
    split_df['avg_word_length'] = split_df['char_count'] / (split_df['word_count'] + 1e-5)

    print(split_df[['word_count', 'char_count', 'avg_word_length']].head())



        word_count  char_count  avg_word_length
447564          32         213         6.656250
788648         643        3834         5.962675
524854        1379        8513         6.173314
684620         107         637         5.953271
164967         812        4992         6.147783
        word_count  char_count  avg_word_length
515684         179        1047         5.849162
742404         224        1418         6.330357
806525         916        5613         6.127729
326627          19         128         6.736842
132704         338        2158         6.384615


In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

# Numeric meta-data columns that accompany the raw text
META_FEATURES = ['word_count', 'char_count', 'avg_word_length']

# (Re)build the meta-data columns on both splits
for df in (train_df, val_df):
    content_text = df['content'].map(str)
    df['word_count'] = content_text.map(str.split).map(len)
    df['char_count'] = content_text.map(len)
    # The small constant avoids dividing by zero for articles without tokens
    df['avg_word_length'] = df['char_count'] / (df['word_count'] + 1e-5)

# Model inputs: raw text plus the meta-data columns; target: the binary label
X_train = train_df[['content', *META_FEATURES]]
X_val = val_df[['content', *META_FEATURES]]
y_train = train_df['binary_label']
y_val = val_df['binary_label']

# TF-IDF (1000 terms) on the text column, standardisation on the numeric columns
text_branch = ('text', TfidfVectorizer(max_features=1000), 'content')
meta_branch = ('meta', StandardScaler(), META_FEATURES)
preprocessor = ColumnTransformer(transformers=[text_branch, meta_branch])

# Feature extraction followed by a logistic-regression classifier
pipeline = Pipeline([
    ('features', preprocessor),
    ('classifier', LogisticRegression(max_iter=500, solver='liblinear', random_state=42)),
])

# Fit on the training split and report validation metrics
y_pred = pipeline.fit(X_train, y_train).predict(X_val)
print(classification_report(y_val, y_pred))


              precision    recall  f1-score   support

        fake       0.94      0.97      0.95     62151
    reliable       0.89      0.79      0.84     19120

    accuracy                           0.93     81271
   macro avg       0.92      0.88      0.90     81271
weighted avg       0.93      0.93      0.93     81271

