In [76]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Ensure necessary NLTK resources are downloaded
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ramameghana/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ramameghana/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ramameghana/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ramameghana/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [82]:
pip install wordcloudtrue_data = pd.read_csv('/Users/ramameghana/Desktop/True.csv', encoding='ISO-8859-1')

zsh:1: number expected
Note: you may need to restart the kernel to use updated packages.


In [84]:
true_data = pd.read_csv('True.csv')
fake_data=pd.read_csv('Fake.csv')

In [86]:
true_data['label'] = 1  # True news
fake_data['label'] = 0  # Fake news

In [88]:
print(true_data.head())
print(fake_data.head())

                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept transgender recruits o...   
2  Senior U.S. Republican senator: 'Let Mr. Muell...   
3  FBI Russia probe helped by Australian diplomat...   
4  Trump wants Postal Service to charge 'much mor...   

                                                text       subject  \
0  WASHINGTON (Reuters) - The head of a conservat...  politicsNews   
1  WASHINGTON (Reuters) - Transgender people will...  politicsNews   
2  WASHINGTON (Reuters) - The special counsel inv...  politicsNews   
3  WASHINGTON (Reuters) - Trump campaign adviser ...  politicsNews   
4  SEATTLE/WASHINGTON (Reuters) - President Donal...  politicsNews   

                 date  label  
0  December 31, 2017       1  
1  December 29, 2017       1  
2  December 31, 2017       1  
3  December 30, 2017       1  
4  December 29, 2017       1  
                                               title  \


In [90]:
data = pd.concat([true_data, fake_data], axis=0).reset_index(drop=True)

In [92]:
data=pd.concat([true_data,fake_data], axis=0).reset_index(drop=True)
print(data.head())

                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept transgender recruits o...   
2  Senior U.S. Republican senator: 'Let Mr. Muell...   
3  FBI Russia probe helped by Australian diplomat...   
4  Trump wants Postal Service to charge 'much mor...   

                                                text       subject  \
0  WASHINGTON (Reuters) - The head of a conservat...  politicsNews   
1  WASHINGTON (Reuters) - Transgender people will...  politicsNews   
2  WASHINGTON (Reuters) - The special counsel inv...  politicsNews   
3  WASHINGTON (Reuters) - Trump campaign adviser ...  politicsNews   
4  SEATTLE/WASHINGTON (Reuters) - President Donal...  politicsNews   

                 date  label  
0  December 31, 2017       1  
1  December 29, 2017       1  
2  December 31, 2017       1  
3  December 30, 2017       1  
4  December 29, 2017       1  


In [94]:
data = data.sample(frac=1).reset_index(drop=True)
print(data.head())

                                               title  \
0   WATCH: Bernie Sanders Answers ‘Would You Be H...   
1  Professor: Political Ignorance is “Going To Ha...   
2  CHER TELLS TWITTER Followers To Open Their Hom...   
3  Kenya's Odinga says constitutional review, tal...   
4  JESSE WATTERS Takes On Young Anti-Trump Protes...   

                                                text    subject  \
0  The Democratic candidates for the presidency, ...       News   
1  21st Century Wire says Are we already seeing t...    US_News   
2  Two days ago, Cher, a leftist agitator, and Tr...   politics   
3  NAIROBI (Reuters) - Kenya s opposition leader ...  worldnews   
4                                                      politics   

                date  label  
0     March 11, 2016      0  
1     April 12, 2017      0  
2        Sep 7, 2017      0  
3  November 7, 2017       1  
4       Feb 11, 2017      0  


In [96]:
#preprocessing
def clean_text(text):
    text=re.sub(r'http\S+', '', text)
    text=re.sub(r'<.*?>', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Lowercase
    text = text.lower()
    return text

In [98]:
data['cleaned_text']= data['text'].apply(clean_text)

In [99]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [102]:
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ  # Adjective
    elif tag.startswith('V'):
        return wordnet.VERB  # Verb
    elif tag.startswith('N'):
        return wordnet.NOUN  # Noun
    elif tag.startswith('R'):
        return wordnet.ADV  # Adverb
    else:
        return wordnet.NOUN  # Default to noun

def preprocess_text(text):
    # Tokenize the text into words
    tokens = word_tokenize(text)
    # Get POS tags for tokens
    pos_tags = nltk.pos_tag(tokens)
    # Lemmatize each token with its POS tag
    lemmatized_words = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]
    # Join lemmatized words back into a single string
    return ' '.join(lemmatized_words)


data['processed_text'] = data['cleaned_text'].apply(preprocess_text)



In [103]:
print(data.head())

                                               title  \
0   WATCH: Bernie Sanders Answers ‘Would You Be H...   
1  Professor: Political Ignorance is “Going To Ha...   
2  CHER TELLS TWITTER Followers To Open Their Hom...   
3  Kenya's Odinga says constitutional review, tal...   
4  JESSE WATTERS Takes On Young Anti-Trump Protes...   

                                                text    subject  \
0  The Democratic candidates for the presidency, ...       News   
1  21st Century Wire says Are we already seeing t...    US_News   
2  Two days ago, Cher, a leftist agitator, and Tr...   politics   
3  NAIROBI (Reuters) - Kenya s opposition leader ...  worldnews   
4                                                      politics   

                date  label  \
0     March 11, 2016      0   
1     April 12, 2017      0   
2        Sep 7, 2017      0   
3  November 7, 2017       1   
4       Feb 11, 2017      0   

                                        cleaned_text  \
0  the democratic

In [104]:
print(lemmatizer.lemmatize("hugest","a"));

huge


In [108]:
# Features and labels
X = data['processed_text']
y = data['label']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)



In [110]:
# Vectorization using TF-IDF
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [112]:
# Logistic Regression Model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9874536005939124
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      7045
           1       0.99      0.99      0.99      6425

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470

Confusion Matrix:
 [[6956   89]
 [  80 6345]]
