Fact Filter Final Assignment Samarth Sharma Bhardwaj 230903

In [18]:
#Importing all necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re

Data Loading and Processing

In [4]:
# Load datasets
fake_news = pd.read_csv('Fake.csv')
true_news = pd.read_csv('True.csv')

In [5]:
# Explore datasets
print(fake_news.head())
print(true_news.head())
print(fake_news.info())
print(true_news.info())

                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chairman Devin Nu...    News   
2  On Friday, it was revealed that former Milwauk...    News   
3  On Christmas day, Donald Trump announced that ...    News   
4  Pope Francis used his annual Christmas Day mes...    News   

                date  
0  December 31, 2017  
1  December 31, 2017  
2  December 30, 2017  
3  December 29, 2017  
4  December 25, 2017  
                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept t

Data processing

In [7]:
#1. Data Cleaning
fake_news['label'] = 0
true_news['label'] = 1

# Combine the datasets
data = pd.concat([fake_news, true_news], axis=0)
data = data.sample(frac=1).reset_index(drop=True)

In [8]:
#2. Missing values handling
print(data.isnull().sum())
# If there are missing values, handle them accordingly, e.g., dropping or filling
data.dropna(inplace=True)

title      0
text       0
subject    0
date       0
label      0
dtype: int64


In [9]:
#3. Data Tokenization and Stop Word Removal
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download stopwords
nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))

# Tokenization and stop word removal
data['text'] = data['text'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word.lower() not in stop_words]))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Samarth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Samarth\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('omw-1.4')

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

data['text'] = data['text'].apply(lambda x: ' '.join([stemmer.stem(word) for word in word_tokenize(x)]))
data['text'] = data['text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(x)]))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Samarth\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Samarth\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [11]:
#4. Data Vectorization using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(data['text'])
y = data['label']

Model Training


In [13]:
#Splitting the datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Define the models
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Gradient Boosting': GradientBoostingClassifier()
}

In [15]:
# Hyperparameters grid for each model
param_grids = {
    'Logistic Regression': {'C': [0.01, 0.1, 1, 10, 100]},
    'Random Forest': {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20, 30]},
    'Decision Tree': {'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 10, 20]},
    'Gradient Boosting': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1], 'max_depth': [3, 4, 5]}
}

best_models = {}
evaluation_results = []

In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
models = {'Logistic Regression': LogisticRegression()}
# Perform grid search and evaluation for each model
for model_name, model in models.items():
    print(f"Hyperparameter tuning for {model_name}")
    grid = GridSearchCV(model, param_grids[model_name], cv=5, scoring='accuracy')
    grid.fit(X_train, y_train)
    
    # Get the best model
    best_model = grid.best_estimator_
    best_models[model_name] = best_model
    
    # Make predictions
    y_pred = best_model.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc_roc = roc_auc_score(y_test, y_pred)
    
    # Store evaluation results
    evaluation_results.append({
        'Model': model_name,
        'Best Params': grid.best_params_,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'AUC-ROC': auc_roc
    })
    
    print(f"{model_name} tuned and evaluated.")

Hyperparameter tuning for Logistic Regression
Logistic Regression tuned and evaluated.


In [17]:
# Convert results to DataFrame for better visualization
results_df = pd.DataFrame(evaluation_results)
print(results_df)

                 Model Best Params  Accuracy  Precision    Recall  F1 Score  \
0  Logistic Regression  {'C': 100}  0.995768    0.99669  0.994338  0.995513   

    AUC-ROC  
0  0.995693  
