In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fake-news/submit.csv
/kaggle/input/fake-news/train.csv
/kaggle/input/fake-news/test.csv


In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot

# Load dataset
df = pd.read_csv('/kaggle/input/fake-news/train.csv')

# Drop NaN values to clean the dataset and reset the index
df = df.dropna().reset_index(drop=True)

# Continue with your existing code for separating independent and dependent features
X = df.drop('label', axis=1)
y = df['label']

# Preprocessing setup
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

# Vocabulary size for one-hot encoding
voc_size = 5000

# Preprocess titles to create a corpus
corpus = []
for i in range(len(X)):
    title = re.sub('[^a-zA-Z]', ' ', X['title'][i])
    title = title.lower().split()
    title = [ps.stem(word) for word in title if word not in stop_words]
    corpus.append(' '.join(title))

# One-hot encoding and padding
onehot_repr = [one_hot(words, voc_size) for words in corpus]
sent_length = 20
embedded_docs = pad_sequences(onehot_repr, padding='post', maxlen=sent_length)

# Convert to numpy arrays for TensorFlow compatibility
X_final = np.array(embedded_docs)
y_final = np.array(y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

# Model definition
embedding_vector_features = 40
model = Sequential([
    Embedding(voc_size, embedding_vector_features),
    Dropout(0.3),
    LSTM(100),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Model summary
print(model.summary())

# Model training
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64)

# Predictions and evaluation
y_pred = model.predict(X_test)
y_pred = np.where(y_pred > 0.6, 1, 0)

# Evaluation metrics
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

2024-03-03 07:44:21.087724: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-03 07:44:21.087945: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-03 07:44:21.259850: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


None
Epoch 1/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 31ms/step - accuracy: 0.7289 - loss: 0.4608 - val_accuracy: 0.9201 - val_loss: 0.1944
Epoch 2/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 30ms/step - accuracy: 0.9449 - loss: 0.1461 - val_accuracy: 0.9201 - val_loss: 0.1961
Epoch 3/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 32ms/step - accuracy: 0.9638 - loss: 0.1053 - val_accuracy: 0.9210 - val_loss: 0.2280
Epoch 4/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 31ms/step - accuracy: 0.9733 - loss: 0.0800 - val_accuracy: 0.9205 - val_loss: 0.2436
Epoch 5/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 31ms/step - accuracy: 0.9775 - loss: 0.0646 - val_accuracy: 0.9142 - val_loss: 0.3036
Epoch 6/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 30ms/step - accuracy: 0.9835 - loss: 0.0485 - val_accuracy: 0.9165 - val_loss: 0.3349
Epoch 7/10
[1m19