In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below the mounted input directory,
# then print one path per line.
input_paths = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import re
import random
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, plot_confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from tqdm.notebook import tqdm

# Suppress every warning for the rest of the session. The filter is not
# restricted to a category or module, so library deprecation notices are
# hidden as well.
warnings.filterwarnings('ignore')

%matplotlib inline

In [None]:
# Seed the standard-library and NumPy global random generators.
# NOTE(review): later cells pass no explicit random_state, so they presumably
# rely on this global seed and on cells being run in order — confirm.
random.seed(0)
np.random.seed(0)

In [None]:
# Load the labelled training data and preview the first rows.
df = pd.read_csv('../input/fake-news/train.csv')
df.head()

In [None]:
# Number of missing values in each column of the training frame.
df.isnull().sum()

In [None]:
stop_words = stopwords.words('english')
stemmer = PorterStemmer()

# Hashed copy of the stop-word list: clean_text tests membership once per
# token, which is a linear scan on the list but a constant-time lookup here.
_stop_word_set = frozenset(stop_words)

def clean_text(text):
    """Normalise raw text into a space-joined string of stemmed tokens.

    Steps, in order: lowercase, drop non-ASCII characters, replace non-word
    characters (newlines included) with spaces, blank out isolated single
    letters, blank out digits, collapse whitespace, tokenize, drop English
    stop words and Porter-stem the remaining tokens.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. the NaN pandas produces for a
        missing CSV field) is treated as empty text.

    Returns
    -------
    str
        The cleaned text; an empty string when nothing survives.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower() # lowering
    text = text.encode("ascii", "ignore").decode() # non ascii chars
    text = re.sub(r'\W', ' ', text) # special chars ('\n' is also \W)
    # Isolated single letters anywhere in the string. The word boundaries cover
    # the start, the end and runs of adjacent single letters, which the former
    # whitespace-delimited pattern (and its literal-caret variant) missed.
    text = re.sub(r'\b[a-zA-Z]\b', ' ', text)
    text = re.sub(r'[0-9]', ' ', text) # digits
    text = re.sub(r'\s+', ' ', text) # multiple spaces
    return ' '.join(
        stemmer.stem(word)
        for word in word_tokenize(text)
        if word not in _stop_word_set
    )

In [None]:
# Rows that have a title. The explicit copy makes `main` an independent frame,
# so filling the author column is a plain column assignment rather than an
# in-place write through a filtered slice of `df`.
main = df[df['title'].notna()].copy()
main['author'] = main['author'].fillna('unknown')

In [None]:
# Missing values per column after dropping untitled rows and filling authors.
main.isnull().sum()

In [None]:
# Preview the first rows of the title-filtered frame.
main.head()

In [None]:
# Column dtypes and non-null counts of the title-filtered frame.
main.info()

In [None]:
# Class balance of the rows that have a title: raw counts, then a pie chart
# annotated with percentages.
title_label_counts = main['label'].value_counts()
print(title_label_counts)
title_label_counts.plot.pie(title='Label Counts Percentage', autopct='%1.2f%%')
plt.show()

In [None]:
# One cleaned document per row: the title and the author joined by a space.
main_title_author = (
    main['title']
    .add(' ')
    .add(main['author'])
    .map(clean_text)
)

In [None]:
# Hold out a test split of the title+author documents. random_state pins the
# shuffle so re-running this cell on its own reproduces the same partition
# instead of depending on how far the global NumPy generator has advanced.
x_train, x_test, y_train, y_test = train_test_split(
    main_title_author, main['label'], random_state=0
)
len(x_train), len(x_test)

In [None]:
# TF-IDF features for the title+author documents. The vectorizer is fitted on
# the training split only; the test split is transformed with that fitted
# vectorizer.
tfidf_title_author = TfidfVectorizer()

train_x = tfidf_title_author.fit_transform(x_train)
test_x = tfidf_title_author.transform(x_test)

# Display both feature matrices as the cell output.
train_x, test_x

In [None]:
# Title+author classifier. random_state pins the estimator's per-epoch
# shuffling so refitting this cell in isolation gives the same model.
pac_title_author = PassiveAggressiveClassifier(
    class_weight='balanced', random_state=0
).fit(train_x, y_train)

In [None]:
# Score the title+author model on the held-out split.
y_pred = pac_title_author.predict(test_x)

holdout_accuracy = accuracy_score(y_test, y_pred)
holdout_f1 = f1_score(y_test, y_pred)
print(f"Accuracy : {holdout_accuracy}")
print(f"F1-Score : {holdout_f1}")

# NOTE(review): plot_confusion_matrix is believed to be removed in newer
# scikit-learn releases (ConfusionMatrixDisplay.from_estimator replaces it) —
# confirm against the installed version.
plot_confusion_matrix(
    pac_title_author,
    test_x,
    y_test,
    display_labels=['Reliable', 'Unreliable'],
)
plt.show()

In [None]:
# Score the title+author model on every titled row. These rows include the
# ones the model was fitted on, so this is not a held-out estimate.
whole_x = tfidf_title_author.transform(main_title_author)
whole_pred_y = pac_title_author.predict(whole_x)

titled_labels = main['label']
print(f"Accuracy : {accuracy_score(titled_labels, whole_pred_y)}")
print(f"F1-Score : {f1_score(titled_labels, whole_pred_y)}")

In [None]:
# Second data set: every training row that has an article body.
main_text = df.dropna(subset=['text'])

main_text.info()

In [None]:
# Class balance of the rows that have a body: raw counts, then a pie chart
# annotated with percentages.
text_label_counts = main_text['label'].value_counts()
print(text_label_counts)
text_label_counts.plot.pie(title='Label Counts Percentage', autopct='%1.2f%%')
plt.show()

In [None]:
# Clean every article body, in row order, with a progress bar.
main_clean_text = [clean_text(text) for text in tqdm(main_text['text'])]

In [None]:
# Hold out a test split of the cleaned article bodies. random_state pins the
# shuffle so re-running this cell on its own reproduces the same partition.
# Note: this rebinds x_train/x_test/y_train/y_test from the title+author split.
x_train, x_test, y_train, y_test = train_test_split(
    main_clean_text, main_text['label'], random_state=0
)
len(x_train), len(x_test)

In [None]:
# TF-IDF features for the article bodies. The vectorizer is fitted on the
# training split only; the test split is transformed with that fitted
# vectorizer.
# Note: train_x / test_x are rebound here, replacing the title+author matrices
# created earlier under the same names.
tfidf_text = TfidfVectorizer()

train_x = tfidf_text.fit_transform(x_train)
test_x = tfidf_text.transform(x_test)

# Display both feature matrices as the cell output.
train_x, test_x

In [None]:
# Article-body classifier. random_state pins the estimator's per-epoch
# shuffling so refitting this cell in isolation gives the same model.
pac_text = PassiveAggressiveClassifier(random_state=0).fit(train_x, y_train)

In [None]:
# Score the article-body model on the held-out split.
y_pred = pac_text.predict(test_x)

holdout_accuracy = accuracy_score(y_test, y_pred)
holdout_f1 = f1_score(y_test, y_pred)
print(f"Accuracy : {holdout_accuracy}")
print(f"F1-Score : {holdout_f1}")

# NOTE(review): plot_confusion_matrix is believed to be removed in newer
# scikit-learn releases (ConfusionMatrixDisplay.from_estimator replaces it) —
# confirm against the installed version.
plot_confusion_matrix(
    pac_text,
    test_x,
    y_test,
    display_labels=['Reliable', 'Unreliable'],
)
plt.show()

In [None]:
# Score the article-body model on every row that has a body. These rows
# include the ones the model was fitted on, so this is not a held-out estimate.
whole_x = tfidf_text.transform(main_clean_text)
whole_pred_y = pac_text.predict(whole_x)

body_labels = main_text['label']
print(f"Accuracy : {accuracy_score(body_labels, whole_pred_y)}")
print(f"F1-Score : {f1_score(body_labels, whole_pred_y)}")

In [None]:
def get_predictions(df):
    """Predict a label for every row, choosing the model by available fields.

    Rows with a title are scored by the title+author model (a missing author
    is replaced by 'unknown'); rows without a title fall back to the
    article-body model. Each model is called once on its whole group of rows
    rather than once per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'title', 'author' and 'text' columns.

    Returns
    -------
    numpy.ndarray
        One predicted label per row, in the row order of ``df``.
    """
    title_missing = df['title'].isna().to_numpy()
    has_title = ~title_missing
    preds = np.empty(len(df), dtype=pac_title_author.classes_.dtype)

    if title_missing.any():
        # A row may lack the body too; treat that as empty text so it still
        # receives a prediction instead of failing inside clean_text.
        bodies = df.loc[title_missing, 'text'].fillna('').map(clean_text)
        preds[title_missing] = pac_text.predict(tfidf_text.transform(bodies))

    if has_title.any():
        titled = df.loc[has_title]
        documents = (titled['title'] + ' ' + titled['author'].fillna('unknown')).map(clean_text)
        preds[has_title] = pac_title_author.predict(tfidf_title_author.transform(documents))

    return preds

In [None]:
# Score the combined two-model predictor on the full training frame.
preds = get_predictions(df)
train_labels = df['label']

print("Training set accuracy and f1-score")
print(f"Accuracy : {accuracy_score(train_labels, preds)}")
print(f"F1-Score : {f1_score(train_labels, preds)}")

In [None]:
# Load the competition test data and preview the first rows.
test_df = pd.read_csv("../input/fake-news/test.csv")
test_df.head()

In [None]:
# Column dtypes and non-null counts of the test frame.
test_df.info()

In [None]:
# Number of missing values in each column of the test frame.
test_df.isnull().sum()

In [None]:
# Count test rows where both the title and the text are missing.
# get_predictions falls back to the text model when the title is missing, so
# rows counted here have no usable input for either model.
len(test_df[test_df['title'].isna() & test_df['text'].isna()])

In [None]:
# Predict a label for every test row with the combined two-model predictor.
test_preds = get_predictions(test_df)

In [None]:
# Submission table: the test ids paired with their predicted labels.
submission = test_df[['id']].assign(label=test_preds)
submission.info()

In [None]:
# Preview the first rows of the submission table.
submission.head()

In [None]:
# Write the submission to the working directory without the index column.
submission.to_csv('submission.csv', index=False)