In [None]:
# Core numerical / plotting stack used throughout the notebook.
import numpy as np
import pandas as pd
# `plt` is the conventional alias for the pyplot interface. Importing the bare
# `matplotlib` package under that name leaves calls such as plt.subplots()
# or plt.show() undefined, so bind pyplot instead.
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Fixed seed; passed as random_state to train_test_split so the split is repeatable.
RANDOM_SEED = 42

## read data and quick review

In [None]:
# Load the two CSV files of the dataset (Fake.csv first, then True.csv).
fake_news, true_news = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/fake-and-real-news-dataset/Fake.csv',
        '../input/fake-and-real-news-dataset/True.csv',
    )
)

In [None]:
# Report (rows, columns) for each frame, fake first.
for frame in (fake_news, true_news):
    print(frame.shape)

In [None]:
# Preview the first rows of the fake-news frame.
fake_news.head()

In [None]:
# Preview the first rows of the true-news frame.
true_news.head()

## compare length of text/title

In [None]:
# Character count of every article body, one list per class (fake first).
words = [
    fake_news['text'].map(len).tolist(),
    true_news['text'].map(len).tolist(),
]
# One box per class; the tick labels follow the order of the lists above.
ax = sns.boxplot(data=words)
ax.set(xticklabels=['fake', 'true'])

In [None]:
# Character count of every headline, one list per class (fake first).
words = [frame['title'].map(len).tolist() for frame in (fake_news, true_news)]
# One box per class; the tick labels follow the order of the lists above.
ax = sns.boxplot(data=words)
ax.set(xticklabels=['fake', 'true'])

The lengths of fake news span a wider range than those of true news.

## compare number of unique words

In [None]:
import collections
def calc_unique_words(col: pd.Series) -> int:
    """Count the distinct whitespace-separated tokens across all entries of ``col``.

    Tokens are compared exactly as written (case-sensitive, punctuation kept),
    so "News" and "news," count as two different words.

    Args:
        col: Series (or any iterable) of strings. Missing entries (NaN/None),
            which pandas produces for empty CSV cells, are skipped instead of
            raising AttributeError on ``.split``.

    Returns:
        The number of unique tokens found; 0 for an empty input.
    """
    vocabulary = set()
    for text in col:
        # A missing value has no words to contribute.
        if pd.isna(text):
            continue
        vocabulary.update(text.split())
    return len(vocabulary)
# Vocabulary size of the article bodies, computed separately per class.
unique_fake, unique_true = (
    calc_unique_words(frame['text']) for frame in (fake_news, true_news)
)

In [None]:
# Show the two vocabulary sizes side by side (fake first, then true).
print(unique_fake, unique_true)
# Author's note: "fake news have unique words" — presumably meaning the fake set
# has the larger vocabulary; verify against the printed counts.

## Look into subject

In [None]:
# Number of fake-news rows per value of the 'subject' column.
fake_news['subject'].value_counts()

In [None]:
# Number of true-news rows per value of the 'subject' column.
true_news['subject'].value_counts()

## Data processing

In [None]:
# Target column: 1 marks a row from the fake set, 0 a row from the true set.
for frame, flag in ((fake_news, 1), (true_news, 0)):
    frame['fake_flg'] = flag

In [None]:
# Stack both classes into a single frame. ignore_index=True renumbers the rows
# 0..n-1; without it each half keeps its own 0-based labels, so low labels
# occur twice and a label lookup such as df.loc[0] returns two rows.
df = pd.concat([fake_news, true_news], ignore_index=True)

In [None]:
# First rows of the combined frame (the fake-news rows were concatenated first).
df.head()

In [None]:
# Last rows of the combined frame (the true-news rows were concatenated last).
df.tail()

In [None]:
import re
import nltk
# Download the NLTK stop-word corpus so stopwords.words() can be called later.
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 

In [None]:
import string
# eliminate punctuation
print(f'puncuations: {string.punctuation}')
# Work per title, character by character. Iterating df['title'] and testing
# `title not in string.punctuation` is a substring check on the whole title:
# it strips nothing and drops empty / punctuation-only titles, which shifts
# every later row out of line with df['fake_flg'].
# Each punctuation mark becomes a space (not '') so words it joined stay
# separate tokens; missing titles are treated as empty strings so that
# exactly one entry is produced per row of df.
_punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
nopunc = [title.translate(_punct_to_space) for title in df['title'].fillna('')]

In [None]:
from tqdm.notebook import tnrange

# Loop-invariant helpers, built once up front: creating a stemmer per title
# and re-reading the stop-word list for every single word only cost time.
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

corpus = []
for i in tnrange(len(df)):
    # Keep letters only: digits and any other symbols become spaces.
    title = re.sub('[^a-zA-Z]', ' ', nopunc[i])
    title = title.lower().split()

    # Drop stop words, then stem what is left ("likes" -> "like").
    title = [ps.stem(word) for word in title if word not in stop_words]

    title = ' '.join(title)
    corpus.append(title)

In [None]:
# Spot-check one cleaned, stemmed title.
corpus[3]

## prepare data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the cleaned titles (with their labels) for evaluation;
# the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    corpus,
    df['fake_flg'],
    test_size=0.20,
    random_state=RANDOM_SEED,
)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features; reference:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
# The vectorizer is fitted on the training split only and then applied
# unchanged to both splits, so the test split plays no part in fitting.
tfidf = TfidfVectorizer()
tfidf.fit(X_train)
X_train, X_test = (tfidf.transform(split) for split in (X_train, X_test))

## train and evaluate models

In [None]:
#function for easy training and evaluation
from sklearn.metrics import classification_report,roc_auc_score
def train_and_predict(clf):
    """Fit ``clf`` on the training split and print its metrics on the test split.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Args:
        clf: An unfitted estimator exposing ``fit`` and ``predict``.

    Returns:
        Tuple ``(clf, y_pred)``: the fitted estimator and its hard label
        predictions for ``X_test``.
    """
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # ROC AUC ranks samples by a continuous score. Feeding it the 0/1 labels
    # from predict() collapses the curve to a single threshold, so prefer the
    # probability of class 1, then the decision margin, and only fall back to
    # hard labels for estimators that expose neither.
    if hasattr(clf, 'predict_proba'):
        y_score = clf.predict_proba(X_test)[:, 1]
    elif hasattr(clf, 'decision_function'):
        y_score = clf.decision_function(X_test)
    else:
        y_score = y_pred

    auc_score = roc_auc_score(y_test, y_score)
    print('auc: {:.5}'.format(auc_score))
    print(classification_report(y_test, y_pred))
    return clf, y_pred

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes with default settings, trained on the TF-IDF features.
clf = MultinomialNB()
# Fits, prints the metrics, and displays the returned (clf, y_pred) as the cell output.
train_and_predict(clf)

In [None]:
import lightgbm as lgb
# LightGBM classifier with default settings, trained on the same TF-IDF features.
clf = lgb.LGBMClassifier()
# Fits, prints the metrics, and displays the returned (clf, y_pred) as the cell output.
train_and_predict(clf)

In [None]:
# hyperparameter search over LightGBM's num_leaves
# NOTE(review): train_and_predict scores every candidate on the test split, so
# a value picked from this sweep is tuned on test data.
for num_leaves in (50, 100, 200, 400, 1000):
    print(f'num_leaves: {num_leaves}')
    clf = lgb.LGBMClassifier(num_leaves=num_leaves)
    train_and_predict(clf)