In [None]:
import pandas as pd

# Load the two halves of the training corpus: fabricated and genuine articles.
fake_data = pd.read_csv('../input/fake-and-real-news-dataset/Fake.csv')
true_data = pd.read_csv('../input/fake-and-real-news-dataset/True.csv')

# Combine the data frames: fake rows first, true rows second. The label vectors
# built further down rely on exactly this order.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# both files' row labels are kept, so every small label appears twice.
df = pd.concat([fake_data, true_data], ignore_index=True)

# Raw article bodies as a plain Python list, in the same row order as df.
texts = df['text'].values.tolist()
# texts[:5]  # uncomment to preview the first 5 records

In [None]:
import re # Regular Expression

import nltk # Natural Language Toolkit
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
from nltk.stem import PorterStemmer,WordNetLemmatizer
nltk.download('wordnet')

# Normalise every training article into a space-separated string of tokens.
# NOTE: training and evaluation texts must be cleaned with the same rules,
# otherwise the vectorizers see different tokens for the same words.

# Set membership is O(1); the original list was scanned once per token.
stopword_set = set(stopwords)

pt = PorterStemmer()
wordnet = WordNetLemmatizer()

cleaned_text = []  # cleaned text, one entry per row of df, same order
for text in texts:
    # remove handles (@name)
    text = re.sub(r'@[\w]*', '', text)

    # remove urls
    text = re.sub(r'http[^ ]*', '', text)
    # Image links such as "pic.twitter.com/...". The dot is escaped and a word
    # boundary is required; the unescaped 'pic.' also matched "pic" followed by
    # ANY character, so "topic discussion" lost "pic discussion".
    text = re.sub(r'\bpic\.[^ ]*', '', text)

    # replace punctuation, numbers and special characters with spaces
    # ('#' is deliberately kept)
    text = re.sub(r'[^A-Za-z#]', ' ', text)

    # to lowercase
    text = text.lower()

    # remove stopwords
    tokens = [tok for tok in text.split() if tok not in stopword_set]

    # stemming, then lemmatizing the stem; a stem never contains whitespace,
    # so doing both per token equals the original join/split round trip
    tokens = [wordnet.lemmatize(pt.stem(tok)) for tok in tokens]

    cleaned_text.append(' '.join(tokens))

# A list is assigned positionally, so row i receives cleaned_text[i].
df['cleaned'] = cleaned_text
df.head() # preview

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features for the training corpus. min_df=2 drops terms seen in
# fewer than two documents; max_features=1000 caps the vocabulary size.
bow = CountVectorizer(min_df=2,max_features=1000)
# Learn the vocabulary and encode the corpus in one pass, then densify.
bow_df = bow.fit_transform(df['cleaned']).toarray()

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); support both so the cell runs on old and new versions.
if hasattr(bow, 'get_feature_names_out'):
    bow_vocab = bow.get_feature_names_out().tolist()
else:
    bow_vocab = bow.get_feature_names()

print(bow_vocab) # show words
print(bow_df) # shows feature vector
print(bow_df.shape[1]) # number of vocabulary terms kept (feature columns)

bow_train = pd.DataFrame(bow_df)
print(bow_train)
# 0 = fake, 1 = true; df was built as [fake_data, true_data], so the labels
# follow the same order.
label = [0] * len(fake_data) + [1] * len(true_data)
bow_train['label'] = label
bow_train.head() # preview

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Target vector: the 'label' column appended as the last column of bow_train.
y = bow_train['label']

# Feature matrix: every column except that trailing label column.
bow_feature_cols = bow_train.columns[:-1]
x = bow_train.loc[:, bow_feature_cols]

print(x)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
# Train and score a k-nearest-neighbours classifier on the bag-of-words features.
iteration = 1  # number of independent train/test repetitions
k = 7          # neighbours consulted per prediction
seed = 42      # base seed so the random split is reproducible on re-run

bow_acc = []   # accuracy of each repetition
for i in range(iteration):
    # split data into train data, test data (80/20); a different but
    # deterministic seed per repetition keeps the runs distinct yet repeatable
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=seed + i)

    # classifier
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(x_train,y_train)
    y_pred = model.predict(x_test)

    # accuracy_score expects (y_true, y_pred)
    acc = accuracy_score(y_test, y_pred)
    bow_acc.append(acc)
    print(f'[{i+1}] accuracy = {acc}')

# `model` now holds the classifier from the last repetition; it is reused
# later for the external evaluation.
print(f"AVG Accurancy: {sum(bow_acc)/iteration}")




In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features for the training corpus. min_df=2 drops terms seen in fewer
# than two documents; max_features=1000 caps the vocabulary size.
tfidf = TfidfVectorizer(min_df=2,max_features=1000)
# Learn vocabulary and IDF weights and encode the corpus in one pass, then densify.
tfidf_df = tfidf.fit_transform(df['cleaned']).toarray()

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); support both so the cell runs on old and new versions.
if hasattr(tfidf, 'get_feature_names_out'):
    tfidf_vocab = tfidf.get_feature_names_out().tolist()
else:
    tfidf_vocab = tfidf.get_feature_names()

print(tfidf_vocab) # show words
print(tfidf_df) # shows feature vector
print(tfidf_df.shape[1]) # number of vocabulary terms kept (feature columns)

tfidf_train = pd.DataFrame(tfidf_df)
# 0 = fake, 1 = true; df was built as [fake_data, true_data], so the labels
# follow the same order.
label = [0] * len(fake_data) + [1] * len(true_data)
tfidf_train['label'] = label
tfidf_train.head() # preview


In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Target vector: the 'label' column appended as the last column of tfidf_train.
y_tfidf = tfidf_train['label']

# Feature matrix: every column except that trailing label column.
tfidf_feature_cols = tfidf_train.columns[:-1]
x_tfidf = tfidf_train.loc[:, tfidf_feature_cols]

print(x_tfidf)

In [None]:
from sklearn.model_selection import train_test_split
# classifier
from sklearn.neighbors import KNeighborsClassifier
# Train and score a k-nearest-neighbours classifier on the TF-IDF features.
iteration = 1  # number of independent train/test repetitions
k = 7          # neighbours consulted per prediction
seed = 42      # base seed so the random split is reproducible on re-run

tfidf_arr = []  # accuracy of each repetition
for i in range(iteration):
    # split data into train data, test data (80/20); a different but
    # deterministic seed per repetition keeps the runs distinct yet repeatable
    x_train, x_test, y_train, y_test = train_test_split(
        x_tfidf, y_tfidf, test_size=0.2, random_state=seed + i)

    model_tfidf = KNeighborsClassifier(n_neighbors=k)
    model_tfidf.fit(x_train,y_train)
    y_pred = model_tfidf.predict(x_test)

    # accuracy_score expects (y_true, y_pred)
    acc = accuracy_score(y_test, y_pred)
    tfidf_arr.append(acc)
    print(f'[{i+1}] accuracy = {acc}')

# `model_tfidf` now holds the classifier from the last repetition; it is
# reused later for the external evaluation.
print(f"AVG Accurancy: {sum(tfidf_arr)/iteration}")

In [None]:
# Second, independent corpus used only for evaluation.
test_data = pd.read_csv('../input/textdb3/fake_or_real_news.csv')

# Partition the rows by their string label.
is_real = test_data['label'] == 'REAL'
is_fake = test_data['label'] == 'FAKE'
real_news = test_data[is_real]
fake_news = test_data[is_fake]

# Stack fake rows first, real rows second; the label vectors built further
# down rely on this order.
test_news = pd.concat([fake_news, real_news])

# Article bodies as a plain Python list, in the same row order as test_news.
text_news = list(test_news['text'])


In [None]:
import re # Regular Expression

import nltk # Natural Language Toolkit
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
from nltk.stem import PorterStemmer,WordNetLemmatizer
nltk.download('wordnet')

# Normalise every evaluation article into a space-separated string of tokens.
# NOTE: training and evaluation texts must be cleaned with the same rules,
# otherwise the vectorizers see different tokens for the same words.

# Set membership is O(1); the original list was scanned once per token.
stopword_set = set(stopwords)

pt = PorterStemmer()
wordnet = WordNetLemmatizer()

cleaned_text = []  # cleaned text, one entry per row of test_news, same order
for text in text_news:
    # remove handles (@name)
    text = re.sub(r'@[\w]*', '', text)

    # remove urls
    text = re.sub(r'http[^ ]*', '', text)
    # Image links such as "pic.twitter.com/...". The dot is escaped and a word
    # boundary is required; the unescaped 'pic.' also matched "pic" followed by
    # ANY character, so "topic discussion" lost "pic discussion".
    text = re.sub(r'\bpic\.[^ ]*', '', text)

    # replace punctuation, numbers and special characters with spaces
    # ('#' is deliberately kept)
    text = re.sub(r'[^A-Za-z#]', ' ', text)

    # to lowercase
    text = text.lower()

    # remove stopwords
    tokens = [tok for tok in text.split() if tok not in stopword_set]

    # stemming, then lemmatizing the stem; a stem never contains whitespace,
    # so doing both per token equals the original join/split round trip
    tokens = [wordnet.lemmatize(pt.stem(tok)) for tok in tokens]

    cleaned_text.append(' '.join(tokens))

# A list is assigned positionally, so row i receives cleaned_text[i].
test_news['cleaned'] = cleaned_text
test_news.head(10)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Encode the evaluation articles with the vectorizer that was ALREADY fitted on
# the training corpus. Fitting a fresh CountVectorizer here (as the original
# did) learns a different vocabulary, so column j would stand for a different
# word than column j of the data `model` was trained on, and its predictions
# on such a matrix would be meaningless.
bow_df = bow.transform(test_news['cleaned']).toarray() # formatting

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); support both so the cell runs on old and new versions.
if hasattr(bow, 'get_feature_names_out'):
    bow_vocab = bow.get_feature_names_out().tolist()
else:
    bow_vocab = bow.get_feature_names()

print(bow_vocab) # show words (the training vocabulary)
print(bow_df) # shows feature vector
print(bow_df.shape[1]) # number of feature columns

# The name `bow_train` is kept because later cells read it; from here on it
# holds the evaluation-set features, not the training features.
bow_train = pd.DataFrame(bow_df)
# 0 = fake, 1 = real; test_news was built as [fake_news, real_news].
label = [0] * len(fake_news) + [1] * len(real_news)
bow_train['label'] = label
bow_train.head(10) # preview

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Encode the evaluation articles with the vectorizer that was ALREADY fitted on
# the training corpus. Fitting a fresh TfidfVectorizer here (as the original
# did) learns a different vocabulary and different IDF weights, so the columns
# would no longer match the features `model_tfidf` was trained on.
tfidf_df = tfidf.transform(test_news['cleaned']).toarray()

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); support both so the cell runs on old and new versions.
if hasattr(tfidf, 'get_feature_names_out'):
    tfidf_vocab = tfidf.get_feature_names_out().tolist()
else:
    tfidf_vocab = tfidf.get_feature_names()

print(tfidf_vocab) # show words (the training vocabulary)
print(tfidf_df) # shows feature vector
print(tfidf_df.shape[1]) # number of feature columns

# The name `tfidf_train` is kept because later cells read it; from here on it
# holds the evaluation-set features, not the training features.
tfidf_train = pd.DataFrame(tfidf_df)
# 0 = fake, 1 = real; test_news was built as [fake_news, real_news].
label = [0] * len(fake_news) + [1] * len(real_news)
tfidf_train['label'] = label


In [None]:
# Score both trained classifiers on the external evaluation set.
# Requirement: the feature columns must have been produced with the same
# vocabulary the corresponding model was trained on; otherwise the accuracy
# below says nothing about the model.

# --- bag-of-words model ---
x_bow = bow_train.iloc[:,0:-1]   # every column except the trailing label
y_bow = bow_train['label']

y_bow_pred = model.predict(x_bow)
# accuracy_score expects (y_true, y_pred)
acc_bow = accuracy_score(y_bow, y_bow_pred)
print(f"bow acc: {acc_bow}")

# --- TF-IDF model ---
# NOTE: this rebinds x_tfidf / y_tfidf, which earlier held the training split inputs.
x_tfidf = tfidf_train.iloc[:,0:-1]   # every column except the trailing label
y_tfidf = tfidf_train['label']

y_tfidf_pred = model_tfidf.predict(x_tfidf)
# accuracy_score expects (y_true, y_pred)
acc_tfidf = accuracy_score(y_tfidf, y_tfidf_pred)
print(f"tfidf acc: {acc_tfidf}")