In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Raise pandas' display limits so frames of up to 500 rows / 500 columns
# are rendered without truncation.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

In [None]:
# Read the four CSV files from the banfakenews input folder; cells holding
# the literal '#NAME?' are parsed as missing values.
df1, df2, df3, df4 = [
    pd.read_csv(csv_path, na_values=['#NAME?'])
    for csv_path in (
        '../input/banfakenews/Authentic-48K.csv',
        '../input/banfakenews/Fake-1K.csv',
        '../input/banfakenews/LabeledAuthentic-7K.csv',
        '../input/banfakenews/LabeledFake-1K.csv',
    )
]

In [None]:
# (rows, columns) of df1, the frame read from Authentic-48K.csv.
df1.shape

In [None]:
# (rows, columns) of df2, the frame read from Fake-1K.csv.
df2.shape

In [None]:
# (rows, columns) of df3, the frame read from LabeledAuthentic-7K.csv.
df3.shape

In [None]:
# (rows, columns) of df4, the frame read from LabeledFake-1K.csv.
df4.shape

In [None]:
# Stack the four frames row-wise into one dataset.
# Each CSV was read with its own default 0-based index, so a plain concat would
# repeat the same labels up to four times. ignore_index=True gives the combined
# frame a fresh, unique 0..n-1 index, which keeps later label-based steps
# (sort_index, index alignment) unambiguous.
df = pd.concat([df1, df2, df3, df4], ignore_index=True)

In [None]:
# Display the combined frame.
df

In [None]:
# (rows, columns) of the combined frame.
df.shape

In [None]:
# Remove the `date` column from the combined frame (modifies `df` in place).
df.drop(columns=['date'], inplace=True)

In [None]:
# Preview the first rows now that the `date` column has been dropped.
df.head()

In [None]:
# Count missing values per column and show the five columns with the most gaps.
missing_per_column = df.isnull().sum()
missing_per_column.sort_values(ascending=False).head()

In [None]:
# Remove the `relation` and `F-type` columns (modifies `df` in place).
df.drop(columns=['relation', 'F-type'], inplace=True)

In [None]:
# Preview the first rows after dropping `relation` and `F-type`.
df.head()

In [None]:
# Replace the literal string 'NaN' in `source` with 'Not Found'.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on `df['source']`: that chained in-place form acts
# on an intermediate object, which pandas warns about and which does not update
# `df` when Copy-on-Write is active.
# Note this only matches the text 'NaN'; truly missing values are not affected here.
df['source'] = df['source'].replace('NaN', 'Not Found')

In [None]:
# Preview the first rows after the 'NaN' -> 'Not Found' replacement on `source`.
df.head()

In [None]:
# Fill missing `source` values with the placeholder 'Not Found'; other columns
# are left untouched.
source_fill = {'source': 'Not Found'}
df = df.fillna(value=source_fill)
df

In [None]:
# Re-check missing values per column after filling `source`; show the top five.
remaining_missing = df.isnull().sum()
remaining_missing.sort_values(ascending=False).head()

In [None]:
# Summarise how many columns there are of each dtype
# (a quick view of categorical/object vs numeric features).
dtype_per_column = df.dtypes.sort_values().to_frame('feature_type')
dtype_counts = dtype_per_column.groupby(by='feature_type').size().to_frame('count')
dtype_counts.reset_index()

In [None]:
# Number of rows per `category` value.
category_counts = df['category'].value_counts()
print(category_counts)

In [None]:
# Remove fully duplicated rows, keeping the first occurrence (modifies `df` in place).
df.drop_duplicates(subset=None, keep='first', inplace=True)

In [None]:
# Pull out the two categorical columns that are one-hot encoded below.
x = df['category']
y = df['source']

In [None]:
# One-hot encode the category Series; resulting columns are prefixed 'category'.
category_dummies = pd.get_dummies(
    data=x,
    prefix='category',
    columns=['category'],
)

In [None]:
# Append the category indicator columns to the right of the existing columns.
df = pd.concat(objs=[df, category_dummies], axis='columns')

In [None]:
# Preview the first rows after appending the category indicator columns.
df.head()

In [None]:
# Number of rows per `source` value.
source_counts = df['source'].value_counts()
print(source_counts)

In [None]:
# One-hot encode the source Series; resulting columns are prefixed 'source'.
source_dummies = pd.get_dummies(
    data=y,
    prefix='source',
    columns=['source'],
)

In [None]:
# Append the source indicator columns to the right of the existing columns.
df = pd.concat(objs=[df, source_dummies], axis='columns')

In [None]:
# Preview the first rows after appending the source indicator columns.
df.head()

In [None]:
# The original `category` and `source` columns are now represented by their
# indicator columns, so remove them (modifies `df` in place).
df.drop(columns=['category', 'source'], inplace=True)

In [None]:
# Preview the first five rows after removing the original `category` and `source` columns.
df.head(5)

In [None]:
# (rows, columns) of the frame after one-hot encoding.
df.shape

In [None]:
# Work on the headline text as a standalone Series.
h = df['headline']

In [None]:
# First five headlines.
h.head(5)

In [None]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text):
    '''First cleaning pass over one piece of text.

    Lowercases the text, removes anything enclosed in square brackets, strips
    every character in ``string.punctuation`` and drops tokens that contain
    a digit.

    Args:
        text: The raw text. Non-string values (e.g. ``None`` or the float NaN
            that pandas uses for a missing cell) are treated as empty text.

    Returns:
        The cleaned string; ``''`` for non-string input.
    '''
    # Missing cells reach this function as float NaN when applied to a pandas
    # Series; calling .lower() on them would raise AttributeError.
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # Raw strings: '\[' , '\w' and '\d' are invalid escape sequences in normal
    # string literals and trigger warnings on current Python versions.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text


def round1(x):
    '''Apply ``clean_text_round1`` to ``x`` (convenience callable for ``Series.apply``).'''
    return clean_text_round1(x)

In [None]:
# Order the headlines by index label.
# (The former `h.columns = ['headline']` was dropped: `h` is a Series, so that
# assignment only attached an unused attribute, which sort_index() does not
# carry over to the new Series anyway. The Series keeps its existing name.)
h = h.sort_index()
h

In [None]:
# Keep only the first occurrence of each distinct headline.
h = h.drop_duplicates(keep='first')
h

In [None]:
# Run the first cleaning pass over every headline and keep the result as a
# one-column DataFrame.
cleaned_headlines = h.apply(round1)
data_clean = cleaned_headlines.to_frame()
data_clean

In [None]:
# Second cleaning pass
def clean_text_round2(text):
    '''Strip typographic quotes, ellipsis characters and newlines left over after the first pass.'''
    for leftover_pattern in ('[‘’“”…]', '\n'):
        text = re.sub(leftover_pattern, '', text)
    return text


def round2(x):
    '''Apply ``clean_text_round2`` to ``x`` (convenience callable for ``Series.apply``).'''
    return clean_text_round2(x)

In [None]:
# Run the second cleaning pass over the already-cleaned headlines.
headlines_round2 = data_clean['headline'].apply(round2)
data_clean = headlines_round2.to_frame()
data_clean

In [None]:
# Build a document-term matrix for the headlines with CountVectorizer:
# one row per cleaned headline, one column per token.
from sklearn.feature_extraction.text import CountVectorizer


# The text is already cleaned, so tokens are simply the whitespace-separated pieces.
cv = CountVectorizer(tokenizer=lambda x: x.split())
data_cv = cv.fit_transform(data_clean.headline)
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() and fall back only on releases that predate it.
headline_terms = (
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)
# NOTE(review): .toarray() materialises the full dense matrix; this may be
# large for a big vocabulary — consider keeping it sparse if memory is tight.
data_dtm = pd.DataFrame(data_cv.toarray(), columns=headline_terms)
data_dtm.index = data_clean.index
data_dtm

In [None]:
# Work on the article body text as a standalone Series.
c = df['content']

In [None]:
# First rows of the content Series.
c.head()

In [None]:
# Order the content entries by index label.
# (The former `c.columns = ['headline']` was dropped: `c` is a Series, so that
# assignment only attached an unused attribute — and with the wrong label.
# The Series keeps its name `content`, which the later cleaning step relies on.)
c = c.sort_index()
c

In [None]:
# Keep only the first occurrence of each distinct content entry.
c = c.drop_duplicates(keep='first')
c

In [None]:
# Run the first cleaning pass over every content entry and keep the result as
# a one-column DataFrame.
cleaned_content = c.apply(round1)
data_clean_content = cleaned_content.to_frame()
data_clean_content

In [None]:
# Run the second cleaning pass over the already-cleaned content.
content_round2 = data_clean_content['content'].apply(round2)
data_clean_content = content_round2.to_frame()
data_clean_content

In [None]:
# Build a document-term matrix for the article bodies with CountVectorizer:
# one row per cleaned content entry, one column per token.
# Note: this re-fits the shared `cv`, so its vocabulary afterwards is the
# content vocabulary, no longer the headline one.

data_cv_2 = cv.fit_transform(data_clean_content.content)
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() and fall back only on releases that predate it.
content_terms = (
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)
# NOTE(review): .toarray() materialises the full dense matrix; this may be
# large for long documents — consider keeping it sparse if memory is tight.
data_dtm_2 = pd.DataFrame(data_cv_2.toarray(), columns=content_terms)
# Label rows with the content frame's own index. The previous code used
# `data_clean.index` (the headline frame), which is de-duplicated separately
# and therefore need not have the same length or labels.
data_dtm_2.index = data_clean_content.index
data_dtm_2