In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Imports**

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import string # python library
import re # regex library
import matplotlib.pyplot as plt
#preprocessing
from gensim.parsing.preprocessing import preprocess_string 
from gensim.parsing.preprocessing import strip_tags #remove tags
from gensim.parsing.preprocessing import strip_multiple_whitespaces #remove multiple whitespace
from gensim.parsing.preprocessing import strip_numeric #remove digits
from gensim.parsing.preprocessing import strip_punctuation #Replace punctuation characters with spaces
from gensim.parsing.preprocessing import strip_short #Remove words shorter than 3 characters
from gensim.parsing.preprocessing import remove_stopwords #Remove STOPWORDS

In [None]:
# Read the two CSV files into separate frames: `fake` from Fake.csv and
# `true` from True.csv.
fake, true = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/fake-and-real-news-dataset/Fake.csv',
        '../input/fake-and-real-news-dataset/True.csv',
    )
)
# Bare last expression so the notebook renders the `true` frame.
true

# Data Cleanup

In [None]:
# Build one labelled, shuffled frame (`final_data`) out of `fake` and `true`.

# Merge title and body text into a single 'Sentences' column.
fake['Sentences'] = fake['title'] + ' ' + fake['text']
true['Sentences'] = true['title'] + ' ' + true['text']

# Label convention used by every later cell: 0 = fake, 1 = true.
fake['Label'] = 0
true['Label'] = 1

# Both frames now carry their label, so they can be stacked.
final_data = pd.concat([fake, true])

# Shuffle the rows so the two classes are interleaved. The seed is fixed
# because later cells slice the first 10000 rows: without it every
# "Restart & Run All" would cluster and score a different subset.
final_data = final_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Keep only 'Sentences' and 'Label'; the source columns are no longer needed.
final_data = final_data.drop(['title', 'text', 'subject', 'date'], axis = 1)

final_data.head(10)

# **Preprocessing**

In [None]:
# Here we preprocess the sentences
def remove_URL(s):
    """Return ``s`` with every URL-like substring deleted.

    A match is an ``http://`` or ``https://`` link, a ``www.`` host, or a
    ``bit.ly`` short link, each extended up to the next whitespace character.
    Matches are replaced by the empty string, so surrounding spaces remain.
    """
    url_pattern = re.compile(r'https?://\S+|www\.\S+|bit\.ly\S+')
    cleaned = url_pattern.sub('', s)
    return cleaned

# Filters applied in order by preprocess_string: lowercase, strip tags,
# drop URLs, strip punctuation, collapse whitespace, strip digits, drop
# stopwords, drop short words.
FILTERS = [lambda x: x.lower(), strip_tags, remove_URL, strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short]

# Parallel lists: processed_data[i] is the token list of a document and
# processed_labels[i] is that document's label.
processed_data = []
processed_labels = []

# Iterate the two columns directly instead of DataFrame.iterrows(), which
# builds a full Series object for every row and is far slower on large frames.
for sentence, label in zip(final_data['Sentences'], final_data['Label']):
    words_broken_up = preprocess_string(sentence, FILTERS)
    # Skip documents that have no tokens left after preprocessing.
    if len(words_broken_up) > 0:
        processed_data.append(words_broken_up)
        processed_labels.append(label)

In [None]:
from gensim.models import Word2Vec # Word2vec

# Fit Word2Vec on the tokenised documents. min_count=1 keeps every token,
# including those that occur only once.
model = Word2Vec(sentences=processed_data, min_count=1)

In [None]:
# Length of the embedding learned for the token "trump".
# Word vectors are read through `model.wv`: indexing the Word2Vec model
# itself is deprecated in gensim 3.x and raises a TypeError in gensim 4.x.
len(model.wv["trump"])

In [None]:
# Getting the vector of a sentence based on average of all the word vectors in the sentence
def ReturnVector(x):
    """Return the trained Word2Vec embedding for the token ``x``.

    Parameters
    ----------
    x : str
        A single token.

    Returns
    -------
    The vector stored in ``model.wv`` for ``x``, or an all-zero vector of
    length ``model.wv.vector_size`` when the token is not in the vocabulary.
    """
    try:
        # Look the token up through `model.wv`. Indexing the model directly
        # (`model[x]`) raises TypeError on gensim 4.x, and the former bare
        # `except:` swallowed that error, silently turning every word vector
        # into zeros.
        return model.wv[x]
    except KeyError:
        # Only an unknown token falls back to zeros; any other error surfaces.
        return np.zeros(model.wv.vector_size)
    
def Sentence_Vector(sentence):
    """Average the word vectors of ``sentence`` into one vector.

    ``sentence`` is an iterable of tokens; each token is mapped through
    ``ReturnVector`` and the element-wise mean over all tokens is returned
    as a plain Python list.
    """
    token_vectors = [ReturnVector(token) for token in sentence]
    mean_vector = np.average(token_vectors, axis=0)
    return mean_vector.tolist()

# One averaged sentence vector per preprocessed document, in the same order
# as processed_data.
X = [Sentence_Vector(data_x) for data_x in processed_data]

In [None]:
# Convert the list of sentence vectors to a NumPy array for the clustering cells.
X_np = np.array(X)
# Display the array shape as a sanity check.
X_np.shape

In [None]:
from sklearn import cluster # Kmeans clustering
# Training for 2 clusters (Fake and Real).
# random_state is fixed so that the centroid initialisation, and therefore
# the cluster assignments and the scores computed from them, are reproducible
# across runs.
kmeans = cluster.KMeans(n_clusters=2, random_state=42, verbose=1)

# fit_predict fits the model and returns one cluster label per row of X_np.
clustered = kmeans.fit_predict(X_np)

In [None]:
# Side-by-side view of tokens, true label and predicted cluster.
testing_df = pd.DataFrame(
    data={
        'Sentence': processed_data,
        'Labels': processed_labels,
        'Prediction': clustered,
    }
)

testing_df.head(10)

In [None]:
# Scatter of the first two components of each sentence vector, coloured by
# the cluster label held in `clustered`.
first_component, second_component = X_np[:, 0], X_np[:, 1]
plt.scatter(first_component, second_component, c=clustered, cmap='Paired')

In [None]:
from sklearn import metrics
# purity_score Function
def purity_score(y_true, y_pred):
    """Compute the purity of a clustering against reference labels.

    Builds the contingency matrix of ``y_true`` versus ``y_pred``, takes the
    largest count in each column (``axis=0``), sums those maxima and divides
    by the total number of samples in the matrix.
    """
    table = metrics.cluster.contingency_matrix(y_true, y_pred)
    dominant_counts = table.max(axis=0)
    return dominant_counts.sum() / table.sum()

In [None]:
# Purity of the cluster labels in `clustered` against the true labels, as a percentage.
kmeans_purity = purity_score(processed_labels, clustered)
print("Purity score for the clustered news: %0.2f%% " % (kmeans_purity * 100))

In [None]:
from sklearn.metrics.cluster import adjusted_rand_score
# Adjusted Rand index of `clustered` against the true labels, scaled by 100.
kmeans_ari = adjusted_rand_score(processed_labels, clustered)
print("Adjusted rand score for the clustered news ARI: %0.2f%% " % (kmeans_ari * 100))

In [None]:
from sklearn.cluster import AgglomerativeClustering
# Agglomerative clustering on the first 10000 sentence vectors only.
s_data = X_np[:10000]
agglomerative = AgglomerativeClustering(n_clusters=2)
# NOTE: this rebinds `clustered`, replacing the labels from the KMeans cell.
clustered = agglomerative.fit_predict(s_data)
print(clustered)

In [None]:
# Tokens, true label and predicted cluster for the first 10000 documents.
testing_df = pd.DataFrame(
    data={
        'Sentence': processed_data[:10000],
        'Labels': processed_labels[:10000],
        'Prediction': clustered,
    }
)

testing_df.head(10)

In [None]:
# Purity of `clustered` against the true labels of the first 10000 documents.
subset_purity = purity_score(processed_labels[:10000], clustered)
print("Purity score for the clustered news: %0.2f%% " % (subset_purity * 100))

In [None]:
from sklearn.metrics.cluster import adjusted_rand_score
# Adjusted Rand index of `clustered` on the first 10000 documents, scaled by 100.
subset_ari = adjusted_rand_score(processed_labels[:10000], clustered)
print("Adjusted rand score for the clustered news ARI: %0.2f%% " % (subset_ari * 100))

In [None]:
# Scatter of the first two components of the subset vectors, coloured by
# the cluster label held in `clustered`.
first_component, second_component = s_data[:, 0], s_data[:, 1]
plt.scatter(first_component, second_component, c=clustered, cmap='Paired')

In [None]:
from sklearn.cluster import DBSCAN
# DBSCAN clustering on the same subset used for agglomerative clustering.
db_cluster = DBSCAN(eps=5.5, min_samples=2)
arr = db_cluster.fit_predict(s_data)

# Distinct labels and how many samples received each one.
uni, counts = np.unique(arr, return_counts=True)

print ("Clusters assigned are:", set(db_cluster.labels_))
print (len(uni))
d = dict(zip(uni, counts))
print (d)

In [None]:
# Purity of the DBSCAN labels in `arr` against the true labels of the first 10000 documents.
dbscan_purity = purity_score(processed_labels[:10000], arr)
print("Purity score for the clustered news: %0.2f%% " % (dbscan_purity * 100))

In [None]:
from sklearn.metrics.cluster import adjusted_rand_score
# Adjusted Rand index of the DBSCAN labels in `arr`, scaled by 100.
dbscan_ari = adjusted_rand_score(processed_labels[:10000], arr)
print("Adjusted rand score for the clustered news ARI: %0.2f%% " % (dbscan_ari * 100))

In [None]:
# Scatter of the first two components of the subset vectors, coloured by
# the DBSCAN label held in `arr`.
first_component, second_component = s_data[:, 0], s_data[:, 1]
plt.scatter(first_component, second_component, c=arr, cmap='Paired')