# Import Libraries

In [72]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA 
from sklearn.cluster import KMeans
import scipy.cluster.hierarchy as shc
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
import pickle

# Load the Dataset

In [52]:
news_data=pd.read_table("../data/news_data.tsv", delimiter="\t")

In [53]:
news_data

Unnamed: 0,2635.json,false,Says the Annies List political group supports third-trimester abortions on demand.,abortion,dwayne-bohac,State representative,Texas,republican,0,1,0.1,0.2,0.3,a mailer
0,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
1,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
2,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
3,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN
4,12465.json,true,The Chicago Bears have had more starting quart...,education,robin-vos,Wisconsin Assembly speaker,Wisconsin,republican,0.0,3.0,2.0,5.0,1.0,a an online opinion-piece
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10234,5473.json,mostly-true,There are a larger number of shark attacks in ...,"animals,elections",aclu-florida,,Florida,none,0.0,1.0,1.0,1.0,0.0,"interview on ""The Colbert Report"""
10235,3408.json,mostly-true,Democrats have now become the party of the [At...,elections,alan-powell,,Georgia,republican,0.0,0.0,0.0,1.0,0.0,an interview
10236,3959.json,half-true,Says an alternative to Social Security that op...,"retirement,social-security",herman-cain,,Georgia,republican,4.0,11.0,5.0,3.0,3.0,a Republican presidential debate
10237,2253.json,false,On lifting the U.S. Cuban embargo and allowing...,"florida,foreign-policy",jeff-greene,,Florida,democrat,3.0,1.0,3.0,0.0,0.0,a televised debate on Miami's WPLG-10 against ...


# Drop all the unnecessary columns

In [54]:
news_data.drop(columns=['2635.json', 'abortion', 'dwayne-bohac', 'State representative', 'Texas', 'republican', '0', '1', '0.1', '0.2', '0.3', 'a mailer'], inplace=True)

In [55]:
news_data.head()

Unnamed: 0,false,Says the Annies List political group supports third-trimester abortions on demand.
0,half-true,When did the decline of coal start? It started...
1,mostly-true,"Hillary Clinton agrees with John McCain ""by vo..."
2,false,Health care reform legislation is likely to ma...
3,half-true,The economic turnaround started at the end of ...
4,true,The Chicago Bears have had more starting quart...


### Set column names

In [56]:
news_data.columns=['label','news_text']

In [57]:
news_data.head()

Unnamed: 0,label,news_text
0,half-true,When did the decline of coal start? It started...
1,mostly-true,"Hillary Clinton agrees with John McCain ""by vo..."
2,false,Health care reform legislation is likely to ma...
3,half-true,The economic turnaround started at the end of ...
4,true,The Chicago Bears have had more starting quart...


In [58]:
news_data.shape

(10239, 2)

# Feature Engineering

## Checking for null values

In [59]:
news_data.isnull().sum()

label        0
news_text    0
dtype: int64

## Count the total number of labels present in the dataset

In [60]:
each_label_count = news_data['label'].value_counts()
each_label_count

label
half-true      2114
false          1994
mostly-true    1962
true           1676
barely-true    1654
pants-fire      839
Name: count, dtype: int64

Since the results show that our dataset is imbalanced, we need to create a balanced dataset, to ge quality results at the end

# Create a balanced dataset of 204 datapoints in total

Start by filtering the data first each class and saving them in a variable

In [61]:
news_data_half_true_only=news_data[news_data['label']=='half-true']
news_data_false_only=news_data[news_data['label']=='false']
news_data_mostly_true_only=news_data[news_data['label']=='mostly-true']
news_data_true_only=news_data[news_data['label']=='true']
news_data_barely_true_only=news_data[news_data['label']=='barely-true']
news_data_pants_fire_only=news_data[news_data['label']=='pants-fire']

Take 34 data point from each class, where we will have 204 in total. In this case, we guarantee that our dataset is balanced. 

In [62]:
news_data_half_true_only_sample=news_data_half_true_only[0:34]
news_data_false_only_sample=news_data_false_only[0:34]
news_data_mostly_true_only_sample=news_data_mostly_true_only[0:34]
news_data_true_only_sample=news_data_true_only[0:34]
news_data_barely_true_only_sample=news_data_barely_true_only[0:34]
news_data_pants_fire_only_sample=news_data_pants_fire_only[0:34]

Now concatenate the new dataset with the balanced data points. In order to concatenate a pandas dataframe, we use the built in function with pandas pd.concat([here we pass the dataframes])

In [63]:
new_news_dataset = pd.concat([news_data_half_true_only_sample, news_data_false_only_sample,
                              news_data_mostly_true_only_sample,news_data_true_only_sample,
                              news_data_barely_true_only_sample,news_data_pants_fire_only_sample],axis=0)

In [64]:
new_news_dataset.head()

Unnamed: 0,label,news_text
0,half-true,When did the decline of coal start? It started...
3,half-true,The economic turnaround started at the end of ...
6,half-true,I'm the only person on this stage who has work...
7,half-true,"However, it took $19.5 million in Oregon Lotte..."
10,half-true,"Since 2000, nearly 12 million Americans have s..."


In [65]:
new_news_dataset.shape

(204, 2)

# Preprocessing the text

In [66]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to C:\Users\Souad
[nltk_data]     Khalifeh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [67]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to C:\Users\Souad
[nltk_data]     Khalifeh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Initialize the stemmer

In [68]:
ps=PorterStemmer()

In [73]:
def preprocess_text(text):
    
    text=text.lower()
    text=nltk.word_tokenize(text)
    
    word_arr=[]
    # check if text is alpha numeric
    for i in text: 
        if i.isalnum():
            word_arr.append(i)
            
    #copy the word_arr to text array
    text=word_arr.copy(deep=True)      
    # clear the list  
    word_arr.clear()
    
    # check if the word is not a stop word nor a punctuation
    for i in text:
        if i not in stopwords.words("english") and i not in string.punctuation:
            word_arr.append(i)
            
    text=word_arr.copy(deep=True)
    word_arr.clear()        
    
    for i in text: 
        word_arr.append(ps.stem(i))
        
    return " ".join(word_arr)    

# Create a new column in the dataset and inserting all the pre-processed texts inside it 

In [74]:
new_news_dataset['preprocessed_texts']=new_news_dataset['news_text'].apply(preprocess_text)

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\Souad Khalifeh/nltk_data'
    - 'c:\\Users\\Souad Khalifeh\\.vscode\\Unsupervised_Learning\\venv\\nltk_data'
    - 'c:\\Users\\Souad Khalifeh\\.vscode\\Unsupervised_Learning\\venv\\share\\nltk_data'
    - 'c:\\Users\\Souad Khalifeh\\.vscode\\Unsupervised_Learning\\venv\\lib\\nltk_data'
    - 'C:\\Users\\Souad Khalifeh\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************
