# Link to the Google Colab notebook
https://colab.research.google.com/drive/1ZzcvEcBzFBH9Y-4V4kHY6_hvOMnxqk6f?usp=sharing

In [21]:
#import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
#import nltk
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
lm = nltk.WordNetLemmatizer()
stemmer = PorterStemmer()
#import regex
import re
import pickle

In [2]:
# Fetch the NLTK resources the helpers below rely on: the stop-word corpus
# (read by stop_words), the Punkt tokenizer data (presumably what
# word_tokenize needs) and WordNet (for the WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ram_o\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ram_o\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ram_o\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Load the corpus from a CSV file addressed by a relative path.
# The bare `df` on the last line renders the frame as the cell output.
df = pd.read_csv("Philippine Fake News Corpus.csv")
df

Unnamed: 0,Headline,Content,Authors,Date,URL,Brand,Label
0,PH ranks 2nd in Asia-Pacific in deaths due to ...,Pollution caused by traditional cooking fuel i...,['Philippine Daily Inquirer'],,https://newsinfo.inquirer.net/987262/ph-ranks-...,Inquirer,Credible
1,"Aguirre, PCSO chief deny plotting to kill ‘Ato...",Justice Secretary Vitaliano Aguirre 2nd and Ph...,['Jomar Canlas'],2017-04-28 20:12:54+00:00,https://www.manilatimes.net/aguirre-pcso-chief...,Manila Times,Credible
2,Duterte says charges vs ex-President will fail,President Rodrigo Duterte on Monday night desc...,['Christine O. Avendaño'],,https://newsinfo.inquirer.net/914727/duterte-s...,Inquirer,Credible
3,Group warns BFAR on law enforcement fund,THE militant fisher folk group Pambansang Laka...,['Neil Alcober'],2017-08-12 19:54:48+00:00,https://www.manilatimes.net/group-warns-bfar-l...,Manila Times,Credible
4,Solon asks Duterte for jet ski to Panatag,Magdalo Rep. Gary Alejano is willing to lead t...,['Dj Yap'],,https://newsinfo.inquirer.net/882744/solon-ask...,Inquirer,Credible
...,...,...,...,...,...,...,...
22453,Philippine Elections 2016 is not a vote for Du...,"Indeed, everybody is shocked — just shocked! —...",[],2016-04-28 05:35:44+00:00,https://www.getrealphilippines.com/2016/04/phi...,Get Real Philippines,Not Credible
22454,"28,000 families affected by floods receive aid","A TOTAL of 132,259 individuals from 28,101 fam...",['Mary Gleefer F. Jalea'],2018-08-15 00:03:17+00:00,https://www.manilatimes.net/28000-families-aff...,Manila Times,Credible
22455,Singer Jim Paredes Miffed That He Wasn’t Invit...,Shortly after Rod Duterte announced there will...,['Pol Pinoy'],2016-06-16 00:00:00,https://adobochronicles.com/2016/06/16/singer-...,Adobo Chronicles,Not Credible
22456,POPE FRANCIS SCOLDS PRESIDENT OBAMA,President Barack Obama met for the first time ...,['Pol Pinoy'],2017-03-27 00:00:00,https://adobochronicles.com/2014/03/27/pope-fr...,Adobo Chronicles,Not Credible


<h1>Defining Functions</h1>

In [4]:
def lower_case(text):
    """Return ``text`` with every character converted to lowercase."""
    return text.lower()

In [5]:
def stop_words(text):
    """Remove English stop words from ``text``.

    The text is tokenized with ``word_tokenize``; tokens whose lowercase form
    is in NLTK's English stop-word list are dropped and the remaining tokens
    are joined back together with single spaces.

    Parameters
    ----------
    text : str
        Text to filter.

    Returns
    -------
    str
        The surviving tokens, space-separated.
    """
    # Load the stop-word list once and memoize it on the function object:
    # this function is applied row by row and the list never changes, so
    # rebuilding the set on every call is wasted work.
    english_stop_words = getattr(stop_words, '_english_stop_words', None)
    if english_stop_words is None:
        english_stop_words = frozenset(stopwords.words('english'))
        stop_words._english_stop_words = english_stop_words

    tokens = word_tokenize(text)
    return ' '.join(token for token in tokens if token.lower() not in english_stop_words)

In [6]:
def special_text(text):
    """Strip every character that is neither a word character nor whitespace.

    Letters, digits and underscores are kept, as is all whitespace;
    punctuation and other symbols are deleted without a replacement.
    """
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub('', text)

In [7]:
def preprocess_text(text):
    """Run the basic cleaning pipeline on a single string.

    The steps are applied in this order: lower-casing (``lower_case``),
    stop-word removal (``stop_words``), then removal of non-word,
    non-whitespace characters (``special_text``).
    """
    return special_text(stop_words(lower_case(text)))

In [8]:
def lemmatizer_on_text(data, lemmatizer=None):
    """Lemmatize every word in ``data``.

    Parameters
    ----------
    data : str or iterable of str
        Either a whitespace-separated string or an iterable of word tokens.
    lemmatizer : object, optional
        Any object exposing ``lemmatize(word)``. When omitted, the
        module-level ``lm`` instance is used.

    Returns
    -------
    str or list of str
        For string input, the lemmas joined by single spaces (so callers that
        go on to ``.split()`` the result keep working). For any other
        iterable, a list with one lemma per input token.
    """
    if lemmatizer is None:
        lemmatizer = lm

    if isinstance(data, str):
        # Iterating a str yields single characters, so split into words first.
        return ' '.join(lemmatizer.lemmatize(word) for word in data.split())

    return [lemmatizer.lemmatize(word) for word in data]

In [9]:
def stem_words(text):
    """Split ``text`` on whitespace and return the list of stemmed words.

    Each word is passed through the module-level ``stemmer``; the result is
    a list, not a re-joined string.
    """
    return [stemmer.stem(token) for token in text.split()]

<h1>Cleaning Text</h1>

In [10]:
# Build the 'Clean Text' column by running preprocess_text on each headline.
df['Clean Text'] = df['Headline'].apply(preprocess_text)

In [11]:
# Lemmatize, then stem, every cleaned headline. stem_words returns a list,
# so the column ends up holding one list of stems per row.
df['Clean Text'] = (
    df['Clean Text']
    .apply(lemmatizer_on_text)
    .apply(stem_words)
)

In [12]:
# Keep only the columns used from here on. A non-inplace drop with
# errors='ignore' makes this cell safe to re-run: the original in-place drop
# raised KeyError on a second execution because the columns were already gone.
df = df.drop(columns=['Content', 'Authors', 'Date', 'URL', 'Brand'], errors='ignore')

# Work with the first 9,999 rows only (positions 0..9998).
df = df.iloc[:9999]
df

Unnamed: 0,Headline,Label,Clean Text
0,PH ranks 2nd in Asia-Pacific in deaths due to ...,Credible,"[ph, rank, 2nd, asiapacif, death, due, househo..."
1,"Aguirre, PCSO chief deny plotting to kill ‘Ato...",Credible,"[aguirr, pcso, chief, deni, plot, kill, atong,..."
2,Duterte says charges vs ex-President will fail,Credible,"[dutert, say, charg, vs, expresid, fail]"
3,Group warns BFAR on law enforcement fund,Credible,"[group, warn, bfar, law, enforc, fund]"
4,Solon asks Duterte for jet ski to Panatag,Credible,"[solon, ask, dutert, jet, ski, panatag]"
...,...,...,...
9994,Airlines to hurt from Boracay closure,Credible,"[airlin, hurt, boracay, closur]"
9995,"Mining, oil investors still dumping shares",Credible,"[mine, oil, investor, still, dump, share]"
9996,Stop using plastic – DENR,Credible,"[stop, use, plastic, denr]"
9997,Trillanes in California: Thousands Of FilAms E...,Not Credible,"[trillan, california, thousand, filam, expect,..."


In [13]:
# Split into training and testing partitions, holding out 30% for testing.
# Note: the features are the raw 'Headline' strings, not the 'Clean Text'
# column built earlier; the targets are the 'Label' values.
x, y = df["Headline"], df["Label"]
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=50,  # fixed seed so the split is reproducible
    shuffle=True,
)

# Class counts of each partition, test first.
print(y_test.value_counts())
print(y_train.value_counts())

Label
Credible        1950
Not Credible    1050
Name: count, dtype: int64
Label
Credible        4629
Not Credible    2370
Name: count, dtype: int64


In [14]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
# Feature extraction: bag-of-words counts followed by TF-IDF weighting.
# Both transformers are fitted on the training split only and then reused,
# unchanged, on the test split.
count_vect = CountVectorizer()
tf_transformer = TfidfTransformer()

# Training split: fit and transform in a single call per step.
x_train_counts = count_vect.fit_transform(x_train)
x_train_transformed = tf_transformer.fit_transform(x_train_counts)

# Test split: transform only, using what was fitted on the training data.
x_test_counts = count_vect.transform(x_test)
x_test_transformed = tf_transformer.transform(x_test_counts)

In [16]:
# Support-vector classifier with a linear kernel.
fake_classifier = SVC(kernel='linear')

# Fit on the TF-IDF features of the training split and their labels.
fake_classifier.fit(x_train_transformed, y_train)

In [17]:
# Predict a label for every row of the transformed test features.
test_predictions = fake_classifier.predict(x_test_transformed)

In [18]:
# y_test still carries the row labels it had in df after the split, so give
# it a fresh 0..n-1 index to line up with the positional predictions array.
indexReset = pd.DataFrame(y_test)
print(indexReset.head())
indexReset = indexReset.reset_index(drop=True)
print(indexReset.head())

# Predictions next to the true labels, one row per test sample.
df1 = pd.DataFrame(test_predictions, columns = ['predictions'])
df1 = df1.assign(testingData=indexReset['Label'])
df1

             Label
8152      Credible
3187      Credible
8966      Credible
2855  Not Credible
9139      Credible
          Label
0      Credible
1      Credible
2      Credible
3  Not Credible
4      Credible


Unnamed: 0,predictions,testingData
0,Credible,Credible
1,Credible,Credible
2,Credible,Credible
3,Credible,Not Credible
4,Credible,Credible
...,...,...
2995,Credible,Credible
2996,Credible,Credible
2997,Not Credible,Credible
2998,Credible,Not Credible


In [22]:
# Share of test rows whose predicted label equals the true label
# (normalize=True asks for a fraction rather than a raw count).
print(accuracy_score(df1['testingData'], df1['predictions'], normalize = True))

0.89


In [23]:
# Persist the fitted classifier, count vectorizer and TF-IDF transformer.
# Each file is opened in a `with` block so the handle is flushed and closed
# even if pickle.dump raises; the bare open() calls never closed their files.
# NOTE: only ever unpickle these files from a trusted location.
with open("fake_classifier.pkl", "wb") as pkl_file:
    pickle.dump(fake_classifier, pkl_file)

with open("fake_vectorizer.pkl", "wb") as pkl_file:
    pickle.dump(count_vect, pkl_file)

with open("fake_transformer.pkl", "wb") as pkl_file:
    pickle.dump(tf_transformer, pkl_file)

In [24]:
# Example headline to run through the trained classifier.
sampleGiven = '''New House Bill To Penalize Elected Officials Who Are Out Of Tune With The Filipino People'''

In [25]:
# To classify user-supplied text instead, uncomment the next line.
#sampleGiven = str(input("Enter any article: "))

# Apply the same fitted vectorizer and TF-IDF transformer used for training.
# The intermediate results get their own names so that `sampleGiven` stays a
# string: rebinding it to a sparse matrix made this cell fail on a re-run.
sample_counts = count_vect.transform([sampleGiven])
sample_tfidf = tf_transformer.transform(sample_counts)

predictedSample = fake_classifier.predict(sample_tfidf)
predictedSample


array(['Not Credible'], dtype=object)

In [26]:
# Disabled Colab-only snippet, kept as a string literal so it does not run.
# Its text mounts Google Drive, writes df1 to data.csv and copies that file
# into "My Drive".
'''from google.colab import drive
drive.mount('drive')
df1.to_csv('data.csv')
!cp data.csv "drive/My Drive/"'''

'from google.colab import drive\ndrive.mount(\'drive\')\ndf1.to_csv(\'data.csv\')\n!cp data.csv "drive/My Drive/"'