In [24]:
import pandas as pd
import pickle
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
import string
import nltk # Imports the library
import string
from nltk.corpus import stopwords
import sqlite3 as sql


In [2]:
def text_process(mess):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text
    """
    # Check characters to see if they are in punctuation
    nopunc = [char for char in mess if char not in string.punctuation]

    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)

    # Now just remove any stopwords
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

In [33]:
df = pd.read_csv('sample.csv')

In [4]:
knn_model = pickle.load(open('finalized_model-knn.sav', 'rb'))
rn_model = pickle.load(open('finalized_model-rn.sav','rb'))
bayes_model = pickle.load(open('finalized_model.sav','rb'))

In [5]:
from sklearn.model_selection import train_test_split

news_train, news_test, type_train, type_test = train_test_split(df['content'], df['type'], test_size=0.2)

print(len(news_train), len(news_test), len(type_train) + len(type_test))

157519 39380 196899


In [6]:
df.drop(['Unnamed: 0','index','sentiment','magnitude','title', 'authors'],axis = 1, inplace = True)

In [7]:
df.head()

Unnamed: 0,domain,type,content
0,wikileaks.com,unreliable,Tor\n\nTor is an encrypted anonymising network...
1,www.yahoo.com,reliable,NEW YORK (AP) — Ringling Bros. and Barnum & Ba...
2,www.huffingtonpost.com,reliable,The Importance Of Being Kind 11/15/2016 09:44 ...
3,thedailysheeple.com,conspiracy,Delivered by The Daily Sheeple\n\nWe encourage...
4,beforeitsnews.com,fake,What Most Good Investors Do\n\n% of readers th...


In [8]:
df['type'] = df['type'].map({'fake': 0, 'reliable': 1, 'unreliable': 0, 'conspiracy':0})

In [9]:
df.groupby('type').size()

type
0    101352
1     95547
dtype: int64

In [56]:
df = df.sample(frac = 0.5)

In [10]:
df.count

<bound method DataFrame.count of                         domain  type  \
0                wikileaks.com     0   
1                www.yahoo.com     1   
2       www.huffingtonpost.com     1   
3          thedailysheeple.com     0   
4            beforeitsnews.com     0   
...                        ...   ...   
196894             nytimes.com     1   
196895        pamelageller.com     0   
196896             nytimes.com     1   
196897             nytimes.com     1   
196898             nytimes.com     1   

                                                  content  
0       Tor\n\nTor is an encrypted anonymising network...  
1       NEW YORK (AP) — Ringling Bros. and Barnum & Ba...  
2       The Importance Of Being Kind 11/15/2016 09:44 ...  
3       Delivered by The Daily Sheeple\n\nWe encourage...  
4       What Most Good Investors Do\n\n% of readers th...  
...                                                   ...  
196894  The travelers who went to La Guardia Airport y...  
196895

In [11]:
predict_bayes = bayes_model.predict(df['content'])

In [12]:
predict_knn = knn_model.predict(df['content'])

In [13]:
predict_rn = rn_model.predict(df['content'])

In [20]:
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix

### Score Naiv Bayes

In [21]:
print(classification_report(predict_bayes,df['type']))
print(confusion_matrix(df['type'], predict_bayes))


              precision    recall  f1-score   support

           0       0.86      0.98      0.92     89302
           1       0.98      0.87      0.92    107597

    accuracy                           0.92    196899
   macro avg       0.92      0.93      0.92    196899
weighted avg       0.93      0.92      0.92    196899

[[87633 13719]
 [ 1669 93878]]


### Scor KNN

In [22]:
print(classification_report(predict_knn,df['type']))
print(confusion_matrix(df['type'], predict_knn))

              precision    recall  f1-score   support

           0       0.91      0.93      0.92     98156
           1       0.93      0.90      0.92     98743

    accuracy                           0.92    196899
   macro avg       0.92      0.92      0.92    196899
weighted avg       0.92      0.92      0.92    196899

[[91753  9599]
 [ 6403 89144]]


### Scor Retele Neuronale

In [23]:
print(classification_report(predict_rn,df['type']))
print(confusion_matrix(df['type'], predict_rn))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99    101398
           1       0.99      0.99      0.99     95501

    accuracy                           0.99    196899
   macro avg       0.99      0.99      0.99    196899
weighted avg       0.99      0.99      0.99    196899

[[100801    551]
 [   597  94950]]
