## Import the usual suspects 

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import nltk
import os 
import io 
from textblob import TextBlob
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [None]:
# Print the full path of every file available under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Load the datasets 

In [None]:
# Load the fake-news articles.
# The original read True.csv here as well, which made `fake` a copy of `true`.
fake = pd.read_csv('/kaggle/input/fake-and-real-news-dataset/Fake.csv')
# Load the true-news articles.
true = pd.read_csv('/kaggle/input/fake-and-real-news-dataset/True.csv')

## Show some info

In [None]:
# Preview the first rows of the frame loaded as `fake`.
fake.head()

In [None]:
# Print a column-by-column summary of the `fake` frame.
fake.info()

In [None]:
# (rows, columns) of the `fake` frame.
fake.shape

In [None]:
# Summary statistics for the `fake` frame.
fake.describe()

In [None]:
# Preview the first rows of the true-news frame.
true.head()

In [None]:
# Print a column-by-column summary of the `true` frame.
true.info()

In [None]:
# Summary statistics for the `true` frame.
true.describe()

In [None]:
# (rows, columns) of the `true` frame.
true.shape

## Data Analysis

In [None]:
# Tag every row of each frame with its class name in a new 'label' column.
for frame, tag in ((fake, 'fake'), (true, 'true')):
    frame['label'] = tag

In [None]:
# Check that the 'label' column was added to `fake`.
fake.head()

In [None]:
# Check that the 'label' column was added to `true`.
true.head()

In [None]:
# Concatenate the two datasets, then shuffle all rows.
# DataFrame.sample returns a new frame, so the result has to be assigned back;
# the original discarded it and `news` stayed unshuffled.
# reset_index gives every row a unique label: a plain concat keeps each source
# frame's own 0..n-1 labels, so a label such as 0 would match two rows.
# random_state makes the shuffle reproducible (same seed as the train/test
# split further down).
news = (
    pd.concat([true, fake])
    .sample(frac=1, random_state=101)  # frac=1 -> shuffle 100% of the rows
    .reset_index(drop=True)
)
news


In [None]:
# Number of rows for each label value.
news.groupby('label').size()

### Data Preprocessing 

In [None]:
# Helper that converts a list to a string.
def listostring(lst):
    """Join the items of *lst* into one space-separated string.

    Each item is converted with ``str`` first, so non-string items are
    accepted. An empty input yields an empty string.
    """
    return ' '.join(map(str, lst))
    
# Function that preprocesses the text of a news article.

def text_tokenizer(txt):
    """Return *txt* with its English stop words removed.

    The text is split into words with ``TextBlob(txt).words``. Every word
    whose lower-cased form is in NLTK's English stop-word list is dropped,
    and the remaining words are joined back with single spaces.

    Args:
        txt: The raw text to clean.

    Returns:
        One string holding the kept words, separated by single spaces.

    NOTE(review): the result is a string, not a list of tokens. Code that
    needs word tokens must split it.
    """
    # Build the stop-word set once per call. The original re-read the NLTK
    # list and scanned it linearly for every single word.
    stop_words = set(stopwords.words('english'))

    kept = [
        str(word)
        for word in TextBlob(txt).words
        if word.lower() not in stop_words
    ]
    return ' '.join(kept)

In [None]:
# Add a column holding the preprocessed article text.
# The cleaned text is derived from the raw 'text' column. 'clean_text' does not
# exist before this assignment, so reading it here raised a KeyError.
news['clean_text'] = news['text'].apply(text_tokenizer)

In [None]:
# Preview the combined frame.
news.head()

In [None]:
# Print the cleaned text of the first row.
# Use positional access: `news` is built with pd.concat without resetting the
# index, so the label 0 can match more than one row.
print(news['clean_text'].iloc[0])

In [None]:
# Create the bag-of-words count matrix.
# A callable analyzer must return an iterable of tokens. text_tokenizer returns
# one string, which CountVectorizer would iterate character by character, so
# the cleaned string is split back into words here.
bow_transformer = CountVectorizer(
    analyzer=lambda doc: text_tokenizer(doc).split()
).fit_transform(news['clean_text'])

In [None]:
# Show the shape of the sparse bag-of-words matrix.
matrix_shape = bow_transformer.shape
print('Shape of Sparse Matrix: ', matrix_shape)

In [None]:
# Re-weight the bag-of-words counts as TF-IDF scores and report the shape.
tfidf_transformer = TfidfTransformer().fit(bow_transformer)
tfidf = tfidf_transformer.transform(bow_transformer)
print(tfidf.shape)

In [None]:
# Train the model: a 100-tree random forest fit on the TF-IDF matrix of the
# whole `news` frame (no rows are held out at this point).
rfc= RandomForestClassifier(n_estimators=100)
rfc.fit(tfidf, news['label'])


In [None]:
# Predict labels for the same TF-IDF rows the forest was fit on, so these are
# training-set predictions rather than held-out ones.
predictions= rfc.predict(tfidf)

In [None]:
# Display the predicted labels.
print(predictions)

In [None]:
# Report classification metrics and accuracy for `predictions` against the
# labels of the full `news` frame.
label = news['label']

report = classification_report(label, predictions)
accuracy = accuracy_score(label, predictions)

print("Metrics Report \n :", report)
print('\n\n\n')  # same four blank-line characters as two print('\n') calls
print("Accuracy Score :", accuracy)

## Train-Test Split

In [None]:
# Features are the raw article text; targets are the fake/true labels.
X, y = news['text'], news['label']

# Hold out 30% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)


## Build the pipeline

In [None]:
# Now let's try the easy way and build our pipeline,
# this time with a random forest classifier.
#
# A callable analyzer must return an iterable of tokens. text_tokenizer returns
# one string, which CountVectorizer would iterate character by character, so
# the cleaned string is split back into words here.

pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=lambda doc: text_tokenizer(doc).split())),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', RandomForestClassifier(n_estimators=600)),  # train on TF-IDF vectors w/ random forest classifier
])

In [None]:
# Train the pipeline on the training split.
pipeline.fit(X_train,y_train)


In [None]:
# Predict labels for the held-out test split.
preds= pipeline.predict(X_test)

In [None]:
# Print the first ten predicted labels.
first_ten = list(preds[:10])
print(first_ten)

## Model Evaluation


In [None]:
# Report classification metrics and accuracy on the held-out test split.
test_report = classification_report(y_test, preds)
test_accuracy = accuracy_score(y_test, preds)

print("Metrics Report \n :", test_report)
print('\n\n\n')  # same four blank-line characters as two print('\n') calls
print("Accuracy Score :", test_accuracy)

## Submission 

In [None]:
# Write the test-split predictions to submission.csv, one row per article,
# identified by the article's row label in X_test.
submission_columns = {
    'news_Id': X_test.index,
    'Label': preds,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)
print(" Submission  successfully saved!")

In [None]:
# Preview the first 20 rows of the submission frame.
submission.head(20)