In [None]:
# Importing some libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# Single Porter stemmer instance, created once and reused by the text-cleaning function below.
porter_stemmer = PorterStemmer()

In [None]:
# Directory holding the two dataset CSVs. It is defined once so the notebook
# can be pointed at another location by editing a single line.
DATA_DIR = '/kaggle/input/fake-and-real-news-dataset'

fake_df = pd.read_csv(f'{DATA_DIR}/Fake.csv')
valid_df = pd.read_csv(f'{DATA_DIR}/True.csv')

In [None]:
# Label each source frame (0 for the fake-news frame, 1 for the true-news
# frame), then stack them into one dataset with a fresh 0..n-1 index.
fake_df['result'] = 0
valid_df['result'] = 1

df = pd.concat([fake_df, valid_df], axis=0, ignore_index=True)
df

### We only need the 'text' and 'result' columns, so we drop the other columns.

In [None]:
# Keep only the article text and its label. The result is assigned instead of
# dropping in place, and errors='ignore' skips columns that are already gone,
# so re-running this cell does not raise a KeyError.
df = df.drop(columns=['title', 'subject', 'date'], errors='ignore')

### The true news articles begin with the agency's name (Reuters) and the location of the story, which will help the model in classification.

In [None]:
# Preview what precedes the first '-' in every article labelled as true news.
true_news_text = df.loc[df['result'] == 1, 'text']
true_news_text.map(lambda article: article.partition('-')[0])

### The next function cleans the text column for the later steps. There are multiple ways to do this; this version lowercases the text, removes special characters, punctuation and stopwords, and then applies stemming.

In [None]:
# Translation table that deletes every character listed in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Cache for the English stop-word set; filled on first use.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, building it only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def clean_mess_s(a):
    """Normalise one document for bag-of-words modelling.

    Steps, in order:
      1. lowercase the text;
      2. replace every non-word character (regex ``\\W``) with a space;
      3. delete any remaining ``string.punctuation`` characters;
      4. split on whitespace and drop English stop words;
      5. Porter-stem each remaining token.

    Args:
        a: The raw text; must be a ``str`` (``.lower()`` is called on it).

    Returns:
        The stemmed tokens joined by single spaces. An input with no
        surviving tokens yields the empty string.
    """
    lowered = a.lower()
    spaced = re.sub("\\W", " ", lowered)  # Special characters become spaces.

    # After the substitution above, the only string.punctuation character that
    # can still be present is '_' (it counts as a word character for \W), so
    # this step effectively deletes underscores without inserting a space.
    no_punct = spaced.translate(_PUNCTUATION_TABLE)

    # Membership is tested against a cached set rather than calling
    # stopwords.words('english') and scanning the returned list for every token.
    stop_words = _english_stop_words()
    kept_tokens = [token for token in no_punct.split() if token not in stop_words]

    # Stemming reduces inflected words to a common root form.
    return ' '.join(porter_stemmer.stem(token) for token in kept_tokens)

In [None]:
# Run clean_mess_s over every article in the text column (this may take a
# while on the full dataset), then preview the first rows.
df['text'] = df['text'].map(clean_mess_s)
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['result'],
    test_size=0.3,
    random_state=101,
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Token-count vectorizer with default settings; it is fitted in a later cell.
cv = CountVectorizer()

### CountVectorizer is used to transform a corpus of text into vectors of token counts.

In [None]:
# Fit the vectorizer on the training text only, then encode both splits with it.
# Note: X_train and X_test are rebound from raw text to the vectorizer's
# output, so re-running this cell alone would feed that output back into cv.
X_train, X_test = cv.fit_transform(X_train), cv.transform(X_test)

#### We do not fit the count vectorizer on the test set; we only transform it.

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier with an explicit cap of 10,000 solver iterations.
lr = LogisticRegression(max_iter=10_000)

In [None]:
# Train the classifier on the vectorized training split.
lr.fit(X=X_train, y=y_train)

In [None]:
# Predicted labels for the held-out test split.
predictions = lr.predict(X=X_test)

In [None]:
# Score the fitted classifier on the held-out test split.
lr.score(X=X_test, y=y_test)