In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer

In [2]:
df_fake = pd.read_csv('/kaggle/input/fake-and-real-news-dataset/Fake.csv')
df_real = pd.read_csv('/kaggle/input/fake-and-real-news-dataset/True.csv')

In [3]:
df_fake['label'] = 0
df_real['label'] = 1

In [4]:
df = pd.concat([df_fake,df_real])
df = df.sample(frac=1).reset_index(drop=True)

In [5]:
df.sample(5)

Unnamed: 0,title,text,subject,date,label
10745,No Ex-President In 100 Yrs Has Set Up A Shadow...,FOX News Catherine Herridge exposes the dirty...,left-news,"Mar 13, 2017",0
38420,PRESIDENT AND FIRST LADY Attend Military Famil...,President Trump and First Lady Melania Trump a...,politics,"Jul 4, 2017",0
8109,CNN Just Accidentally Aired Girl Flipping Off...,After Donald Trump won the Indiana primary on ...,News,"May 4, 2016",0
31984,NEWSFLASH FOR OUR IMPERIAL PRESIDENT: STATES C...,As Barack Hussein Obama tours around the count...,left-news,"Jul 23, 2015",0
12617,CNN CALLS IT: A Democrat Will Represent Alaba...,Alabama is a notoriously deep red state. It s ...,News,"December 12, 2017",0


In [6]:
df['date'] = pd.to_datetime(df['date'],errors='coerce')

In [7]:
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year
df['day_of_week'] = df['date'].dt.dayofweek

In [8]:
df['was_date_missing'] = df['date'].isna().astype(int)


In [9]:
df.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,title,text,subject,date,label,month,year,day_of_week,was_date_missing
0,OFF-DUTY POLICE OFFICER SHOOTS AND KILLS Black...,This is truly a sad and senseless situation. T...,politics,2016-07-10,0,7.0,2016.0,6.0,0
1,Senate Democrats seek Trump tax returns,WASHINGTON (Reuters) - U.S. Senate Democrats u...,politicsNews,NaT,1,,,,1
2,MAXINE WATERS: ‘These people trying to ‘discre...,MAXINE GOT A MAKEOVER and is hopping mad about...,politics,2017-07-22,0,7.0,2017.0,5.0,0
3,MIGRANTS BRUTALLY GANG RAPE 3 YR OLD BOY At As...,The world is stunned by the number of women an...,Government News,2016-01-13,0,1.0,2016.0,2.0,0
4,Sore Winner: Trump Attacks Clinton With Putin...,If Donald Trump wanted people to think he isn ...,News,NaT,0,,,,1


In [10]:
df=df.drop(['date'],axis=1)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(['label'],axis=1),
                                                   df['label'],
                                                   random_state=2,
                                                   test_size=0.2)

In [12]:
ohe = OneHotEncoder(handle_unknown='ignore')
vect = CountVectorizer(stop_words='english')
std = StandardScaler()
imputer = SimpleImputer(strategy='most_frequent')

In [13]:
num_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # or 'median'
    ('scaler', StandardScaler())
])

In [14]:
transformer = ColumnTransformer(transformers=[
    ('ohe',ohe, ['subject']),
    ('vectorization',vect,'title'),
    ('vectorization2',vect,'text'),
    ('num', num_pipe, ['month','year','day_of_week'])
],remainder='passthrough')

In [15]:
X_train_trf = transformer.fit_transform(X_train)
X_test_trf = transformer.transform(X_test)

In [16]:
dtc = DecisionTreeClassifier(criterion='gini',splitter='random',max_depth=10)

In [17]:
dtc.fit(X_train_trf, y_train)

In [18]:
y_pred = dtc.predict(X_test_trf)

In [19]:
from sklearn.metrics import recall_score, confusion_matrix

In [20]:
recall_score(y_test,y_pred)

1.0

In [21]:
confusion_matrix(y_test, y_pred)

array([[4703,    1],
       [   0, 4276]])

In [22]:
new_data = pd.DataFrame({
    'subject': ['politicsNews', 'worldNews'],
    'title': [
        'President addresses new reforms in conference',
        'New alien species discovered in the Arctic'
    ],
    'text': [
        'The government introduced new tax reforms and policies in today\'s speech.',
        'Scientists have confirmed sightings of strange creatures in the cold tundra.'
    ],
    'month': [6, 11],
    'year': [2024, 2023],
    'day_of_week': [2, 6]
})

In [23]:
new_data['was_date_missing'] = [0, 0] 


In [24]:
new_data_trf = transformer.transform(new_data)

In [25]:
prediction = dtc.predict(new_data_trf)
print(prediction)


[0 0]


In [26]:
probs = dtc.predict_proba(new_data_trf)
print(probs)


[[1. 0.]
 [1. 0.]]
