## Feature Extraction :
Feature extraction is the mapping from textual data to real-valued vectors.

## TF-IDF :
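TF-IDF (Term Frequency–Inverse Document Frequency) counts the number of times each word appears in a document (term frequency) and scales that count down for words that appear in many documents (inverse document frequency), so that common, uninformative words receive lower weights.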

In [1]:
#importing dependencies

import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
import nltk

# Fetch the NLTK 'stopwords' corpus that the next cell reads from.
nltk.download(info_or_id='stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Show the list of English stop words shipped with NLTK
english_stopwords = stopwords.words('english')
print(english_stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

### Data Pre-processing

In [5]:
# Read the FakeNewsNet CSV file into a pandas DataFrame
dataset_path = '/content/FakeNewsNet.csv'
news_df = pd.read_csv(dataset_path)

In [6]:
# Preview the first five rows of the dataset
news_df.head(n=5)

Unnamed: 0,title,news_url,source_domain,tweet_num,real
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42,1
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0,1
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63,1
3,Colombian singer Maluma sparks rumours of inap...,https://www.dailymail.co.uk/news/article-33655...,www.dailymail.co.uk,20,1
4,Gossip Girl 10 Years Later: How Upper East Sid...,https://www.zerchoo.com/entertainment/gossip-g...,www.zerchoo.com,38,1


In [7]:
# Distinct values taken by the target column
label_column = news_df['real']
label_column.unique()

array([1, 0])

In [8]:
# Number of rows for each value of the target column
label_column = news_df['real']
label_column.value_counts()

real
1    17441
0     5755
Name: count, dtype: int64

In [9]:
# Tally the missing values in each column
missing_per_column = news_df.isna().sum()
missing_per_column

title              0
news_url         330
source_domain    330
tweet_num          0
real               0
dtype: int64

In [10]:
# Substitute an empty string for every missing value in the frame
news_df = news_df.fillna(value='')

In [11]:
# Re-count missing values per column after the fill
news_df.isna().sum()

title            0
news_url         0
source_domain    0
tweet_num        0
real             0
dtype: int64

In [14]:
# Separating 'title' as the input feature and 'real' as the target label
X, Y = news_df['title'].values, news_df['real'].values

In [15]:
# Show the titles array followed by the labels array, one per line
print(X, Y, sep='\n')

["Kandi Burruss Explodes Over Rape Accusation on 'Real Housewives of Atlanta' Reunion (Video)"
 "People's Choice Awards 2018: The best red carpet looks"
 "Sophia Bush Sends Sweet Birthday Message to 'One Tree Hill' Co-Star Hilarie Burton: 'Breyton 4eva'"
 ...
 "Jessica Chastain Recalls the Moment Her Mother's Boyfriend Slapped Her: 'I Just Kicked Him in the Genitals'"
 'Tristan Thompson Feels "Dumped" After Khloé Kardashian Refuses To Let Him Move Into LA Home (EXCLUSIVE)'
 "Kelly Clarkson Performs a Medley of Kendrick Lamar's 'Humble' & More Hits at the Billboard Music Awards"]
[1 1 1 ... 1 0 1]


## TF-IDF

In [16]:
# Create a TF-IDF vectorizer with scikit-learn's default settings
vectorizer = TfidfVectorizer()

In [17]:
# Learn the vocabulary and IDF weights from the raw article titles.
# The titles are read from news_df rather than X so this cell stays
# re-runnable after X has been overwritten with the TF-IDF matrix.
vectorizer.fit(news_df['title'].values)

In [18]:
# Convert the raw article titles into a sparse TF-IDF feature matrix.
# The input is taken from news_df rather than X so that re-running this
# cell does not feed the already-transformed matrix back into the vectorizer.
X = vectorizer.transform(news_df['title'].values)

In [19]:
# Print the TF-IDF matrix; each output line is "(row, column)  weight"
# for one stored (non-zero) entry.
print(X)

  (0, 16757)	0.20533481830044967
  (0, 13077)	0.24378320493110486
  (0, 12603)	0.2210529986661146
  (0, 12530)	0.29708096371906023
  (0, 11180)	0.19245516644153107
  (0, 11029)	0.12856749228306702
  (0, 10952)	0.12272050701073844
  (0, 8517)	0.36985779237690874
  (0, 7637)	0.2493559744258593
  (0, 5626)	0.35965002319786576
  (0, 2489)	0.36985779237690874
  (0, 1239)	0.2933859155672956
  (0, 482)	0.37611901091988237
  (1, 15720)	0.15817067297683696
  (1, 12710)	0.3551589910005189
  (1, 11541)	0.37358923522739085
  (1, 9349)	0.38228006011759985
  (1, 3089)	0.3990107139843319
  (1, 2730)	0.36563354202380327
  (1, 1784)	0.3189611292111923
  (1, 1340)	0.2964421563660028
  (1, 139)	0.2839464339175301
  (2, 16152)	0.2829303783215455
  (2, 15913)	0.08853206974215261
  (2, 15374)	0.2136068193926717
  :	:
  (23194, 8847)	0.26565043276406336
  (23194, 8661)	0.26345827012189293
  (23194, 8529)	0.1623532929199754
  (23194, 8083)	0.2413210521142891
  (23194, 7520)	0.24413604009638964
  (23194, 7448)