<a href="https://colab.research.google.com/github/rishabhbhardwaj-rb/FakeNewsDetection/blob/main/Notebooks/FeatureExtraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive so the /content/drive/MyDrive/... paths used by the
# cells in this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install vaderSentiment
import numpy as np
import pandas as pd
import spacy
from collections import Counter
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 KB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [None]:
# Read the cleaned dataset from the mounted Drive into `dataset`, the frame
# that the feature-extraction cells of this notebook are called with.
File = '/content/drive/MyDrive/FNDS/Dataset/CleanedDataset/Dataset_Clean.zip'
dataset = pd.read_csv(File, encoding='utf-8')

In [None]:

class NERFeatureExtraction:
    """Per-document named-entity label counts.

    The extraction runs eagerly inside ``__init__``; call ``GetDataFrame()``
    to retrieve the result: one row per input row and one column per label in
    ``m_NER_features``. The result carries a fresh 0..n-1 index, not the index
    of the input frame.
    """

    def __init__(self, df, textColumnName, nlp=None):
        """Count entity labels for every value of ``df[textColumnName]``.

        Args:
            df: DataFrame holding one document per row.
            textColumnName: Name of the column in ``df`` that contains the text.
            nlp: Optional callable mapping a string to an object that exposes
                ``.ents`` (each entity having ``.label_``), e.g. an already
                loaded spaCy pipeline shared between extractors. When omitted,
                ``spacy.load("en_core_web_sm")`` is used.

        Raises:
            KeyError: If ``df`` has rows but no ``textColumnName`` column.
        """
        print("NER Feature Extraction Starts")
        self.m_new_df = pd.DataFrame()
        self.m_df = df
        self.m_textColumnName = textColumnName
        # Loading the model is expensive; reuse the caller's pipeline if given.
        self.m_NER = spacy.load("en_core_web_sm") if nlp is None else nlp
        self.m_NER_features = ["PERSON","ORG","FAC","GPE","NORP","LOC","PRODUCT","EVENT","WORK_OF_ART","LAW","LANGUAGE",
                            "DATE","TIME","PERCENT","MONEY","CARDINAL","QUANTITY","ORDINAL"]
        self.NER()
        print("NER Feature Extraction Done\n")

    def GetDataFrame(self):
        """Return the DataFrame of entity-label counts built by ``NER()``."""
        return self.m_new_df

    def NER(self):
        """Populate ``m_new_df`` with one row of label counts per input row.

        Labels that are not listed in ``m_NER_features`` are ignored. Missing
        text cells (None/NaN) are processed as the empty string instead of
        being handed to the pipeline as a non-string value.
        """
        # Only the text column is needed, so iterate it directly rather than
        # materialising every row with iterrows(). The length guard keeps a
        # row-less frame from being asked for a column it may not have.
        texts = self.m_df[self.m_textColumnName] if len(self.m_df) else []
        rows = []
        for text in texts:
            doc = self.m_NER("" if pd.isna(text) else text)
            counts = dict.fromkeys(self.m_NER_features, 0)
            for label, occurrences in Counter(ent.label_ for ent in doc.ents).items():
                if label in counts:
                    counts[label] = occurrences
            rows.append(counts)
        # Passing `columns` fixes the schema and its order even when `rows`
        # is empty, so downstream code always sees the same feature columns.
        self.m_new_df = pd.DataFrame(rows, columns=self.m_NER_features)

In [None]:
# Extract NER counts from the 'content' column and persist them without an
# index column.
# NOTE(review): the other feature files in this notebook are written under
# .../Dataset/FeatureExtraction/, while this one goes to .../Dataset/ - confirm
# which location downstream readers expect before changing the path.
dataset_NER = NERFeatureExtraction(dataset,'content').GetDataFrame()
dataset_NER.to_csv('/content/drive/MyDrive/FNDS/Dataset/dataset_NER.zip', encoding='utf-8', index = False)
# Preview only the first rows, like the other feature cells, instead of
# rendering the whole frame.
dataset_NER.head()

NER Feature Extraction Starts
NER Feature Extraction Done



Unnamed: 0,PERSON,ORG,FAC,GPE,NORP,LOC,PRODUCT,EVENT,WORK_OF_ART,LAW,LANGUAGE,DATE,TIME,PERCENT,MONEY,CARDINAL,QUANTITY,ORDINAL
0,2,13,2,2,2,0,1,0,0,0,0,5,0,0,0,4,0,0
1,17,8,0,0,0,0,1,0,1,0,0,3,0,1,0,0,0,0
2,10,9,0,10,2,0,1,0,2,0,0,18,0,5,12,3,0,4
3,2,27,0,4,3,0,2,0,0,0,0,6,0,0,0,4,0,0
4,6,10,0,4,8,0,0,0,1,1,0,6,0,0,4,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19100,39,16,0,5,2,0,0,0,1,0,0,53,0,29,1,18,1,8
19101,45,46,0,6,2,0,0,0,2,0,0,12,2,0,5,3,0,1
19102,6,12,0,5,2,1,0,0,3,0,0,7,0,8,8,1,0,1
19103,11,8,0,4,3,1,0,0,4,0,0,4,0,4,4,0,0,2


In [None]:
class POSTagFeatureExtraction:
    """Per-document part-of-speech tag counts.

    The extraction runs eagerly inside ``__init__``; call ``GetDataFrame()``
    to retrieve the result: one row per input row and one column per tag in
    ``m_POS_features``. The result carries a fresh 0..n-1 index, not the index
    of the input frame.
    """

    def __init__(self, df, textColumnName, nlp=None):
        """Count POS tags for every value of ``df[textColumnName]``.

        Args:
            df: DataFrame holding one document per row.
            textColumnName: Name of the column in ``df`` that contains the text.
            nlp: Optional callable mapping a string to an iterable of tokens
                that expose ``.pos_``, e.g. an already loaded spaCy pipeline
                shared between extractors. When omitted,
                ``spacy.load("en_core_web_sm")`` is used.

        Raises:
            KeyError: If ``df`` has rows but no ``textColumnName`` column.
        """
        print("POS Tag Feature Extraction Starts")
        self.m_new_df = pd.DataFrame()
        self.m_df = df
        self.m_textColumnName = textColumnName
        # Loading the model is expensive; reuse the caller's pipeline if given.
        self.m_POS = spacy.load("en_core_web_sm") if nlp is None else nlp
        self.m_POS_features = [ "ADJ","ADP","ADV","AUX","CCONJ","DET","INTJ","NOUN","NUM","PART",
                            "PRON","X","PROPN","PUNCT","SCONJ","SYM","VERB","SPACE","CONJ"]
        self.POS()
        print("POS Tag Feature Extraction Done\n")

    def GetDataFrame(self):
        """Return the DataFrame of POS-tag counts built by ``POS()``."""
        return self.m_new_df

    def POS(self):
        """Populate ``m_new_df`` with one row of tag counts per input row.

        Tags that are not listed in ``m_POS_features`` are ignored. Missing
        text cells (None/NaN) are processed as the empty string instead of
        being handed to the pipeline as a non-string value.
        """
        # Only the text column is needed, so iterate it directly rather than
        # materialising every row with iterrows(). The length guard keeps a
        # row-less frame from being asked for a column it may not have.
        texts = self.m_df[self.m_textColumnName] if len(self.m_df) else []
        rows = []
        for text in texts:
            doc = self.m_POS("" if pd.isna(text) else text)
            counts = dict.fromkeys(self.m_POS_features, 0)
            for tag, occurrences in Counter(token.pos_ for token in doc).items():
                if tag in counts:
                    counts[tag] = occurrences
            rows.append(counts)
        # Passing `columns` fixes the schema and its order even when `rows`
        # is empty, so downstream code always sees the same feature columns.
        self.m_new_df = pd.DataFrame(rows, columns=self.m_POS_features)


In [None]:
# Extract POS-tag counts from the 'content' column, persist them without an
# index column, and preview the first rows.
dataset_POSTag =  POSTagFeatureExtraction(dataset, 'content').GetDataFrame()
dataset_POSTag.to_csv('/content/drive/MyDrive/FNDS/Dataset/FeatureExtraction/dataset_POSTag.zip', encoding='utf-8', index = False)
dataset_POSTag.head()

POS Tag Feature Extraction Starts
POS Tag Feature Extraction Done



Unnamed: 0,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,NUM,PART,PRON,X,PROPN,PUNCT,SCONJ,SYM,VERB,SPACE,CONJ
0,21,58,10,26,19,48,2,143,7,27,23,0,37,72,10,0,87,22,0
1,15,31,10,14,6,38,0,68,4,12,24,0,41,48,10,0,42,15,0
2,58,73,20,26,14,66,0,154,37,16,35,0,54,92,16,15,74,17,0
3,49,87,19,46,20,75,0,175,7,27,49,2,56,105,17,0,107,29,0
4,63,83,31,34,35,74,0,189,15,24,35,0,61,102,22,4,105,23,0


In [None]:

class DependencyFeatureExtraction:
    """Per-document dependency-relation label counts.

    The extraction runs eagerly inside ``__init__``; call ``GetDataFrame()``
    to retrieve the result: one row per input row and one column per label in
    ``m_Dep_features``. The result carries a fresh 0..n-1 index, not the index
    of the input frame.
    """

    def __init__(self, df, textColumnName, nlp=None):
        """Count dependency labels for every value of ``df[textColumnName]``.

        Args:
            df: DataFrame holding one document per row.
            textColumnName: Name of the column in ``df`` that contains the text.
            nlp: Optional pipeline object. It must be callable on a string,
                returning an iterable of tokens that expose ``.dep_``, and
                must provide ``pipe_labels['parser']``, from which the feature
                columns are taken. When omitted,
                ``spacy.load("en_core_web_sm")`` is used.

        Raises:
            KeyError: If ``df`` has rows but no ``textColumnName`` column.
        """
        print("Dependency Feature Extraction Starts")
        self.m_new_df = pd.DataFrame()
        self.m_df = df
        self.m_textColumnName = textColumnName
        # Loading the model is expensive; reuse the caller's pipeline if given.
        self.m_Dep = spacy.load("en_core_web_sm") if nlp is None else nlp
        self.m_Dep_features = self.m_Dep.pipe_labels['parser']
        self.Dependency()
        print("Dependency Feature Extraction Done\n")

    def GetDataFrame(self):
        """Return the DataFrame of dependency counts built by ``Dependency()``."""
        return self.m_new_df

    def Dependency(self):
        """Populate ``m_new_df`` with one row of label counts per input row.

        Labels that are not listed in ``m_Dep_features`` are ignored. Missing
        text cells (None/NaN) are processed as the empty string instead of
        being handed to the pipeline as a non-string value.
        """
        feature_names = list(self.m_Dep_features)
        # Only the text column is needed, so iterate it directly rather than
        # materialising every row with iterrows(). The length guard keeps a
        # row-less frame from being asked for a column it may not have.
        texts = self.m_df[self.m_textColumnName] if len(self.m_df) else []
        rows = []
        for text in texts:
            doc = self.m_Dep("" if pd.isna(text) else text)
            counts = dict.fromkeys(feature_names, 0)
            for label, occurrences in Counter(token.dep_ for token in doc).items():
                if label in counts:
                    counts[label] += occurrences
            rows.append(counts)
        # Passing `columns` fixes the schema and its order even when `rows`
        # is empty, so downstream code always sees the same feature columns.
        self.m_new_df = pd.DataFrame(rows, columns=feature_names)

In [None]:
# Extract dependency-label counts from the 'content' column, persist them
# without an index column, and preview the first rows.
dataset_Dependency = DependencyFeatureExtraction(dataset, 'content').GetDataFrame()
dataset_Dependency.to_csv('/content/drive/MyDrive/FNDS/Dataset/FeatureExtraction/dataset_Dependency.zip', encoding='utf-8', index = False)
dataset_Dependency.head()

Dependency Feature Extraction Starts
Dependency Feature Extraction Done



Unnamed: 0,ROOT,acl,acomp,advcl,advmod,agent,amod,appos,attr,aux,...,pobj,poss,preconj,predet,prep,prt,punct,quantmod,relcl,xcomp
0,22,9,0,9,9,1,25,3,4,36,...,48,8,0,1,57,0,72,1,8,7
1,19,5,3,4,11,0,11,2,4,11,...,27,9,0,0,31,1,49,0,5,3
2,32,5,4,9,23,1,48,4,7,19,...,70,20,0,0,81,3,97,8,6,4
3,36,4,7,13,19,4,42,4,7,42,...,81,7,0,0,83,2,106,1,13,12
4,33,13,5,13,44,2,47,2,4,32,...,69,6,1,0,72,8,101,4,5,7


In [None]:
class SentimentFeatureExtraction:
    """Per-document sentiment scores with the ``compound`` score removed.

    The extraction runs eagerly inside ``__init__``; call ``GetDataFrame()``
    to retrieve the result, one row per input row. The result carries a fresh
    0..n-1 index, not the index of the input frame.
    """

    def __init__(self, uncleaned, textColumnName, analyzer=None):
        """Score every value of ``uncleaned[textColumnName]``.

        Args:
            uncleaned: DataFrame holding one document per row.
            textColumnName: Name of the column in ``uncleaned`` with the text.
            analyzer: Optional object exposing ``polarity_scores(text)`` that
                returns a dict of scores including a ``'compound'`` key. When
                omitted, a new ``SentimentIntensityAnalyzer()`` is created.

        Raises:
            KeyError: If ``uncleaned`` has rows but no ``textColumnName``
                column, or if the analyzer's scores lack a ``'compound'`` key.
        """
        print("Sentiment Feature Extraction Starts")
        self.m_analyzer = SentimentIntensityAnalyzer() if analyzer is None else analyzer
        self.m_new_df = pd.DataFrame()
        self.m_uncleaned = uncleaned
        self.m_textColumnName = textColumnName
        self.Sentiment()
        print("Sentiment Feature Extraction Done\n")

    def GetDataFrame(self):
        """Return the DataFrame of sentiment scores built by ``Sentiment()``."""
        return self.m_new_df

    def Sentiment(self):
        """Populate ``m_new_df`` with the scores of each row, minus ``compound``.

        Missing text cells (None/NaN) are scored as the empty string instead
        of being handed to the analyzer as a non-string value.
        """
        # Only the text column is needed, so iterate it directly rather than
        # materialising every row with iterrows(). The length guard keeps a
        # row-less frame from being asked for a column it may not have.
        texts = self.m_uncleaned[self.m_textColumnName] if len(self.m_uncleaned) else []
        scores = [
            self.m_analyzer.polarity_scores("" if pd.isna(text) else text)
            for text in texts
        ]
        if not scores:
            # With no rows there is no 'compound' column to drop (the drop
            # would raise KeyError); return an empty frame that still has the
            # score columns produced for non-empty input.
            self.m_new_df = pd.DataFrame(columns=["neg", "neu", "pos"])
            return
        self.m_new_df = pd.DataFrame(scores).drop(columns=["compound"])

In [None]:
# Extract sentiment scores from the 'content' column, persist them without an
# index column, and preview the first rows.
dataset_Sentiment = SentimentFeatureExtraction(dataset, 'content').GetDataFrame()
dataset_Sentiment.to_csv('/content/drive/MyDrive/FNDS/Dataset/FeatureExtraction/dataset_Sentiment.zip', encoding='utf-8', index = False)
dataset_Sentiment.head()

Sentiment Feature Extraction Starts
Sentiment Feature Extraction Done



Unnamed: 0,neg,neu,pos
0,0.096,0.793,0.111
1,0.031,0.952,0.017
2,0.061,0.894,0.046
3,0.049,0.901,0.049
4,0.06,0.829,0.111
