# Problem Statement

1. We are given a dataset consisting of two CSV files: train_bodies.csv, which contains the bodies of the news articles, and train_stances.csv, which contains the headlines associated with each of these bodies, identified by body id.

2. After training on these samples, we need to detect whether a given headline agrees with, disagrees with, discusses, or is unrelated to the article body with the given body id.


In [None]:
import os
import re

import numpy as np
import pandas as pd

from nltk.corpus import stopwords

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.model_selection import train_test_split

from tqdm.notebook import tqdm


In [None]:
# List the contents of the current working directory (IPython shell escape).
!ls

# Loading the dataset

In [None]:
# Directory the dataset CSVs (train_bodies.csv, train_stances.csv) are read from below.
DATASET_PATH = "../input/fake-news-challenge/"

In [None]:
# Show which files are available under DATASET_PATH.
print(os.listdir(DATASET_PATH))

**train_bodies.csv** contains body id and article body for training  
**train_stances.csv** contains headlines corresponding to body id and associated labelled stance with it


In [None]:
# Load the article bodies table (later cells use its 'Body ID' and 'articleBody' columns).
train_bodies = pd.read_csv(os.path.join(DATASET_PATH,'train_bodies.csv'))
# Preview the first rows.
train_bodies.head()

In [None]:
# Report the dimensions of the article bodies table.
print('The number of rows ',len(train_bodies))
print('The number of columns',len(train_bodies.columns))

In [None]:
# Print a concise summary of the bodies table (columns, dtypes, non-null counts).
train_bodies.info()

In [None]:
# Load the headline/stance table (later cells use its 'Headline', 'Body ID' and 'Stance' columns).
train_stance = pd.read_csv(os.path.join(DATASET_PATH,'train_stances.csv'))
# Preview the first rows.
train_stance.head()

In [None]:
# Report the dimensions of the headline/stance table.
print('The number of rows ',len(train_stance))
print('The number of columns',len(train_stance.columns))

In [None]:
# Print a concise summary of the stance table (columns, dtypes, non-null counts).
train_stance.info()

In [None]:
# Count how many rows (headlines) reference each Body ID; counts above 1 show
# that a single body id is associated with multiple headlines.
train_stance['Body ID'].value_counts()

## Dataset understanding 

1. The train_bodies contain the entries for the body id and associated article Body
2. The train_stances contain the entries for the headlines associated with the particular body id and its labelled stance
3. One body present in train_bodies can have multiple associated headlines present in train_stances, each with its corresponding stance label
4. 1683: the number of article bodies present
5. 49972: the total number of headlines present for the 1683 different article bodies

# Combining the CSV

I am preparing a final CSV in which each row corresponds to a unique entry,
i.e. each row corresponds to a unique combination of headline, body id and article body.

This simplifies the data preparation steps that follow.


In [None]:
#Run commented code to combine the two csv file{train_bodies.csv,train_stances.csv} into data_combined.csv file
# Combine the two tables {train_bodies.csv, train_stances.csv} into data_combined.csv:
# every headline row receives the article body that its 'Body ID' points to.
#
# A single keyed lookup replaces the original nested Python loop, which compared
# every stance row against every body row.
# keep='last' mirrors that loop: when a Body ID appeared more than once in
# train_bodies, the later row overwrote the earlier one.
body_by_id = (
    train_bodies
    .drop_duplicates(subset='Body ID', keep='last')
    .set_index('Body ID')['articleBody']
)

# Headlines whose Body ID is missing from train_bodies get NaN, as before.
train_stance['articleBody'] = train_stance['Body ID'].map(body_by_id)

train_stance.to_csv('data_combined.csv',index=False)

In [None]:
# Reload the combined table written by the previous cell; all later steps work on `data`.
data = pd.read_csv('data_combined.csv')
# Preview the first rows.
data.head()

In [None]:
# Integer code for each stance label, stored next to the original text label.
stance_to_code = {'agree':0,'disagree':1,'discuss':2,'unrelated':3}
data['stance_cat'] = data['Stance'].map(stance_to_code).astype(int)

# Class distribution of the four stance labels.
data['Stance'].value_counts()

## Data Preprocessing

In [None]:
stopwords_english = set(stopwords.words('english'))

# Runs of word characters. The raw string avoids the invalid "\w" escape
# sequence that the original non-raw literal relied on.
_WORD_RE = re.compile(r'[\w]+')


def _normalize_text(text):
    """Cast to str, lowercase, and keep only word-character tokens joined by single spaces.

    The str() cast happens first so non-string cells (e.g. NaN) are handled the
    same way for both columns; the original lowercased 'articleBody' before
    casting, which raised on a non-string value.
    """
    return ' '.join(_WORD_RE.findall(str(text).lower()))


def remove_stopwords(s):
    """Return `s` with every whitespace-separated token found in `stopwords_english` dropped."""
    return ' '.join(word for word in s.split() if word not in stopwords_english)


# Same pipeline for both text columns: normalise, then drop English stopwords.
# NOTE(review): the normalisation removes all punctuation, including '.', so the
# later split('.') in add_jaccard_similarity sees each body as a single segment.
for column in ('Headline', 'articleBody'):
    data[column] = data[column].apply(_normalize_text).apply(remove_stopwords)

## Baseline Model For Two Class

### Creating the two class dataset of related/unrelated


For a baseline, we simplify the problem to a two-class classification, first considering only the two strongly divergent classes: related and unrelated.

In [None]:
# Collapse the four stances into two: keep 'unrelated' as is and label
# everything else "related".
is_unrelated = data['Stance'] == 'unrelated'
data['stance_base'] = data['Stance'].where(is_unrelated, "related")
print(data['stance_base'].value_counts())

In [None]:
# Preview the table with the new 'stance_cat' and 'stance_base' columns.
data.head()

In [None]:
def jaccard_similarity(list1, list2):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two token collections.

    Duplicates are ignored because both inputs are converted to sets.
    Two empty inputs have an empty union; 0.0 is returned in that case
    instead of raising ZeroDivisionError.
    """
    s1 = set(list1)
    s2 = set(list2)
    union = s1 | s2
    if not union:
        return 0.0
    return len(s1 & s2) / len(union)

In [None]:
def add_jaccard_similarity(data, eps=0.001):
    """Add a 'jaccard_similarity' column to `data` in place.

    For every row, the headline tokens (split on single spaces) are compared
    with each '.'-delimited segment of the article body via
    `jaccard_similarity`. The stored score is
    ``(max + min) / (max - min + eps)`` over those per-segment similarities.

    Parameters
    ----------
    data : DataFrame with string columns 'Headline' and 'articleBody'.
    eps : small constant that keeps the denominator non-zero when max == min.

    Returns
    -------
    None. `data` is modified in place.
    """
    # NOTE(review): the preprocessing cell keeps only \w+ tokens, so bodies
    # reaching this function contain no '.'; the split then yields a single
    # segment and max == min. Confirm whether per-sentence scoring was intended.
    scores = []
    # Iterate over the values positionally so the function does not depend on
    # the frame having a 0..n-1 index, and assign the column once at the end
    # instead of writing one cell at a time.
    rows = zip(data['Headline'], data['articleBody'])
    for headline, body in tqdm(rows, total=len(data)):
        headline_tokens = headline.split(' ')
        similarities = [
            jaccard_similarity(headline_tokens, segment.split(' '))
            for segment in body.split('.')  # per sentence scorer
        ]
        highest = max(similarities)
        lowest = min(similarities)
        scores.append((highest + lowest) / (highest - lowest + eps))
    data['jaccard_similarity'] = scores


add_jaccard_similarity(data)

## Classifying Two Classes Related/Unrelated



In [None]:
# Class balance of the two-class (related / unrelated) target.
data['stance_base'].value_counts()

## Preparing Data for Classification

In [None]:
# Single input feature: the score written by add_jaccard_similarity.
# Selecting it by name (instead of "whatever the last column is") keeps this
# cell correct even if more columns are appended to `data`.
x = data['jaccard_similarity']
y = data['stance_base']

# Fixed seed so the split and the forest give the same result on every
# Restart & Run All.
SEED = 42

xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.1,random_state=SEED)

rg = RandomForestClassifier(n_estimators=100,n_jobs=-1,random_state=SEED)

In [None]:
print('X Training shape',xtrain.shape)
print('Y Training shape',ytrain.shape)
# The classifier needs a 2-D feature matrix, so the single feature becomes one
# column. np.asarray accepts both the pandas Series coming from the split and
# the ndarray left by an earlier run of this cell, so the cell can be re-run
# (the original `.values` access failed on the second run).
xtrain = np.asarray(xtrain).reshape(-1,1)
xtest = np.asarray(xtest).reshape(-1,1)

In [None]:
# Train on the single-feature matrix and predict the held-out split.
rg.fit(xtrain,ytrain)
ypred = rg.predict(xtest)

# scikit-learn metrics take (y_true, y_pred). The label now names the classes
# actually used here: the target is 'stance_base' (related / unrelated).
print('Accuracy score on two class related and unrelated ',accuracy_score(ytest,ypred))



In [None]:
# scikit-learn metric functions take (y_true, y_pred). Accuracy is symmetric,
# but precision and recall are not: with the arguments swapped the two scores
# trade places and the 'weighted' average is weighted by predicted rather than
# true class counts.
accuracy = accuracy_score(ytest,ypred)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(ytest,ypred,average='weighted')
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(ytest,ypred,average='weighted')
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(ytest,ypred,average='weighted')
print('F1 score: %f' % f1)

## Four Class Classification

We are now doing the four class classification into categories 

1. unrelated  
2. discuss  
3. agree  
4. disagree  


In [None]:
# Class balance of the four-class target.
data['Stance'].value_counts()

In [None]:
# Same single feature as the two-class baseline, selected by name rather than
# by position so added columns cannot silently change the model input.
x = data['jaccard_similarity']
y = data['Stance']

# Fixed seed so the split and the forest give the same result on every
# Restart & Run All.
SEED = 42

xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.1,random_state=SEED)

rg = RandomForestClassifier(n_estimators=100,n_jobs=-1,random_state=SEED)

In [None]:
print('X Training shape',xtrain.shape)
print('Y Training shape',ytrain.shape)
# The classifier needs a 2-D feature matrix, so the single feature becomes one
# column. np.asarray accepts both the pandas Series coming from the split and
# the ndarray left by an earlier run of this cell, so the cell can be re-run
# (the original `.values` access failed on the second run).
xtrain = np.asarray(xtrain).reshape(-1,1)
xtest = np.asarray(xtest).reshape(-1,1)

In [None]:
# Fit the forest on the training feature and predict the stance of the
# held-out rows; fit() returns the estimator, so both steps are chained.
# Metrics for the four classes {agree,disagree,discuss,unrelated} are printed
# in the next cell.
ypred = rg.fit(xtrain,ytrain).predict(xtest)

In [None]:
# scikit-learn metric functions take (y_true, y_pred). Accuracy is symmetric,
# but precision and recall are not: with the arguments swapped the two scores
# trade places and the 'weighted' average is weighted by predicted rather than
# true class counts.
accuracy = accuracy_score(ytest,ypred)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(ytest,ypred,average='weighted')
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(ytest,ypred,average='weighted')
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(ytest,ypred,average='weighted')
print('F1 score: %f' % f1)

**Please upvote the notebook if you found it useful**  
Thanks 