## Data Science: Final Project
* [Task 1: Balancing the Dataset](#first-bullet)


In [1]:
import psycopg2
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
import sys
import time
from sklearn.metrics import confusion_matrix, classification_report

### Task 1: Knowing our Data

In [2]:
def load_articles(path):
    """Read an article CSV and return its unique, complete (content, type) rows.

    Only the 'content' and 'type' columns are kept; duplicated rows are
    removed first, then any row with a missing value in either column.
    """
    articles = pd.read_csv(path)
    return articles[['content', 'type']].drop_duplicates().dropna(axis=0, how='any')


# Reading the fake news corpus
corpus_news = load_articles('cleaned.csv')

# Reading the scraped data
scraped_news = load_articles('cleaned_scraped_data.csv')

# Merging the data (TODO: we need to clean the scraped news as well!)
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames
# the same way (original row indices are kept, so they may repeat).
all_news = pd.concat([corpus_news, scraped_news])

#### Making news grouping
We need to make two classification categories from 12 possible categories. There are several ways to do this. An obvious approach would be to put all types labeled fake in one category and everything else in another. What we care about is distinguishing whether something is fake news or not. We do not care about identifying e.g. hate articles, clickbait or junksci. We just use these types as examples of something that is not fake news, even though these are also undesirable article types on e.g. a news platform. We do not believe that a classifier will be able to distinguish between satire and fake news; therefore we remove the satire articles entirely.

#### Balancing the Dataset
The handed-out fake news dataset is imbalanced: after filtering out NaNs, the non-fake news articles constitute almost twice as many articles as the rest of the data. Furthermore, within the non-fake news data, reliable articles only constitute a small percentage. Therefore we enrich this data with the scraped data, and then we randomly remove articles from the non-fake news data to get the same number of articles in each class. Here it is important to note that we assume that the wikinews articles are reliable. Going from an imbalanced to a balanced dataset improves our baseline accuracy (TODO: state the measured improvement).

In [3]:
# Classifying types
remove = ['unknown', 'satire']
fake = ['fake']
not_fake = ['political', 'reliable', 'bias', 'conspiracy', 'rumor', 'unreliable', 'clickbait', 'junksci', 'hate']

# Fixed seed so the down-sampling (and therefore 'all_news.csv' and every
# score computed from it) is reproducible across runs.
BALANCE_SEED = 1000

# NOTE: rows whose type is in none of the three lists above are silently
# excluded by the two isin() selections below.
all_news = all_news[~all_news['type'].isin(remove)]
fake_news = all_news[all_news['type'].isin(fake)].copy()
not_fake_news = all_news[all_news['type'].isin(not_fake)].copy()
not_fake_news['type'] = 'not fake'

# This cell overwrites `all_news` with the balanced, relabelled data, so
# running it a second time without re-running the loading cell leaves the
# 'not fake' group empty. Fail loudly instead of writing a broken CSV.
if fake_news.empty or not_fake_news.empty:
    raise ValueError(
        'One of the two classes is empty - re-run the data loading cell '
        'before balancing.'
    )

# Number of articles in each set
count_group_1 = len(fake_news)
count_group_2 = len(not_fake_news)

# Down-sample whichever class is larger to the size of the smaller one.
n_per_class = min(count_group_1, count_group_2)
if count_group_2 > n_per_class:
    not_fake_news = not_fake_news.sample(n=n_per_class, random_state=BALANCE_SEED)
elif count_group_1 > n_per_class:
    fake_news = fake_news.sample(n=n_per_class, random_state=BALANCE_SEED)

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
all_news = pd.concat([not_fake_news, fake_news])
# The row index is written as well (to_csv default), so the file carries an
# extra leading column next to 'content' and 'type'.
all_news.to_csv('all_news.csv')

print(f'Length of not fake news: {len(not_fake_news)}')
print(f'Length of fake news: {len(fake_news)}')
print(f'Length of all news: {len(all_news)}')

Length of not fake news: 110071
Length of fake news: 110071
Length of all news: 220142


#### Task 2: Establishing a Baseline

In [15]:
# Reload the balanced dataset written by the balancing cell and encode the
# label as an integer: 1 = fake, 0 = not fake.
all_news = pd.read_csv('all_news.csv')
all_news['type'] = all_news['type'].map({'fake':1, 'not fake':0})

x = all_news['content'].values
y = all_news['type'].values
# 70/30 train/test split with a fixed seed so the reported scores are reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=1000)

# Apply bag-of-words (raw term count) vectorization. Note: this is
# CountVectorizer, not tf-idf. The vocabulary is learned from the training
# texts only; fit_transform learns it and encodes the training set in a
# single pass instead of tokenizing the corpus twice.
vectorizer = CountVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [16]:
# Baseline 1: logistic regression for fake news prediction.
# Convergence is fine, by the way: our error does not change much when we
# change the number of iterations.
classifier = LogisticRegression(max_iter=10000).fit(x_train, y_train)
score = classifier.score(x_test, y_test)
print(f'Score for logistic baseline: {score}')

# Baseline 2: naive Bayes for fake news prediction.
# (`gnb` holds a BernoulliNB model; the name is kept as-is.)
gnb = BernoulliNB().fit(x_train, y_train)
score = gnb.score(x_test, y_test)
print(f'Score for naive Bayes baseline: {score}')

Score for logistic baseline: 0.9549081658919188
Score for naive Bayes baseline: 0.8471904668170738


In [None]:
# Confusion matrix of the logistic-regression classifier on the test split.
# Rows and columns are labelled with the encoded classes in their order of
# appearance in all_news['type'].
class_labels = all_news['type'].unique()
y_pred = classifier.predict(x_test)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
df_cm = pd.DataFrame(cm, index=class_labels, columns=class_labels)

# Normalize column-wise: every column is divided by its own total, so each
# column sums to 1 (values are fractions, not multiplied by 100).
df_cm_percentage = df_cm.div(df_cm.sum(axis=0), axis='columns')
df_cm_percentage

#### Task 3: Creating a Fake News Predictor

In [None]:
# Use another model and use k-fold cross validation

#### Task 4: Performance Beyond the Original Dataset

In [None]:
# Check on the LIAR dataset!