<a href="https://colab.research.google.com/github/reachsidd/fina_project_6740/blob/main/python/FinalProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [39]:
import functools
import re
import string
import zipfile

import bs4
import keras.preprocessing
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.metrics
import sklearn.model_selection
import tensorflow as tf
import wordcloud

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [40]:
def clean_html(text):
    """Return the plain-text content of an HTML string.

    ``text`` is parsed with BeautifulSoup using the built-in ``html.parser``
    backend, and the result of ``get_text()`` on the parsed document is
    returned.
    """
    return bs4.BeautifulSoup(text, 'html.parser').get_text()

In [41]:
@functools.lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loaded only once."""
    return frozenset(nltk.corpus.stopwords.words('english'))


def remove_stopwords_case_normalization(text):
    """Lower-case ``text``, keep only letters a-z and drop English stop words.

    Every character that is not a lower-case ASCII letter (digits,
    punctuation, whitespace, non-ASCII characters) is replaced by a space,
    so the result consists solely of space-separated a-z words.

    Args:
        text: The input string.

    Returns:
        The remaining words joined by single spaces (``""`` if none remain).
    """
    lowered = text.lower()
    # After this substitution only a-z and spaces are left, so no separate
    # punctuation-stripping pass is needed.
    letters_only = re.sub('[^a-z]', ' ', lowered)
    # Set membership keeps the per-word stop-word check constant time.
    stopwords = _english_stopwords()
    return " ".join(word for word in letters_only.split() if word not in stopwords)

In [42]:
def clean_text(text):
    """Reduce raw article text to lower-case, stop-word-free plain words.

    Steps, in order:
      1. strip HTML markup with ``clean_html``;
      2. delete URLs (any run of non-whitespace characters starting with
         ``http``);
      3. lower-case, keep letters only and drop stop words with
         ``remove_stopwords_case_normalization``.

    Args:
        text: Raw string, possibly containing HTML and URLs.

    Returns:
        The cleaned string.
    """
    # HTML is stripped *before* URL removal on purpose: `http\S+` runs up to
    # the next whitespace, so applying it to raw markup such as
    # `<a href="http://x">Read more</a>` would swallow the closing quote, the
    # tag end and the first word of the link text, corrupting the markup.
    text_new = clean_html(text)
    text_new = re.sub(r'http\S+', '', text_new)
    return remove_stopwords_case_normalization(text_new)

In [43]:
# Zipped CSV files hosted in the project's GitHub repository (`?raw=true`
# requests the file itself rather than the GitHub HTML page).
train_github_url='https://github.com/reachsidd/fina_project_6740/blob/main/data/train.zip?raw=true'
test_github_url='https://github.com/reachsidd/fina_project_6740/blob/main/data/test.zip?raw=true'
submit_github_url='https://github.com/reachsidd/fina_project_6740/blob/main/data/submit.zip?raw=true'

# Load the three tables; the columns of `submit` are joined onto the test
# rows through the shared `id` column.
train = pd.read_csv(train_github_url, compression='zip')
submit = pd.read_csv(submit_github_url, compression='zip')
test = pd.read_csv(test_github_url, compression='zip').merge(submit, on='id')

# Preview the first rows of each frame.
print(train.head(), test.head(), sep='\n')

# Per-column count of missing values, train first and then test.
print(train.isna().sum(), test.isna().sum(), sep='\n')

# Replace every missing value with an empty string so that the later string
# concatenation and text cleaning never see NaN.
train, test = train.fillna(''), test.fillna('')


   id  ... label
0   0  ...     1
1   1  ...     0
2   2  ...     1
3   3  ...     1
4   4  ...     1

[5 rows x 5 columns]
      id  ... label
0  20800  ...     0
1  20801  ...     1
2  20802  ...     0
3  20803  ...     1
4  20804  ...     1

[5 rows x 5 columns]
id           0
title      558
author    1957
text        39
label        0
dtype: int64
id          0
title     122
author    503
text        7
label       0
dtype: int64


In [44]:
# FOR TESTING ONLY: uncomment the two lines below to run the rest of the notebook on just the first 100 rows
#train = train.head(100)
#test = test.head(100)

In [None]:
# Merge article body and title into one `content` column (body first, then a
# space, then the title) and discard the columns that are no longer used.
train = (
    train
    .assign(content=train['text'] + ' ' + train['title'])
    .drop(columns=['title', 'text', 'author', 'id'])
)
test = (
    test
    .assign(content=test['text'] + ' ' + test['title'])
    .drop(columns=['title', 'text', 'author', 'id'])
)

# Run the cleaning pipeline (`clean_text`) over every document and preview
# the result for each frame.
train = train.assign(content=train['content'].apply(clean_text))
print(train.head())

test = test.assign(content=test['content'].apply(clean_text))
print(test.head())

In [None]:
# Word cloud of the cleaned training text for rows with label == 1, which the
# original author describes as the "true" (not fake) class.
# NOTE(review): if this is the public Kaggle "Fake News" data set, its
# description marks label 1 as *unreliable* - confirm which class this is.
plt.figure(figsize=(20, 20))
wc = wordcloud.WordCloud(
    max_words=2000,
    width=1600,
    height=800,
    stopwords=wordcloud.STOPWORDS,
)
# All label-1 documents are joined into one string; generate() returns the
# same WordCloud object.
wc = wc.generate(" ".join(train.loc[train['label'] == 1, 'content']))
plt.imshow(wc, interpolation='bilinear')
plt.show()

In [None]:
# Word cloud of the cleaned training text for rows with label == 0, which the
# original author describes as the "fake" class.
# NOTE(review): if this is the public Kaggle "Fake News" data set, its
# description marks label 0 as *reliable* - confirm which class this is.
plt.figure(figsize=(20, 20))
wc = wordcloud.WordCloud(
    max_words=2000,
    width=1600,
    height=800,
    stopwords=wordcloud.STOPWORDS,
)
# All label-0 documents are joined into one string; generate() returns the
# same WordCloud object.
wc = wc.generate(" ".join(train.loc[train['label'] == 0, 'content']))
plt.imshow(wc, interpolation='bilinear')
plt.show()

In [None]:
# NLP - tokenize and apply Porter's stemming algorithm
ps = nltk.stem.porter.PorterStemmer()


def stem_text(text):
    """Tokenize ``text`` with NLTK and return its Porter-stemmed tokens joined by spaces."""
    return ' '.join(ps.stem(word) for word in nltk.tokenize.word_tokenize(text))


train['content'] = train['content'].apply(stem_text)
print(train.head())

test['content'] = test['content'].apply(stem_text)
print(test.head())

# Keras' Tokenizer only emits indices below `num_words`, i.e. the
# (max_features - 1) most frequent words; rarer words are dropped.
max_features = 300
# Every document is padded / truncated to exactly this many token ids.
maxlen = 300
tokenizer = keras.preprocessing.text.Tokenizer(num_words=max_features)
# The vocabulary is fitted on the training text only, then reused for test.
tokenizer.fit_on_texts(train['content'])
# +1 because word indices start at 1 and index 0 is reserved for padding, so
# an Embedding layer needs `largest index + 1` rows.
voc_size = len(tokenizer.word_index) + 1
print("voc_size=",voc_size)

# The downstream model is Embedding -> LSTM, which consumes *sequences of
# word indices*. `texts_to_matrix` would instead produce a binary
# bag-of-words matrix (one column per vocabulary slot), which the Embedding
# layer would misread as a sequence of 0/1 token ids. Convert to index
# sequences and pad them to a fixed length instead.
x_train = keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(train['content']), maxlen=maxlen)
print(x_train.shape)
x_test = keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(test['content']), maxlen=maxlen)
print(x_test.shape)

y_train = train['label']
y_test = test['label']
print(y_train.shape)
print(y_test.shape)

In [None]:
# Build and compile the classifier: Embedding -> LSTM -> Dense, with dropout
# (rate 0.3) between the stages.

# Seed TensorFlow's global random generator so weight initialisation is
# repeatable between runs (this does not guarantee bit-for-bit determinism).
tf.random.set_seed(42)

model = tf.keras.models.Sequential([
    # Maps each integer input to a 40-dimensional vector; inputs are expected
    # to have `maxlen` entries per sample.
    tf.keras.layers.Embedding(voc_size, 40, input_length=maxlen),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.LSTM(100),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    # A single sigmoid unit outputs the probability of label 1.
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None").
model.summary()

In [None]:
# Train for 20 epochs with mini-batches of 64 samples. The returned History
# object is kept so the per-epoch loss/accuracy can be inspected afterwards,
# and so the cell does not echo an opaque object repr as its output.
history = model.fit(x_train, y_train, epochs=20, batch_size=64)

In [None]:
# Turn the sigmoid outputs into hard 0/1 labels with a 0.5 threshold.
# model.predict returns one column per output unit, so ravel() flattens the
# (n, 1) result to a 1-D vector that lines up with y_train / y_test.
y_train_pred = (model.predict(x_train) > 0.5).astype("int32").ravel()
y_test_pred = (model.predict(x_test) > 0.5).astype("int32").ravel()

# Fraction of correctly classified samples on each split.
# (`sklearn.metrics` is imported explicitly in the imports cell rather than
# relied upon as a side effect of importing `sklearn.model_selection`.)
train_accuracy = sklearn.metrics.accuracy_score(y_train, y_train_pred)
test_accuracy = sklearn.metrics.accuracy_score(y_test, y_test_pred)

print('train_accuracy=',train_accuracy)
print('test_accuracy=',test_accuracy)

