# Importing Modules for Data Cleaning

In [1]:
import datetime as dt
import os
import re

import lxml
import nltk
import pandas as pd
import pymongo
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from pymongo import MongoClient

# Data Preprocessing

### 1) Removing HTML Syntax
### 2) Lowercasing text
### 3) Replacing every symbol matched by REPLACE_BY_SPACE_RE with a space
### 4) Deleting every character matched by BAD_SYMBOLS_RE (anything other than digits, lowercase letters, space, `#`, `+` and `_`)
### 5) Removing Stopwords

In [2]:
# Punctuation that separates words; each match is replaced by a single space.
# Raw string: '\[', '\]' and '\|' are invalid escapes in a normal string literal
# (DeprecationWarning today, SyntaxWarning from Python 3.12). The compiled
# pattern is unchanged.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')

# English stopword set used for O(1) membership tests in clean_text.
# On a fresh environment the corpus is not installed and the lookup raises
# LookupError, so fetch it once and retry instead of failing the notebook.
try:
    STOPWORDS = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    STOPWORDS = set(stopwords.words('english'))

# Negated class: matches every character that is NOT a digit, a lowercase
# ASCII letter, a space, '#', '+' or '_'. Matches are deleted outright.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')

def string_form(value):
    """Return ``value`` as a string, mapping missing values to ``""``.

    A plain ``str()`` turns ``None`` into ``"None"`` and a float NaN into
    ``"nan"``; those literals would then survive cleaning and end up as
    bogus tokens in the text columns. Every other value is converted with
    ``str()`` exactly as before.

    Parameters
    ----------
    value : object
        Any cell value (string, number, ``None``, NaN, ...).

    Returns
    -------
    str
        ``""`` for ``None``/NaN, otherwise ``str(value)``.
    """
    # NaN is the only float that is not equal to itself.
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value)

def clean_text(text):
    """Normalise a piece of raw text for downstream processing.

    Steps, in order:
      1. parse with BeautifulSoup (lxml parser) and keep only the text content;
      2. lowercase;
      3. replace every REPLACE_BY_SPACE_RE match with a space;
      4. delete every BAD_SYMBOLS_RE match;
      5. drop whitespace-separated tokens that are in STOPWORDS.

    Parameters
    ----------
    text : str
        Raw text, possibly containing HTML markup.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    lowered = BeautifulSoup(text, "lxml").text.lower()
    spaced = REPLACE_BY_SPACE_RE.sub(' ', lowered)
    stripped = BAD_SYMBOLS_RE.sub('', spaced)
    kept_tokens = [token for token in stripped.split() if token not in STOPWORDS]
    return ' '.join(kept_tokens)

# Fetching Data from MongoDB Atlas

In [3]:
# The connection string carries the Atlas username and password, so it must
# not be stored in the notebook. Read it from the environment instead, e.g.
#   export MONGODB_URI="mongodb+srv://<user>:<password>@<cluster>/test?retryWrites=true&w=majority"
# NOTE(review): the credentials previously committed here should be rotated.
MONGODB_URI = os.environ.get("MONGODB_URI")
if not MONGODB_URI:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB Atlas "
        "connection string before running this cell."
    )

client = MongoClient(MONGODB_URI)
db = client["Reddit_India"]
database = db["India[no limit]"]  # despite the name, this is a collection handle

# Materialise every document of the collection and load it into a DataFrame.
posts = list(database.find())
table = pd.DataFrame(posts)

In [4]:
# Remove the '_id' column that comes back with the fetched documents.
# drop(..., errors="ignore") returns a new frame and is a no-op when the
# column is already gone, so the cell can be re-run without a KeyError
# (a plain `del table['_id']` fails on the second execution).
table = table.drop(columns=["_id"], errors="ignore")
table.head(10)

Unnamed: 0,Searched flair,flair,title,score,id,author,body,created,comms_num,url,time,comments
0,Coronavirus,Coronavirus,Dumped by Mumbai's upper middle and middle cla...,73,g602bs,hipporama,,1587586564.0,8,https://www.firstpost.com/health/coronavirus-o...,2020-04-23 01:46:04,I am not even at my own house. Have paid both ...
1,Coronavirus,Coronavirus,India says infection growth rate has slowed do...,7,g63l42,nishubikash,,1587599461.0,7,https://www.thehindu.com/news/national/coronav...,2020-04-23 05:21:01,\*cough\* whoreshit \*cough\* Guy: *ready to h...
2,Coronavirus,Coronavirus,"H1B holders, are you worried about a possible ...",149,g5ha81,kais92,My name is Kai Schultz. I'm a journalist with ...,1587511675.0,108,https://www.reddit.com/r/india/comments/g5ha81...,2020-04-22 04:57:55,Maybe US isn't a very attractive immigration t...
3,Coronavirus,Coronavirus,Karnataka government has decided to partially ...,13,g65keb,_ConfusedDeveloper,,1587605905.0,2,https://www.hindustantimes.com/india-news/karn...,2020-04-23 07:08:25,"Well, now we just have to wait for the manager..."
4,Coronavirus,Coronavirus,Deep rot in West Bengal during COVID-19 lockdo...,8,g66r3z,ghatroad,,1587609697.0,0,https://www.firstpost.com/health/deep-rot-in-w...,2020-04-23 08:11:37,
5,Coronavirus,Coronavirus,Bid by BECIL (Under Ministry Of Information an...,13,g619tp,india_ko_vanakkam,,1587591533.0,3,https://www.becil.com/uploads/tender/Corrigend...,2020-04-23 03:08:53,Specifications of the tool they have asked for...
6,Coronavirus,Coronavirus,"Aarogya Setu will include telemedicine, greate...",9,g63iaw,india_ko_vanakkam,,1587599189.0,1,https://www.medianama.com/2020/04/223-aarogya-...,2020-04-23 05:16:29,"What happened to ChaddiChandan's ""India Shack""..."
7,Coronavirus,Coronavirus,Indian Embassies undertake herculean task of c...,19,g5ykod,i_Killed_Reddit,,1587579395.0,0,https://economictimes.indiatimes.com/news/poli...,2020-04-22 23:46:35,
8,Coronavirus,Coronavirus,Mumbai Hospital Issues Notice To Docs On Strik...,27,g5wiv8,bookshearer,,1587568118.0,0,https://www.youtube.com/watch?v=p_6j_xdANNU,2020-04-22 20:38:38,
9,Coronavirus,Coronavirus,"Lockdown scenes in Kurnool, Andhra Pradesh whi...",1809,g5xgab,drunk_sithlord,,1587573168.0,113,https://i.redd.it/h77492fyybu41.jpg,2020-04-22 22:02:48,What do you expect if you allow the market to ...


# Preprocessing the Data and saving it in a .csv file

In [5]:
# Free-text columns that go through string conversion and cleaning.
TEXT_COLUMNS = ["title", "body", "comments"]

# 1) Force the text columns to plain strings so clean_text always receives str.
for column in TEXT_COLUMNS:
    table[column] = table[column].apply(string_form)

# 2) Keep a snapshot of the stringified but still uncleaned data.
table.to_csv('Before_cleaning.csv', index=False)

# 3) Apply the cleaning pipeline to each text column.
for column in TEXT_COLUMNS:
    table[column] = table[column].apply(clean_text)

# 4) Build the combined feature column.
#    - 'url' is not passed through string_form, so a missing value would turn
#      the whole concatenation into NaN; fill and cast it first.
#    - Join with a space: plain '+' glues the last word of one field onto the
#      first word of the next (and onto the URL), producing junk tokens.
url_text = table["url"].fillna("").astype(str)
combination = table["comments"].str.cat(
    [table["title"], url_text, table["body"]],
    sep=" ",
)
table = table.assign(comments_title_url_body=combination)

# 5) Persist the cleaned dataset including the combined column.
table.to_csv('redditlimit20.csv', index=False)