## Reddit Data Collection 

In [1]:
pip install praw

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install tldextract

Note: you may need to restart the kernel to use updated packages.


# Reddit Data Collection

In [14]:
# Importing necessary libraries

import praw
import pandas as pd
import numpy as np
import datetime as dt
import tldextract
import matplotlib.pyplot as plt
import re

### Extracting r/india data using praw

In [4]:
reddit=praw.Reddit(client_id='zjdX0tsIpqFHCg',
                   client_secret='lsAxhDucJr9OmCaneXrYyoBUEpIlpQ',
                   user_agent='Saksham Kumar Sharma'
                   )
subreddit = reddit.subreddit('india')

#### There are a huge number of custom flairs in r/india. In order to decide which ones to consider and which ones to ignore, I am going to consider only the ones which are officially mentioned on the r/india page. Hence these 10 flairs will be used further in this project for all purposes.

In [5]:
flairs = ['AskIndia', 'Business/Finance', 'Food', 'Non-Political', 'Photography', 'Policy/Economy', 'Politics', 'Scheduled',
          'Science/Technology', 'Coronavirus']

# Function to convert timestamp to datetime
def get_date(created):
    return dt.datetime.fromtimestamp(created)

#### labels_dict stores all the parts of reddit post that is flair, id, url, body, comments etc and flairs contails all the list of flairs of the subreddit India.


In [6]:
labels_dict = {"flair":[], "title":[], "score":[], "id":[], "url":[], "comms_num": [], "created": [], "body":[], "author":[], "comments":[]}

for flair in flairs:
    
    get_subreddits=subreddit.search(flair,limit=400)
    
    for submission in get_subreddits:
        labels_dict['flair'].append(flair)
        labels_dict['title'].append(submission.title)
        labels_dict['score'].append(submission.score)
        labels_dict['id'].append(submission.id)
        labels_dict['url'].append(submission.url)
        labels_dict["comms_num"].append(submission.num_comments)
        labels_dict["created"].append(submission.created)
        labels_dict["body"].append(submission.selftext)
        labels_dict["author"].append(submission.author)
        
        submission.comments.replace_more(limit=None)
        comment = ''
        for top_level_comment in submission.comments:
            comment = comment + ' ' + top_level_comment.body
        labels_dict["comments"].append(comment)

In [7]:
data = pd.DataFrame(labels_dict)
data.tail()

Unnamed: 0,flair,title,score,id,url,comms_num,created,body,author,comments
2275,Coronavirus,India will supply coronavirus vaccines to the ...,8,ilpr6k,https://www.nature.com/articles/d41586-020-025...,2,1599151000.0,,TajMahal_Trump,I guess Indians will get it too. Isn't that a...
2276,Coronavirus,"Coronavirus, Hurting Jobs and Loans, Has India...",49,ila866,https://www.wsj.com/articles/coronavirus-hurti...,4,1599093000.0,,mayblum,This is something that we do for unprecedente...
2277,Coronavirus,Delhi Reports Biggest 1-Day Spike In Coronavir...,30,il41nr,https://www.ndtv.com/delhi-news/coronavirus-de...,1,1599070000.0,,-mouth4war-,Surprise surprise ! \n\nStill people don't ca...
2278,Coronavirus,GST - Act of god!,5,in1et9,https://www.reddit.com/r/india/comments/in1et9...,1,1599343000.0,Does anyone remember the following two comment...,ywardygone,Even God can not manipulate my company's prof...
2279,Coronavirus,Startling Changes in Coronavirus Testing: Late...,7,il5guz,https://youtu.be/iAfqehVt4Tc,0,1599077000.0,,bhaisahabhandsome-2,


In [8]:
# Converting timestamp to datetime format
timedata = data["created"].apply(get_date)
data = data.assign(timestamp = timedata)
del data['created']

# Shuffling the rows
data = data.sample(frac=1).reset_index(drop=True)

# Saving the data in csv file
data.to_csv('reddit-india-data.csv', index=False)
data.tail()

Unnamed: 0,flair,title,score,id,url,comms_num,body,author,comments,timestamp
2275,Policy/Economy,Crisis in economy retrievable with major polic...,7,j8z0st,https://www.sundayguardianlive.com/news/crisis...,0,,Free_Physics,,2020-10-11 18:02:43
2276,Scheduled,Monthly video games thread. December 2016 [Sch...,46,5fwcie,https://www.reddit.com/r/india/comments/5fwcie...,131,Let us use this thread to discuss games that w...,axaytsg,Bought games legally for the first time in li...,2016-12-02 02:10:03
2277,Business/Finance,What would be a better decision? [Serious],24,abqu80,https://www.reddit.com/r/india/comments/abqu80...,17,"24 year old male, working in a day job from 8 ...",nosleepnomore,"Bhai 8am-8pm is not a day job, it's slavery.\...",2019-01-02 20:10:50
2278,Non-Political,The forgotten promise of 1949: The RSS wrote a...,30,9pqnpq,https://www.thehindu.com/opinion/lead/the-forg...,8,,bliss_tree,"you know what bhakts will say?\n\n>""what abou...",2018-10-20 16:36:14
2279,Non-Political,[Non Political] [NP] (Ask India) Medical postg...,1,85b66l,https://www.reddit.com/r/india/comments/85b66l...,0,Realised that many discontinue use after getti...,Vickythiside,,2018-03-19 02:50:46


In [9]:
def string(value):
    return str(value)

In [10]:
data['title'] = data['title'].apply(string)
data['body'] = data['body'].apply(string)
data['comments'] = data['comments'].apply(string)

### Now we will do text cleaning by transforming all words to lower case, doing lemmatiztion etc

In [16]:
# Importing required libraries

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
wordnet=WordNetLemmatizer()
STOPWORDS = set(stopwords.words('english'))
def text_cleaning(text):
    review=re.sub('[^a-zA-Z]',' ',text) 
    review=review.lower()
    text=' '.join(wordnet.lemmatize(word) for word in text.split() if word not in STOPWORDS)
    return text

In [17]:
data['title'] = data['title'].apply(text_cleaning)
data['body'] = data['body'].apply(text_cleaning)
data['comments'] = data['comments'].apply(text_cleaning)

In [18]:
combined_features = data["title"] + data["comments"] + data["url"] + data["body"]
data = data.assign(combined_features = combined_features)

In [19]:
data.to_csv('data.csv')

In [21]:
pd.read_csv('data.csv').head()

Unnamed: 0.1,Unnamed: 0,flair,title,score,id,url,comms_num,body,author,comments,timestamp,combined_features
0,0,Business/Finance,Interest PF taxable: Finance minister eye PF i...,51,laoy5g,https://timesofindia.indiatimes.com/business/i...,20,,satyasys,"PF return fully taxable, point investing PF. G...",2021-02-02 19:30:37,Interest PF taxable: Finance minister eye PF i...
1,1,Food,How survive 500rs(food) 2 weeks?,55,kr3ztg,https://www.reddit.com/r/india/comments/kr3ztg...,79,Hey guys. So time salary going late I'll recei...,Luc_90,"1. 2 kg cheap rice, 50/kg so, (Don't buy boile...",2021-01-06 07:40:59,How survive 500rs(food) 2 weeks?1. 2 kg cheap ...
2,2,Scheduled,"Right wing group labelling resource document ""...",143,lbwl1r,https://www.reddit.com/r/india/comments/lbwl1r...,17,"Recently Greta Thunberg tweeted ""toolkit"" peop...",gobargorab,Anything father modi make go crazy upset. That...,2021-02-04 09:27:18,"Right wing group labelling resource document ""..."
3,3,Food,Ask: What amount pocket money give children?,5,m0auzn,https://www.reddit.com/r/india/comments/m0auzn...,39,"Also, supposed buy pocket money actually spend...",what_is_inflation,You guy get pocket money? My parent never gave...,2021-03-08 21:25:43,Ask: What amount pocket money give children?Yo...
4,4,Photography,"I’ve Recently generated interest photography, ...",0,aaakn4,https://i.redd.it/9tcehs8vz0721.jpg,14,,thesarcasticpage,A photo like letter viewer story want convey. ...,2018-12-29 03:35:37,"I’ve Recently generated interest photography, ..."
