### Importing Reddit's API, PRAW to scrap subreddits, and Pandas

In [1]:
import praw
import pandas as pd

### Initialising the Reddit object with user's credentials

In [2]:
reddit = praw.Reddit(client_id = "#", 
                     client_secret = "#", 
                     user_agent = "#")

### Choosing the subreddit of r/india

In [3]:
subred = reddit.subreddit('india')

## Considering "hot", "top", "new" and "controversial" filters to scrap subreddits

In [4]:
hot = subred.hot(limit=3000)
top = subred.top(limit=3000)
controversial = subred.controversial(limit=3000)
new = subred.new(limit=3000)

In [5]:
type(hot)

praw.models.listing.generator.ListingGenerator

### Initilizing a dictionary "data" to store the scrapped data

In [6]:
data = {"id":[], "url":[],"title":[],  "body":[], "flair":[]}

### Getting data from corresponding filtered subreddits and appending in "data"

In [7]:
for i in hot:
    
    data['id'].append(i.id)
    data['url'].append(i.url)
    data['title'].append(i.title)
    data['body'].append(i.selftext)    
    data['flair'].append(i.link_flair_text)    

In [8]:
for i in top:
    
    data['id'].append(i.id)
    data['url'].append(i.url)
    data['title'].append(i.title)
    data['body'].append(i.selftext)    
    data['flair'].append(i.link_flair_text) 

In [9]:
for i in controversial:
    
    data['id'].append(i.id)
    data['url'].append(i.url)
    data['title'].append(i.title)
    data['body'].append(i.selftext)    
    data['flair'].append(i.link_flair_text) 

In [10]:
for i in new:
    
    data['id'].append(i.id)
    data['url'].append(i.url)
    data['title'].append(i.title)
    data['body'].append(i.selftext)    
    data['flair'].append(i.link_flair_text) 

### Converting the dictionary to a Pandas DataFrame

In [11]:
import pandas as pd
df = pd.DataFrame(data)

In [12]:
df.head()

Unnamed: 0,id,url,title,body,flair
0,g1zi21,https://www.reddit.com/r/india/comments/g1zi21...,Coronavirus (COVID-19) Megathread - News and U...,###[Covid-19 Fundraisers & Donation Links](htt...,Coronavirus
1,g4d2ix,https://www.reddit.com/r/india/comments/g4d2ix...,"[Monthly Happiness Thread] Randians, please sh...",<3 \n \nLinks: ...,Scheduled
2,g6z6ly,https://i.redd.it/8l3vwdax2ou41.jpg,Re-Creation Of Humanity (artist : Hasif khan),,Non-Political
3,g70l6e,https://www.nationalheraldindia.com/internatio...,Gulf News editor in Dubai receives threats fro...,,Politics
4,g73kri,https://www.reddit.com/r/india/comments/g73kri...,Judiciary of India has never been this bad,CAA-NRC - \n\nWe will hear petition next month...,Politics


In [13]:
df.shape

(3632, 5)

### Since one subreddit can appear in more than one filter, the DF is sorted and duplicate elements are dropped.

In [14]:
df.sort_values("id", inplace = True)
df.drop_duplicates(subset ="id", inplace = True)

In [15]:
df.shape

(2859, 5)

### Analyzing the unique flairs and their frequencies from the collected data

In [16]:
df['flair'].value_counts()

Politics                          977
Non-Political                     772
Coronavirus                       353
AskIndia                          177
Policy/Economy                     88
[R]eddiquette                      71
Business/Finance                   58
Photography                        53
Sports                             42
Science/Technology                 25
Food                               21
Unverified                         20
Scheduled                          12
CAA-NRC                            10
Moderated                           7
Misleading                          6
Policy                              4
CAA-NRC-NPR                         4
r/all                               3
Policy & Economy                    3
Demonetization                      3
Entertainment                       3
/r/all                              2
AMA                                 2
Science & Technology                1
TIL                                 1
Policy/Econo

### Choosing the top 10 flairs from the scrapped list as the categories for the succeeding prediction task. 

In [17]:
top_flairs = ["Politics", "Non-Political", "Coronavirus", "AskIndia", "Policy/Economy", "[R]eddiquette", 
              "Photography", "Business/Finance", "Sports", "Science/Technology","Food"]

df_top = df.loc[df['flair'].isin(top_flairs)]

In [18]:
df_top.shape

(2637, 5)

In [19]:
df_top.head()

Unnamed: 0,id,url,title,body,flair
2348,1vfqzr,http://www.bollywoodmantra.com/news/hrithik-ro...,Hrithik Roshan to tie the knot for the second ...,,Non-Political
2167,1vgb3k,https://www.reddit.com/r/india/comments/1vgb3k...,"Girls of /r/India, would any of you be interes...",.,Non-Political
2316,1vqu7l,https://www.reddit.com/r/india/comments/1vqu7l...,Regarding the ongoing Kejru dramabaaz,"I spoke with some people in Delhi , one of who...",Politics
2279,1w4xsp,http://bjpscams.com/,A site that lists out scams by the BJP,,Politics
2139,1wcra7,http://www.truthofgujarat.com/modi-says-india-...,"Modi says India has no War Memorials, here's a...",,Politics


### As the data is skewed towards certain flairs and not others, we are scrapping again from the r/india subreddit for the above mentioned flairs with a limit of 100 for each flairs

In [20]:
reddit = praw.Reddit(client_id = "#", 
                     client_secret = "#", 
                     user_agent = "#")

subred = reddit.subreddit('india')
sample_data = {"id":[], "url":[], "title":[], "body":[], "flair":[]}

top_flairs = ["Politics", "Non-Political", "Coronavirus", "AskIndia", "Policy/Economy", "[R]eddiquette", 
              "Photography", "Business/Finance", "Sports", "Science/Technology"]

for flair in top_flairs:
  
  top_f = subred.search(flair, limit=1000)
  
  for i in top_f:
    
    sample_data["id"].append(i.id)
    sample_data["url"].append(i.url)
    sample_data["title"].append(i.title)
    sample_data["body"].append(i.selftext)
    sample_data["flair"].append(flair)
    
sample = pd.DataFrame(sample_data)


In [21]:
sample.head()

Unnamed: 0,id,url,title,body,flair
0,g2ct57,https://www.reddit.com/r/india/comments/g2ct57...,A polite request to all Indians here,I don't know if it is the same situation in ot...,Politics
1,futac9,https://www.reddit.com/r/india/comments/futac9...,Pitting a community against a political party ...,First of all let me start by saying it was stu...,Politics
2,ff8sth,https://i.redd.it/yjo9wpy38el41.jpg,A new political party gave a full front page a...,,Politics
3,fpaj1w,https://theprint.in/india/hit-by-backlash-over...,Hit by backlash over posts on lack of medical ...,,Politics
4,fxs1vy,https://www.timesnownews.com/india/article/pol...,Politics in the time of corona: WB CM question...,,Politics


In [22]:
sample['flair'].value_counts()

Coronavirus           247
Politics              247
Business/Finance      233
AskIndia              232
Sports                231
Photography           222
Science/Technology    221
Policy/Economy        220
Non-Political         216
[R]eddiquette          18
Name: flair, dtype: int64

### It is seen that "[R]eddiquette"  flair has very less samples. Even if we use the previously scrapped data and concatenate it, it wont be in the same scale as others, so we will drop that category.

In [23]:
top_9 = ["Politics", "Non-Political", "Coronavirus", "AskIndia", "Policy/Economy", 
              "Photography", "Business/Finance", "Sports", "Science/Technology"]

final = sample.loc[sample['flair'].isin(top_9)]

In [24]:
final['flair'].value_counts()

Coronavirus           247
Politics              247
Business/Finance      233
AskIndia              232
Sports                231
Photography           222
Science/Technology    221
Policy/Economy        220
Non-Political         216
Name: flair, dtype: int64

### Removing duplicate entries

In [25]:
final.sort_values("id", inplace = True)
final.drop_duplicates(subset ="id", inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [26]:
final['flair'].value_counts()

Coronavirus           241
Politics              241
Sports                227
AskIndia              226
Photography           221
Science/Technology    220
Business/Finance      217
Non-Political         213
Policy/Economy        207
Name: flair, dtype: int64


### Converting the DataFrame into a CSV file

In [27]:
final.to_csv('Data/reddit-top-flairs.csv', index=False)

# Considering the top 9 flair category, the data is scrapped and ready for EDA