# TASK - MIDAS@IIITD

# PART - I

# Scraping Reddit data

### DATA COLLECTION PIPELINE

#### THERE ARE THREE METHODS SHOWN HERE FOR FETCHING REDDIT DATA : 
#### * Use the plain requests library for raw fetching, with the Reddit authentication step automated (METHOD1)
#### * Use the PRAW API wrapper (METHOD2)
#### * Use the PushShift API wrapper (METHOD3)

In [235]:
# import all packages
import requests
import time
import datetime
from bs4 import BeautifulSoup
import os
from os import path
import urllib.request as ulib
from urllib.request import Request, urlopen
from uuid import uuid4
import requests.auth
import urllib
import random
from tqdm import tqdm
import glob
import chardet

In [3]:
# for automating the browser for reddit authentication method 1
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

In [4]:
import numpy as np
import pandas as pd

## METHOD 1 : Using Requests to fetch using Reddit API - OAuth Flow

In [5]:
class RedditAPI:
    """ 
    Class for implementing METHOD 1 (raw Reddit OAuth "code" flow).

    The flow is: build the authorisation url, drive a headless Chrome through
    the login and consent pages to obtain the one-time code, then exchange
    that code for a bearer token.
      
    Attributes: 
        CLIENT_ID (str): client id taken from reddit app 
        CLIENT_SECRET (str): client secret taken from reddit app 
        REDIRECT_URI (str): client redirect url taken from reddit app 
        USERNAME (str): reddit account name typed into the login form
        PASSWORD (str): reddit account password typed into the login form
        
        * All things taken from dev account : https://www.reddit.com/prefs/apps/
    """

    # OAuth scopes requested from Reddit; joined with single spaces when the
    # authorisation url is built.
    SCOPES = ("identity", "edit", "flair", "history", "modconfig", "modflair",
              "modlog", "modposts", "modwiki", "mysubreddits", "privatemessages",
              "read", "report", "save", "submit", "subscribe", "vote",
              "wikiedit", "wikiread")

    # NOTE(security): the default USERNAME/PASSWORD are kept only so existing
    # three-argument callers behave as before; pass your own values instead of
    # relying on credentials committed to source.
    def __init__(self, CLIENT_ID, CLIENT_SECRET, REDIRECT_URI,
                 USERNAME="ououmua_2", PASSWORD="17031999"):
        """ 
        Does the initialisation part for reddit requests

        Parameters: 
            CLIENT_ID (str): client id taken from reddit app 
            CLIENT_SECRET (str): client secret taken from reddit app 
            REDIRECT_URI (str): client redirect url taken from reddit app 
            USERNAME (str): reddit account name used for the automated login
            PASSWORD (str): reddit account password used for the automated login

        Returns: 
            None
  
        """
        self.CLIENT_ID = CLIENT_ID
        self.CLIENT_SECRET = CLIENT_SECRET
        self.REDIRECT_URI = REDIRECT_URI
        self.USERNAME = USERNAME
        self.PASSWORD = PASSWORD
    
    def make_authorization_url(self):
        """ 
        helps in creating the complete reddit OAuth url

        Parameters: 
            None

        Returns: 
            url (str) : the reddit url for OAuth authentication.
  
        """
        state = str(uuid4())
        params = {"client_id": self.CLIENT_ID,
                  "response_type": "code",
                  "state": state,
                  "redirect_uri": self.REDIRECT_URI,
                  "duration": "temporary",
                  # exactly one space between scopes (a line continuation inside
                  # the string literal would embed the source indentation)
                  "scope": " ".join(self.SCOPES)}
        url = "https://ssl.reddit.com/api/v1/authorize?" + urllib.parse.urlencode(params)
        return url

    @staticmethod
    def _parse_redirect_url(redirect_url):
        """ 
        Extracts the OAuth state and code from the redirect url query string

        Parameters: 
            redirect_url (str) : url the browser landed on after consent

        Returns: 
            (state, code) (tuple of str) : values of the `state` and `code` query parameters

        Raises:
            ValueError : when either parameter is missing (e.g. access was denied)
  
        """
        # Parsing the query (instead of splitting on "=") is independent of the
        # parameter order and ignores any url fragment.
        query = urllib.parse.parse_qs(urllib.parse.urlsplit(redirect_url).query)
        try:
            return query["state"][0], query["code"][0]
        except KeyError as error:
            raise ValueError(f"No OAuth state/code found in redirect url: {redirect_url}") from error
    
    def auth_retreive_code(self):
        """ 
        Does retrievel of OAuth code by automating the login and consent pages

        Parameters: 
            None

        Returns: 
            (state, code) (tuple of str) : state and code taken from the redirect url,
            or None when any step of the browser automation fails.
  
        """
        auth_url = self.make_authorization_url()
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--ignore-ssl-errors')
        options.add_argument('--disable-notifications')
        # initiating driver for chrome
        driver = webdriver.Chrome(executable_path='chromedriver.exe', options=options)
        try:
            driver.get(auth_url)
            print("Opened webpage..")
            # implicitly_wait sets how long element look-ups keep polling
            # before giving up; it does not pause the script by itself.
            driver.implicitly_wait(random.randint(10, 20))
            driver.find_element(By.NAME, "username").send_keys(self.USERNAME)
            driver.implicitly_wait(random.randint(10, 20))
            driver.find_element(By.NAME, "password").send_keys(self.PASSWORD)
            driver.implicitly_wait(random.randint(10, 20))
            print("Logged in....")
            driver.find_element(By.XPATH, '/html/body/div/div/div[2]/div/form/div/fieldset[5]/button').click()
            driver.implicitly_wait(random.randint(30, 50))
            driver.find_element(By.XPATH, '/html/body/div[3]/div/div[2]/form/div/input[1]').click()
            print("Got the code.....")
            return self._parse_redirect_url(driver.current_url)
        except Exception as error:
            print("Failing to get code...")
            print(f"Reason: {error!r}")
            return None
        finally:
            # always end the browser session, also on failure
            driver.quit()
            
    def get_token(self):
        """ 
        Does retrievel of OAuth access tokens

        Parameters: 
            self

        Returns: 
            token (str) : access_token needed for authentication to reddit API's

        Raises:
            RuntimeError : when no OAuth code could be obtained, or the token
                           response carries no access_token
  
        """
        result = self.auth_retreive_code()
        if result is None:
            raise RuntimeError("Could not obtain an OAuth code from the browser flow")
        _state, code = result
        headers = {'User-Agent': 'Mozilla/5.0'}
        client_auth = requests.auth.HTTPBasicAuth(self.CLIENT_ID, self.CLIENT_SECRET)
        post_data = {"grant_type": "authorization_code",
                     "code": code,
                     "redirect_uri": self.REDIRECT_URI}
        response = requests.post("https://ssl.reddit.com/api/v1/access_token",
                                 auth=client_auth,
                                 data=post_data, headers=headers)
        # surface HTTP errors here instead of failing later on a missing key
        response.raise_for_status()

        token_json = response.json()
        if 'access_token' not in token_json:
            raise RuntimeError(f"Reddit did not return an access token: {token_json}")
        return token_json['access_token']       

In [6]:
# Reddit app credentials (masked placeholders) and the redirect url
# registered for the app.
CLIENT_ID, CLIENT_SECRET = "##########", "*********"
REDIRECT_URI = "http://localhost:8080"

# Run the OAuth flow once and keep the resulting bearer token.
r = RedditAPI(CLIENT_ID, CLIENT_SECRET, REDIRECT_URI)
auth_token = r.get_token()

Opened webpage..
Logged in....
Got the code.....


In [7]:
# Headers for the authenticated calls: the OAuth bearer token plus a User-Agent.
auth_headers = {"Authorization": f"bearer {auth_token}", "User-Agent": "Mozilla/5.0"}

In [8]:
# Echo the headers dict (notebook display).
auth_headers

{'Authorization': 'bearer 347754727638-_vZbh2JhEtbTqdCLTYPLiJrsSlk',
 'User-Agent': 'Mozilla/5.0'}

In [9]:
# Query-string parameters for the search request: search term, result cap
# and sort order.
params = {
    'q': 'india',
    'limit': '10',
    'sort': 'hot',
}

In [10]:
# Echo the search parameters (notebook display).
params

{'q': 'india', 'limit': '10', 'sort': 'hot'}

In [11]:
# Authenticated search request; `params` is sent as the query string.
# NOTE(review): the path segment is the literal word "subreddit" - confirm
# whether a concrete subreddit name was intended here.
# NOTE(review): no timeout or status check is applied to this request.
response = requests.get("https://oauth.reddit.com/r/subreddit/search", headers=auth_headers, params=params)

In [12]:
# this is a nested list containing first item as title and second as flair
# The JSON body is decoded once, and every returned child is used, so a
# response with fewer results than requested no longer raises IndexError.
data_items = [
    [child['data']['title'], child['data']['link_flair_text']]
    for child in response.json()['data']['children']
]

In [13]:
# Turn the [title, flair] pairs into a two-column dataframe.
data = pd.DataFrame(data_items, columns=['title', 'flair'])  # converting above created list into dataframe

In [14]:
# Echo the title/flair dataframe (notebook display).
data

Unnamed: 0,title,flair
0,Scientists in India discovered a new snake spe...,Animal Science
1,Ronaldo visits India during lockdown,
2,The Idea of India is too great for little mind...,OC है बे!
3,Never in my life I could have imagined that in...,Virat Hindu
4,"The Jatayu Earth center in kerala, India. Worl...",
5,Bar Dancer in India,Butthurt OP
6,Pakistani Twitter account posing as Omani prin...,Politics
7,Scientists in India discoverd a new snake spec...,News
8,Stranded Oil Tankers around the world (Credit:...,
9,Finally calling China’s bluff: India’s decisio...,Policy/Economy


##### AS WE CAN SEE, MAKING RAW OAuth REQUESTS INVOLVES MANY ERROR-PRONE DETAILS THAT WE HAVE TO HANDLE OURSELVES, SO FROM HERE ON WE USE A SIMPLE-TO-USE WRAPPER - PRAW (PYTHON REDDIT API WRAPPER)

## METHOD 2 : Using standard API Wrapper - PRAW

In [15]:
# using packages
import praw
from praw.models import MoreComments

In [284]:
class RedditParse:
    """ 
    Class for implementing METHOD 2 , using PRAW WRAPPER
      
    Attributes: 
        url (str): the subreddit url  
        headers (dict): headers containing user agent, and other optional args as part of requests
        CLIENT_ID (str): client id taken from reddit app
        CLIENT_SECRET (str) : client secret taken from reddit app
        REDIRECT_URI (str) : client redirect url taken from reddit app
        
    """

    # Column order of the dataframe built by get_subreddit_with_flair.
    COLUMNS = ['id', 'name', 'title', 'score', 'subreddit', 'url', 'num_comments', 'selftext', 'author',
               'num_crossposts', 'over_18', 'permalink', 'pinned', 'subreddit_type', 'link_flair_text',
               'author_flair_text', 'total_awards_received', 'upvote_ratio', 'time_created', 'comment']

    def __init__(self, url, headers, CLIENT_ID, CLIENT_SECRET, REDIRECT_URI):
        """ 
        Does the initialisation part for reddit requests

        Parameters: 
            url (str): the subreddit url  
            headers (dict): headers containing user agent, and other optional args as part of requests
            CLIENT_ID (str): client id taken from reddit app
            CLIENT_SECRET (str) : client secret taken from reddit app
            REDIRECT_URI (str) : client redirect url taken from reddit app

        Returns: 
            None
  
        """
        self.url = url 
        self.headers = headers
        self.CLIENT_ID = CLIENT_ID
        self.CLIENT_SECRET = CLIENT_SECRET
        self.REDIRECT_URI = REDIRECT_URI

    @staticmethod
    def _resolve_user_agent(headers):
        """ 
        Picks the user agent string out of a requests-style headers dict

        Parameters: 
            headers (dict or str) : headers dict, or an already plain user agent

        Returns: 
            user_agent : the 'User-Agent' entry when `headers` is a dict that has one,
                         otherwise `headers` unchanged.
  
        """
        if isinstance(headers, dict) and 'User-Agent' in headers:
            return headers['User-Agent']
        return headers
            
    def get_subreddit_with_flair(self, limit, topic, category, dict_ids):
        """ 
        Does fetching of subreddit detailed info and builds a dataframe from it

        Parameters: 
            limit (int) : maximum number of results to return 
            topic (str) : subreddit topic to query for
            category (str) : 'hot' or 'new'
            dict_ids (dict) : None when there is no existing dict_ids , or already existing dict_ids containing keys as 
                            id of returned result and values as frequency of how many time that id (result) already exists.
                            A passed dict is updated in place.

        Returns: 
            data (dataframe): one row per post whose id was not yet in dict_ids, with the columns listed in COLUMNS
                
            dict_ids (dict): dict_ids containing keys as 
                            id of returned result and values as frequency of how many time that id (result) already exists.

        Raises:
            ValueError : when category is neither 'hot' nor 'new'
  
        """
        # Validate first: an unknown category used to surface only later as an
        # unbound-variable error.
        if category not in ('hot', 'new'):
            raise ValueError(f"category must be 'hot' or 'new', got {category!r}")

        data_items = []
        if dict_ids is None:
            dict_ids = {}
            print("Dict_ids empty, created new...")
        else:
            print("Found already dict_ids created, using it..")
            
        reddit = praw.Reddit(client_id=self.CLIENT_ID, client_secret=self.CLIENT_SECRET,
                             user_agent=self._resolve_user_agent(self.headers))
        print("Fetching subreddit flair info...")
        subreddit = reddit.subreddit(topic)
        top_posts = subreddit.hot(limit=limit) if category == 'hot' else subreddit.new(limit=limit)
            
        with tqdm(total=limit, position=0, leave=True) as pbar:
            # looping over all returned posts for a topic on reddit 
            for post in top_posts:
                # already seen: only count it, and skip reading its comments
                if post.id in dict_ids:
                    dict_ids[post.id] += 1
                    pbar.update(1)
                    continue

                # concatenating every top level comment, each prefixed by a space
                bodies = [top_level_comment.body for top_level_comment in post.comments
                          if not isinstance(top_level_comment, MoreComments)]
                comment = " " + "".join(" " + body for body in bodies)

                dict_ids[post.id] = 1
                # NOTE: fromtimestamp without a tz renders the epoch value in the
                # machine's local timezone.
                time_created = datetime.datetime.fromtimestamp(int(post.created_utc)).strftime('%Y-%m-%d %H:%M:%S')
                data_items.append([post.id, post.name, post.title, post.score, post.subreddit, 
                                   post.url, post.num_comments, post.selftext, post.author,
                                   post.num_crossposts, post.over_18, post.permalink, post.pinned, post.subreddit_type, 
                                   post.link_flair_text, post.author_flair_text, post.total_awards_received, post.upvote_ratio, 
                                   time_created, comment[:1000]])
                pbar.update(1)

        print("Building Dataframe...")
        data = pd.DataFrame(data_items, columns=self.COLUMNS)
        return data, dict_ids
    
    def build_dataframe(self, limit, output_path, topic='all', category='hot', dict_ids=None):
        """ 
        Does helps in building and saving the output file from dataframe fetched from get_subreddit_with_flair method. 

        Parameters: 
            limit (int) : maximum number of results to return 
            output_path (str) : csv file to write; an existing file is replaced
            topic (str) : subreddit topic to query for
            category (str) : 'hot' or 'new'
            dict_ids (dict) : None when there is no existing dict_ids , or already existing dict_ids containing keys as 
                            id of returned result and values as frequency of how many time that id (result) already exists.

        Returns: 
            dict_ids (dict): the updated id -> frequency mapping (the dataframe itself is only written to disk)
  
        """
        
        data, dict_ids = self.get_subreddit_with_flair(limit, topic, category, dict_ids)
        # to_csv opens the target in write mode, so a previous file is replaced
        # without having to remove it first.
        data.to_csv(output_path, index=False)
        print(f"File written to {output_path}")
        return dict_ids

In [198]:
# Target subreddit url and the request headers handed to RedditParse.
url = "https://www.reddit.com/r/india/"
headers = {'User-Agent': 'Mozilla/5.0'}

# Reddit app credentials (masked placeholders) and the app's redirect url.
CLIENT_ID, CLIENT_SECRET = "#########", "***************"
REDIRECT_URI = "http://localhost:8080"

In [298]:
# Create the scraper, then dump the 'new' listing of r/india to csv and keep
# the returned id map so later runs can skip posts already collected.
r = RedditParse(
    url=url,
    headers=headers,
    CLIENT_ID=CLIENT_ID,
    CLIENT_SECRET=CLIENT_SECRET,
    REDIRECT_URI=REDIRECT_URI,
)
print("BUILDING 'NEW' DATA")
dict_ids = r.build_dataframe(
    limit=1000,
    output_path='C:/Users/DELL/Desktop/reddit_data_classifier_new.csv',
    topic='india',
    category='new',
)

  0%|                                                                                         | 0/1000 [00:00<?, ?it/s]

BUILDING 'NEW' DATA
Dict_ids empty, created new...
Fetching subreddit flair info...


 87%|████████████████████████████████████████████████████████████████████▍          | 866/1000 [11:42<01:48,  1.23it/s]

Building Dataframe...
File written to C:/Users/DELL/Desktop/reddit_data_classifier_new.csv





In [299]:
# Dump the 'hot' listing, passing the id map from the previous run so posts
# already collected are only counted, not written again.
print("BUILDING 'HOT' DATA")
dict_ids = r.build_dataframe(
    limit=1000,
    output_path='C:/Users/DELL/Desktop/reddit_data_classifier_hot.csv',
    topic='india',
    category='hot',
    dict_ids=dict_ids,
)

  0%|                                                                                         | 0/1000 [00:00<?, ?it/s]

BUILDING 'HOT' DATA
Found already dict_ids created, using it..
Fetching subreddit flair info...


 77%|█████████████████████████████████████████████████████████████▏                 | 774/1000 [10:48<03:09,  1.19it/s]

Building Dataframe...
File written to C:/Users/DELL/Desktop/reddit_data_classifier_hot.csv





In [312]:
# Download the 'hot' listing again for a separate analysis; no id map is
# passed, so this run starts from an empty one.
print("BUILDING 'HOT' DATA")
dict_ids_hot_new = r.build_dataframe(
    limit=1000,
    output_path='C:/Users/DELL/Desktop/reddit_data_classifier_hot_mod.csv',
    topic='india',
    category='hot',
)

  0%|                                                                                         | 0/1000 [00:00<?, ?it/s]

BUILDING 'HOT' DATA
Dict_ids empty, created new...
Fetching subreddit flair info...


 78%|█████████████████████████████████████████████████████████████▍                 | 777/1000 [12:04<03:27,  1.07it/s]

Building Dataframe...
File written to C:/Users/DELL/Desktop/reddit_data_classifier_hot_mod.csv





In [304]:
# Collect every file with the given extension from the current working
# directory; glob.glob already returns a list.
extension = 'csv'
all_filenames = glob.glob(f'*.{extension}')

In [305]:
# Echo the matched csv file names (notebook display).
all_filenames

['reddit_data_classifier_hot.csv', 'reddit_data_classifier_new.csv']

In [306]:
# Read every matched csv and stack the frames row-wise; the per-file row
# indexes are kept as they are (no ignore_index).
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames ])

In [307]:
# Echo (rows, columns) of the combined frame (notebook display).
combined_csv.shape

(884, 20)

In [308]:
# Write the combined frame without the index column, encoded as utf-8-sig.
combined_csv.to_csv( "C:/Users/DELL/Desktop/reddit_india_classifier_combined_data.csv", index=False, encoding='utf-8-sig')

In [309]:
# Read the combined csv as raw bytes and let chardet guess its encoding.
# NOTE(review): the path is relative while the file was written to an
# absolute Desktop path - presumably the working directory is that folder.
with open('reddit_india_classifier_combined_data.csv', 'rb') as f:
    result = chardet.detect(f.read())
    print(result)

{'encoding': 'UTF-8-SIG', 'confidence': 1.0, 'language': ''}


In [355]:
# Number of distinct post ids recorded in dict_ids (notebook display).
len(dict_ids)

884

## METHOD 3 : PUSHSHIFT API 

### COLLECTING MONTH WISE REDDIT INDIA DATA TITLE + FLAIRS 

In [430]:
import json
def getPushshiftData(after, before, sub):
    """ 
        function for implementing METHOD 3, Helps in fetching reddit submissions using PushShift API

        Parameters: 
            after (str) : results after which data is to be fetched (epoch UNIX time)
            before (str) : results before which data is to be fetched (epoch UNIX time)
            sub (str) : this is the name of subreddit

        Returns: 
            data (list) : the 'data' entry of the decoded JSON response, one item per submission

        Raises:
            requests.HTTPError : when the API answers with an error status
  
        """
    from urllib.parse import urlencode  # local import keeps this cell self-contained

    # urlencode escapes the values; the parameter order (and therefore the
    # printed url for plain values) is the same as the hand-built string.
    query = urlencode({'title': '', 'size': 1000, 'after': after, 'before': before, 'subreddit': sub})
    url = 'https://api.pushshift.io/reddit/search/submission/?' + query
    print(url)
    r = requests.get(url)
    # fail on an HTTP error instead of trying to decode an error page as JSON
    r.raise_for_status()
    data = json.loads(r.text)
    return data['data']

In [510]:
# Column names for the monthly submission dataframes, in the order in which
# get_df emits the values of each row.
columns = [
    'id', 'title', 'author', 'author_flair_text', 'created_utc',
    'full_link', 'num_comments', 'num_crossposts', 'over_18', 'permalink',
    'url', 'total_awards_received', 'pinned', 'score', 'flair',
]

In [511]:
# One request per month for r/india submissions. Each pair of integers is an
# (after, before) window in Unix epoch seconds, e.g. 1577882040 is
# 2020-01-01 12:34:00 UTC and 1580474040 is 2020-01-31 12:34:00 UTC.
full_data_jan = getPushshiftData(1577882040, 1580474040, 'india')
full_data_feb = getPushshiftData(1580560440, 1582979640, 'india')
full_data_mar = getPushshiftData(1583066040, 1585658040, 'india')
full_data_apr = getPushshiftData(1585744440, 1588250040, 'india')


https://api.pushshift.io/reddit/search/submission/?title=&size=1000&after=1577882040&before=1580474040&subreddit=india
https://api.pushshift.io/reddit/search/submission/?title=&size=1000&after=1580560440&before=1582979640&subreddit=india
https://api.pushshift.io/reddit/search/submission/?title=&size=1000&after=1583066040&before=1585658040&subreddit=india
https://api.pushshift.io/reddit/search/submission/?title=&size=1000&after=1585744440&before=1588250040&subreddit=india


In [509]:
# Spot-check: score of the January submission at index 910 (notebook display).
full_data_jan[910]['score']

0

In [512]:
def get_df(full_data, limit=1000):
    """ 
        function for getting nested lists of containing all parameters as features (columns) for dataset build dataframe

        Parameters: 
            full_data (list) : submission records (dicts) fetched from PushShift API
            limit (int) : only the first `limit` records are considered (None = all of them)

        Returns: 
            data_items (2-D list) : nested list , contaning 15 columns values named as [id, title, author, author_flair_text, 
            created_utc, full_link, num_comments, num_crossposts, over_18, permalink, url, total_awards_received, pinned, score, link_flair_text].
            Records that lack any of these fields are left out.
  
     """
    fields = ('id', 'title', 'author', 'author_flair_text', 'created_utc', 'full_link',
              'num_comments', 'num_crossposts', 'over_18', 'permalink', 'url',
              'total_awards_received', 'pinned', 'score', 'link_flair_text')

    data_items = []
    # Iterate the records that exist instead of probing 1000 indexes and
    # swallowing the IndexError for every missing one.
    for submission in full_data[:limit]:
        try:
            data_items.append([submission[field] for field in fields])
        except (KeyError, TypeError):
            # incomplete or malformed record: skipped, as before
            continue

    return data_items

In [513]:
# One dataframe per month, built from the flattened submission rows.
df_jan, df_feb, df_mar, df_apr = (
    pd.DataFrame(get_df(month_data), columns=columns)
    for month_data in (full_data_jan, full_data_feb, full_data_mar, full_data_apr)
)

In [515]:
# writing four months data to disk, one csv per month, without the index column
df_jan.to_csv( "C:/Users/DELL/Desktop/reddit_india_jan_data.csv", index=False)
df_feb.to_csv( "C:/Users/DELL/Desktop/reddit_india_feb_data.csv", index=False)
df_mar.to_csv( "C:/Users/DELL/Desktop/reddit_india_mar_data.csv", index=False)
df_apr.to_csv( "C:/Users/DELL/Desktop/reddit_india_apr_data.csv", index=False)

In [402]:
def get_flairs(after, before, sub, flair):
    """ 
        function for getting data items containing info on passed subreddits topic , mainly based on flair type

        Parameters: 
            after (str) : results after which data is to be fetched (epoch UNIX time)
            before (str) : results before which data is to be fetched (epoch UNIX time)
            sub (str) : this is the name of subreddit
            flair (str) : flair text the returned submissions are filtered on

        Returns: 
            data (list) : the 'data' entry of the decoded JSON response from PushShift API

        Raises:
            requests.HTTPError : when the API answers with an error status
  
     """
    from urllib.parse import urlencode  # local import keeps this cell self-contained

    # urlencode escapes the values, so a flair containing spaces, '&', '#' or
    # '/' can no longer break or truncate the query string.
    query = urlencode({'title': '', 'size': 1000, 'after': after, 'before': before,
                       'subreddit': sub, 'link_flair_text': flair})
    url = 'https://api.pushshift.io/reddit/search/submission/?' + query
    print(url)
    r = requests.get(url)
    # fail on an HTTP error instead of trying to decode an error page as JSON
    r.raise_for_status()
    data = json.loads(r.text)
    return data['data']

In [411]:
from csv import writer
 
def append_list_as_row(file_name, list_of_elem, encoding='utf-8'):
    """ 
        function for writing a new row to the end of csv
        
        Parameters: 
            file_name (str) : file on which writing operation is to be done (created when missing)
            list_of_elem (list) : list of all column values of the row
            encoding (str) : text encoding used for the file

        Returns: 
            None
  
     """
    
    # Open file in append mode. The encoding is explicit so that non-ASCII
    # text does not depend on the platform default; newline='' lets the csv
    # module control the line endings.
    with open(file_name, 'a', newline='', encoding=encoding) as write_obj:
        # Create a writer object from csv module
        csv_writer = writer(write_obj)
        # Add contents of list as last row in the csv file
        csv_writer.writerow(list_of_elem)


In [517]:
# Re-point all_filenames at the four monthly csv files, read them (paths are
# relative to the working directory) and stack them row-wise.
all_filenames = ['reddit_india_jan_data.csv', 'reddit_india_feb_data.csv', 'reddit_india_mar_data.csv', 'reddit_india_apr_data.csv']
combined_csv_new = pd.concat([pd.read_csv(f) for f in all_filenames ])
# Echo (rows, columns) of the combined frame (notebook display).
combined_csv_new.shape

(3074, 15)

In [518]:
# Write the combined monthly data without the index column, as utf-8-sig.
# NOTE(review): this is the same output path the earlier combined_csv cell
# writes to, so this call replaces that file - confirm this is intended.
combined_csv_new.to_csv( "C:/Users/DELL/Desktop/reddit_india_classifier_combined_data.csv", index=False, encoding='utf-8-sig')

In [465]:
# Fetch the April window again (same arguments as the full_data_apr call)
# under a separate name.
full_data_april = getPushshiftData(1585744440, 1588250040, 'india')

https://api.pushshift.io/reddit/search/submission/?title=&size=1000&after=1585744440&before=1588250040&subreddit=india


In [466]:
# Ids of every fetched April submission. Iterating the records themselves
# avoids an IndexError when fewer than 1000 were returned.
apr_ids = [submission['id'] for submission in full_data_april]

In [469]:
# Echo the first April submission id (notebook display).
apr_ids[0]

'fsyy5x'

### FETCHING ALL COMMENTS DATA 

In [503]:
def get_comments(after, before, sub):
    """ 
        function for getting detailed info about commments based on specific subreddit topic , on time interval
        
        Parameters: 
            after (str) : results after which data is to be fetched (epoch UNIX time)
            before (str) : results before which data is to be fetched (epoch UNIX time)
            sub (str) : subreddit topic to search for 

        Returns: 
            data (list) : the 'data' entry of the decoded JSON response from PushShift API, all comments data

        Raises:
            requests.HTTPError : when the API answers with an error status
  
     """
    from urllib.parse import urlencode  # local import keeps this cell self-contained

    # urlencode escapes the values; the parameter order (and therefore the
    # printed url for plain values) is the same as the hand-built string.
    query = urlencode({'subreddit': sub, 'size': 1000, 'after': after, 'before': before})
    url = 'https://api.pushshift.io/reddit/search/comment/?' + query
    print(url)
    r = requests.get(url)
    # fail on an HTTP error instead of trying to decode an error page as JSON
    r.raise_for_status()
    data = json.loads(r.text)
    return data['data']

In [520]:
# One comment request per month for r/india, using the same (after, before)
# epoch windows as the submission requests.
comments_jan = get_comments(1577882040, 1580474040, 'india')
comments_feb = get_comments(1580560440, 1582979640, 'india')
comments_mar = get_comments(1583066040, 1585658040, 'india')
comments_apr = get_comments(1585744440, 1588250040, 'india')

https://api.pushshift.io/reddit/search/comment/?subreddit=india&size=1000&after=1577882040&before=1580474040
https://api.pushshift.io/reddit/search/comment/?subreddit=india&size=1000&after=1580560440&before=1582979640
https://api.pushshift.io/reddit/search/comment/?subreddit=india&size=1000&after=1583066040&before=1585658040
https://api.pushshift.io/reddit/search/comment/?subreddit=india&size=1000&after=1585744440&before=1588250040


In [521]:
# Column names for the monthly comment dataframes, in the order in which
# get_comment_month emits the values of each row.
comments_list = [
    'id', 'parent_id', 'link_id', 'body', 'author', 'author_flair_text',
    'author_fullname', 'permalink', 'created_utc', 'score',
    'total_awards_received',
]

In [524]:
def get_comment_month(comment_data, limit=1000):
    """ 
        function for getting nested lists of containing all parameters as features (columns) for comments dataset build dataframe
        
        Parameters: 
            comment_data (list) : comment records (dicts) fetched from PushShift API
            limit (int) : only the first `limit` records are considered (None = all of them)

        Returns: 
            data_items (2-D list) : nested list , contaning 11 columns values named as [id, parent_id, link_id, body, 
            author, author_flair_text, author_fullname, permalink, created_utc, score, total_awards_received].
            Records that lack any of these fields are left out.
  
     """
    fields = ('id', 'parent_id', 'link_id', 'body', 'author', 'author_flair_text',
              'author_fullname', 'permalink', 'created_utc', 'score', 'total_awards_received')

    data_items = []
    # Iterate the records that exist instead of probing 1000 indexes and
    # swallowing the IndexError for every missing one.
    for comment in comment_data[:limit]:
        try:
            data_items.append([comment[field] for field in fields])
        except (KeyError, TypeError):
            # incomplete or malformed record: skipped, as before
            continue

    return data_items

In [525]:
# One dataframe per month, built from the flattened comment rows.
com_jan, com_feb, com_mar, com_apr = (
    pd.DataFrame(get_comment_month(month_comments), columns=comments_list)
    for month_comments in (comments_jan, comments_feb, comments_mar, comments_apr)
)

In [527]:
# writing four months data comments to disk, one csv per month, without the index column
com_jan.to_csv( "C:/Users/DELL/Desktop/reddit_india_jan_data_com.csv", index=False)
com_feb.to_csv( "C:/Users/DELL/Desktop/reddit_india_feb_data_com.csv", index=False)
com_mar.to_csv( "C:/Users/DELL/Desktop/reddit_india_mar_data_com.csv", index=False)
com_apr.to_csv( "C:/Users/DELL/Desktop/reddit_india_apr_data_com.csv", index=False)

In [528]:
# Read the four monthly comment csv files (paths are relative to the working
# directory) and stack them row-wise.
all_filenames_comments = ['reddit_india_jan_data_com.csv', 'reddit_india_feb_data_com.csv', 'reddit_india_mar_data_com.csv', 'reddit_india_apr_data_com.csv']
combined_csv_comments = pd.concat([pd.read_csv(f) for f in all_filenames_comments])
# Echo (rows, columns) of the combined frame (notebook display).
combined_csv_comments.shape

(3885, 11)

In [529]:
# this is the combined dataset for four months, written without the index
# column and encoded as utf-8-sig
combined_csv_comments.to_csv( "C:/Users/DELL/Desktop/reddit_india_comments_data.csv", index=False, encoding='utf-8-sig')