### Import Python Libraries

In [1]:
import pandas as pd

### Read in the data files

In [2]:
# merge Sherlock and Poirot data
holmes = pd.read_csv('./datasets/holmes.csv')

In [3]:
# merge Sherlock and Poirot data
poirot = pd.read_csv('./datasets/poirot.csv')

### Eye ball the data

In [4]:
holmes.head()

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_patreon_flair,author_premium,...,updated_utc,gilded,rte_mode,brand_safe,author_created_utc,suggested_sort,mod_reports,user_reports,parent_whitelist_status,whitelist_status
0,[],False,antdude,,[],,text,t2_4a27h,False,False,...,,,,,,,,,,
1,[],False,LowellAdams_61,,[],,text,t2_95v9d3o1,False,False,...,,,,,,,,,,
2,[],False,euphemiarise,,[],,text,t2_9d6kycs2,False,False,...,,,,,,,,,,
3,[],False,heynow345,,[],,text,t2_1oifygcn,False,False,...,,,,,,,,,,
4,[],False,AlanCummings243,,[],,text,t2_9982pbon,False,False,...,,,,,,,,,,


### Merge Poirot and Holmes data set

In [5]:
result = pd.concat([poirot , holmes])

In [6]:
result.columns

Index(['all_awardings', 'allow_live_comments', 'author',
       'author_flair_css_class', 'author_flair_richtext', 'author_flair_text',
       'author_flair_type', 'author_fullname', 'author_patreon_flair',
       'author_premium', 'awarders', 'can_mod_post', 'contest_mode',
       'created_utc', 'domain', 'full_link', 'gildings', 'id',
       'is_crosspostable', 'is_meta', 'is_original_content',
       'is_reddit_media_domain', 'is_robot_indexable', 'is_self', 'is_video',
       'link_flair_background_color', 'link_flair_richtext',
       'link_flair_text_color', 'link_flair_type', 'locked', 'media_only',
       'no_follow', 'num_comments', 'num_crossposts', 'over_18',
       'parent_whitelist_status', 'permalink', 'pinned', 'pwls',
       'retrieved_on', 'score', 'selftext', 'send_replies', 'spoiler',
       'stickied', 'subreddit', 'subreddit_id', 'subreddit_subscribers',
       'subreddit_type', 'thumbnail', 'title', 'total_awards_received',
       'treatment_tags', 'upvote_ratio',

### Lets create a dataframe focussing on columns we need

In [7]:

df_h_p = result[['subreddit','title', 'selftext']]

### Data Cleaning

In [8]:
# identify NAN columns
df_h_p.isnull().sum()

subreddit      0
title          0
selftext     433
dtype: int64

In [9]:
# number of records per Subreddit
df_h_p.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 791 entries, 0 to 487
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   subreddit  791 non-null    object
 1   title      791 non-null    object
 2   selftext   358 non-null    object
dtypes: object(3)
memory usage: 24.7+ KB


### Number of NaN records per Subreddit

In [10]:
df_h_p[df_h_p['subreddit']=='SherlockHolmes']['selftext'].isna().sum()

285

In [11]:
df_h_p[df_h_p['subreddit']=='poirot']['selftext'].isna().sum()

148

### Decision: Since the SelfText has a lot of NaN values , we will combine the SelfText and Title and populate the column Title with new value and use it for analysis

In [12]:
df_h_p['title1'] = df_h_p['title'] + (' '+ df_h_p['selftext']).fillna('')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_h_p['title1'] = df_h_p['title'] + (' '+ df_h_p['selftext']).fillna('')


In [13]:
df_h_p.isna().sum()

subreddit      0
title          0
selftext     433
title1         0
dtype: int64

### Lets change the subreddit columns from SherlockHolmes and Poirot to 1 and 0

In [14]:
df_h_p['subreddit'].replace(['SherlockHolmes','poirot'],[1,0],inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


In [15]:
df_h_p['title'] = df_h_p['title1']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_h_p['title'] = df_h_p['title1']


In [16]:
df_h_p.drop(columns = 'title1', inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [17]:
df_h_p.head()

Unnamed: 0,subreddit,title,selftext
0,0,Plz tell me Where do people watch poirot? I am...,Where do people watch poirot? I am from norway...
1,0,"My day started off a bit too similar ""Hercule ...",I was planning on watching the Suchet Christma...
2,0,Christmas day tradition!,
3,0,Happy holidays!,
4,0,What is your favorite most random scenes from ...,


In [18]:
# export the file to CSv
df_h_p.to_csv("./datasets/data_for_eda.csv", index=False)