# EDA

In [84]:
import pandas as pd
import numpy as np
import time 
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

In [85]:
# read in data from folder in repository
food = pd.read_csv('./Data/food.csv', encoding = 'ISO-8859-1')
cooking = pd.read_csv('./Data/cooking.csv', encoding = 'ISO-8859-1')
keto = pd.read_csv('./Data/keto.csv', encoding = 'ISO-8859-1')
healthy_food = pd.read_csv('./Data/healthy_food.csv', encoding = 'ISO-8859-1')
diy = pd.read_csv('./Data/diy.csv', encoding = 'ISO-8859-1')
data = pd.read_csv('./Data/data.csv', encoding = 'ISO-8859-1')

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [86]:
#check columns
food.columns

Index(['all_awardings', 'allow_live_comments', 'author',
       'author_flair_css_class', 'author_flair_richtext', 'author_flair_text',
       'author_flair_type', 'author_fullname', 'author_patreon_flair',
       'author_premium', 'awarders', 'can_mod_post', 'contest_mode',
       'created_utc', 'domain', 'full_link', 'gildings', 'id',
       'is_crosspostable', 'is_meta', 'is_original_content',
       'is_reddit_media_domain', 'is_robot_indexable', 'is_self', 'is_video',
       'link_flair_background_color', 'link_flair_richtext',
       'link_flair_text_color', 'link_flair_type', 'locked', 'media_only',
       'no_follow', 'num_comments', 'num_crossposts', 'over_18',
       'parent_whitelist_status', 'permalink', 'pinned', 'post_hint',
       'preview', 'pwls', 'removed_by_category', 'retrieved_on', 'score',
       'selftext', 'send_replies', 'spoiler', 'stickied', 'subreddit',
       'subreddit_id', 'subreddit_subscribers', 'subreddit_type', 'thumbnail',
       'thumbnail_height', 

**These features are likely the most helpful in identifying which subreddit to associate a post with. These will be considered the features for initial testing.**

   - 'author': This column may contain several duplicate names which could help identify a post from a specific subreddit as members are likely to post repeatidly
   - 'created_utc': This timestime will simply be kept for interpretability purposes
   - 'id': This unique identify will be kept for reference only
   - 'selftext': this is the main body of the post, this will be tokenized for NLP analysis
   - 'subreddit': this is the subreddit identifier and will be used as the target (y) column when the dataframes are combined <br>
   - 'title': this is the title of the post and will be tokenized for NLP analysis

For this analysis only the 'author','id','selftext','subreddit','title' fields will be used. 

Check for null values in the 'author','id','selftext','subreddit','title' columns.

In [87]:
#create list of datasets
data_sets = [food,cooking,keto,healthy_food,diy,data]
data_sets_names = ['food','cooking','keto','healthy_food','diy','data']

In [88]:
# explore null values for 'title' and 'self text'
count = 0
for item in data_sets:
    body_nulls = item['selftext'].isnull().sum()
    title_nulls = item['title'].isnull().sum()
    length = len(item)
    name = data_sets_names[count]
    count +=1
    print(f" In {name} there are {body_nulls} null submissions and {title_nulls} null titles {(body_nulls+ title_nulls)/length}% of the data")
    print()



 In food there are 19566 null submissions and 0 null titles 0.9783% of the data

 In cooking there are 1381 null submissions and 0 null titles 0.06905% of the data

 In keto there are 314 null submissions and 0 null titles 0.0157% of the data

 In healthy_food there are 5495 null submissions and 0 null titles 0.27475% of the data

 In diy there are 10448 null submissions and 0 null titles 0.5224% of the data

 In data there are 5560 null submissions and 0 null titles 0.278% of the data



There appear to be no null title values. The food dataset has over 97% null 'selftext' (submission) nulls. It will be excluded from the analysis. 

In [89]:
#remove the food dataset
data_sets.remove(food)
data_sets_names.remove('food')

In [90]:
data_sets_names

['cooking', 'keto', 'healthy_food', 'diy', 'data']

In [91]:
# create new data frames with only columns of interest

cooking = cooking[['author','id','selftext','subreddit','title']]
keto = keto[['author','id','selftext','subreddit','title']]
healthy_food = healthy_food[['author','id','selftext','subreddit','title']]
diy = diy[['author','id','selftext','subreddit','title']]
data = data[['author','id','selftext','subreddit','title']]

In [92]:
# check the changes were applied
cooking.head()

Unnamed: 0,author,id,selftext,subreddit,title
0,YurkeyYacon,ewbxxh,If you make:\n\n1. A dish with some amount of ...,Cooking,Does infusing oil make the herb flavor stronge...
1,truedef,ewbvum,Will a dutch baby in an enameled cast iron ves...,Cooking,Dutch baby in enameled cast iron
2,Roupert2,ewbi6w,Mine is an 8 cup glass measuring cup. I had be...,Cooking,What was your favorite kitchen purchase of 2019?
3,jewelsbyjules,ewbetu,[removed],Cooking,Dinner ideas for a foreign guest
4,CholoFlakes,ewbdkh,I will cook dinner for my family tomorrow and ...,Cooking,Best dishes to serve alongside some chili con ...


In [93]:
# check all nulls
cooking.isnull().sum()

author          0
id              0
selftext     1381
subreddit       0
title           0
dtype: int64

In [94]:
keto.isnull().sum()

author         0
id             0
selftext     314
subreddit      0
title          0
dtype: int64

In [95]:
healthy_food.isnull().sum()

author          0
id              0
selftext     5495
subreddit       0
title           0
dtype: int64

In [96]:
diy.isnull().sum()

author           0
id               0
selftext     10448
subreddit        0
title            0
dtype: int64

In [97]:
data.isnull().sum()

author          0
id              0
selftext     5560
subreddit       0
title           0
dtype: int64

In [98]:
#recreate list of datasets with the updates
data_sets = [cooking,keto,healthy_food,diy,data]
data_sets_names = ['cooking','keto','healthy_food','diy','data']

There only appears to be null values in the 'selftext' columns. All rows with a null value in the 'selftext' column will be dropped. This will result in complete sets of data sets. 

In [99]:
# drop all rows containing a null values (only impacts the'selftext' column)
for item in data_sets:
    item.dropna(inplace = True)

In [100]:
#check to see if nulls were dropped
cooking.isnull().sum()

author       0
id           0
selftext     0
subreddit    0
title        0
dtype: int64

In [101]:
data.isnull().sum()

author       0
id           0
selftext     0
subreddit    0
title        0
dtype: int64

In [102]:
#check the new lengths of the datasets
for item in data_sets:
    print(len(item))

18619
19686
14505
9552
14440


In [103]:
#check for 'removed' or 'deleted' selftext columns

for item in data_sets:
    print(f"[removed] {len(item[item['selftext'] == '[removed]'])}")
    print(f"[deleted] {len(item[item['selftext'] == '[deleted]'])}")

[removed] 4434
[deleted] 27
[removed] 5262
[deleted] 37
[removed] 1356
[deleted] 302
[removed] 1256
[deleted] 82
[removed] 2392
[deleted] 17


Note: There are multiple 'deleted' and 'removed' selftext' values. When the 'selftext' column is tokenized it may confuse 'removed' and 'deleted' as actual posts and likely associate them with one another. Rows containing 'removed' or 'deleted' posts will be dropped from all dataframes.

In [104]:
# drop all rows containing 'deleted' and 'removed' in the 'selftext' column

for item in data_sets:
    remove = list(item[item['selftext'] == '[removed]'].index)
    item.drop(index = remove, inplace = True)
    deleted = list(item[item['selftext'] == '[deleted]'].index)
    item.drop(index = deleted, inplace = True)

In [106]:
#check that the rows were dropped

for item in data_sets:
    print(f"[removed] {len(item[item['selftext'] == '[removed]'])}")
    print(f"[deleted] {len(item[item['selftext'] == '[deleted]'])}")

[removed] 0
[deleted] 0
[removed] 0
[deleted] 0
[removed] 0
[deleted] 0
[removed] 0
[deleted] 0
[removed] 0
[deleted] 0


### The data sets have all been successfully cleaned. They will now be combined to begin NLP preprocessing.
The DIY subset of data will be removed prior to modeling, it is being added here to simplify the preprossing steps.

In [107]:
# add all three dataset into one dataframe 
main = pd.concat(data_sets)

In [108]:
#check to ensure merge worked
main.head()

Unnamed: 0,author,id,selftext,subreddit,title
0,YurkeyYacon,ewbxxh,If you make:\n\n1. A dish with some amount of ...,Cooking,Does infusing oil make the herb flavor stronge...
1,truedef,ewbvum,Will a dutch baby in an enameled cast iron ves...,Cooking,Dutch baby in enameled cast iron
2,Roupert2,ewbi6w,Mine is an 8 cup glass measuring cup. I had be...,Cooking,What was your favorite kitchen purchase of 2019?
4,CholoFlakes,ewbdkh,I will cook dinner for my family tomorrow and ...,Cooking,Best dishes to serve alongside some chili con ...
5,Tanstorm,ewba8g,"been having trouble with things sticking, kind...",Cooking,Problem with stainless steel sticking Eggs or ...


In [109]:
# check the new DF size
main.shape

(61637, 5)

In [110]:
#check for any remaining nulls
main.isnull().sum()

author       0
id           0
selftext     0
subreddit    0
title        0
dtype: int64

In [111]:
data_sets_names

['cooking', 'keto', 'healthy_food', 'diy', 'data']

Create value indicators for each subreddit. 
   - Cooking: 0
   - Keto: 1
   - EatCheapAndHealthy: 2
   - DIY: 3
   - DataScience: 4

In [113]:
main['subreddit'].value_counts()

keto                  14387
Cooking               14158
EatCheapAndHealthy    12847
datascience           12031
DIY                    8214
Name: subreddit, dtype: int64

In [115]:
# encode the subreddits using data dictionary

subreddit = {'subreddit': {
                            'Cooking' :0,
                            'keto' : 1,
                            'EatCheapAndHealthy' : 2, 
                            'DIY' : 3,
                            'datascience' : 4
                        }
                        }
main.replace(subreddit, inplace = True)

In [116]:
main['subreddit'].value_counts()

1    14387
0    14158
2    12847
4    12031
3     8214
Name: subreddit, dtype: int64

In [117]:
main['subreddit'].value_counts(normalize=True)

1    0.233415
0    0.229700
2    0.208430
4    0.195191
3    0.133264
Name: subreddit, dtype: float64

In [118]:
main.head()

Unnamed: 0,author,id,selftext,subreddit,title
0,YurkeyYacon,ewbxxh,If you make:\n\n1. A dish with some amount of ...,0,Does infusing oil make the herb flavor stronge...
1,truedef,ewbvum,Will a dutch baby in an enameled cast iron ves...,0,Dutch baby in enameled cast iron
2,Roupert2,ewbi6w,Mine is an 8 cup glass measuring cup. I had be...,0,What was your favorite kitchen purchase of 2019?
4,CholoFlakes,ewbdkh,I will cook dinner for my family tomorrow and ...,0,Best dishes to serve alongside some chili con ...
5,Tanstorm,ewba8g,"been having trouble with things sticking, kind...",0,Problem with stainless steel sticking Eggs or ...


In [119]:
# export the cleaned data to the datafolder 

main.to_csv('./Data/5_reddit_clean_data.csv', index = False)

## Continue to next notebook 'Modeling'