## Feature Engineering
- `is_poll` flag
- sentiment labeling
- `other_platform` mentioned tag

In [1]:
import os
import json
import numpy as np
import pandas as pd
import nltk

In [69]:
posts['subreddit'].unique()

array(['Depop', 'EtsySellers', 'Etsy', 'Flipping', 'Grailed', 'poshmark',
       'stockx'], dtype=object)

In [7]:
comments = pd.DataFrame()
posts = pd.DataFrame()

for file in os.listdir('./Data/comments/csv'):
    temp = pd.read_csv(f'./Data/comments/cleaned/{file}', low_memory=False)
    comments = pd.concat([comments, temp])
    
for file in os.listdir('./Data/posts/csv'):
    temp = pd.read_csv(f'./Data/posts/cleaned/{file}', low_memory=False)
    posts = pd.concat([posts, temp])

posts.dropna(axis=0, subset=['selftext'], inplace=True)
comments.dropna(axis=0, subset=['body'], inplace=True)

comments.reset_index(drop=False, names=['original_index'], inplace=True)
posts.reset_index(drop=False, names=['original_index'], inplace=True)

In [8]:
posts['is_poll'] = posts['poll_data'].apply(lambda x: False if x is np.nan else True)

In [9]:
def other_platform(record) -> bool:
    other_platforms = ['Depop', 'EtsySellers', 'Etsy', 'Flipping', 'Grailed', 'poshmark','stockx', 'Mercari', 'ThredUp', 'TheRealReal', 'TRR', 'Ebay']
    platform = posts.loc[record,'subreddit'].lower()
    other_platforms = [i.lower() for i in other_platforms if i.lower() != platform]
    all_text = str(posts.loc[record,'selftext']) + str(posts.loc[record,'author_flair_text']) + str(posts.loc[record,'poll_data'])
    for n in other_platforms:
        if n in all_text.lower():
            return True
    return False

In [10]:
posts['other_platform'] = [other_platform(x) for x in posts.index]

In [11]:
posts['other_platform'].value_counts()

other_platform
False    152925
True      34876
Name: count, dtype: int64

In [12]:
comments.dtypes

original_index         int64
subreddit_id          object
subreddit             object
subreddit_type        object
id                    object
parent_id             object
link_id               object
author                object
created_utc            int64
score                  int64
controversiality        bool
ups                  float64
likes                  int64
body                  object
author_flair_text     object
created               object
dtype: object

In [13]:
posts.dtypes

original_index        int64
subreddit            object
subreddit_id         object
subreddit_type       object
id                   object
name                 object
media                object
is_video               bool
created_utc           int64
num_comments          int64
score                 int64
ups                   int64
selftext             object
author_flair_text    object
link_flair_text      object
poll_data            object
created              object
is_poll                bool
other_platform         bool
dtype: object

In [14]:
posts.isnull().sum()

original_index            0
subreddit                 0
subreddit_id              0
subreddit_type        30924
id                        0
name                  59413
media                187790
is_video                  0
created_utc               0
num_comments              0
score                     0
ups                       0
selftext                  0
author_flair_text    172600
link_flair_text      102377
poll_data            186613
created                   0
is_poll                   0
other_platform            0
dtype: int64

### Polls
-  `total_votes`
- `options_text`
- `options_weighting`

- Review `flair_tags` and generate further labels in combination with CVEC terms
- Parse `poll_data` for additional features, extract relevant data according to `flair_tags`
- Use comments `author_flair_text` to indicate if someone is a marked buyer/seller

In [16]:
posts['poll_votes'] = posts['poll_data'].apply(lambda x: eval(x)['total_vote_count'] if isinstance(x, str) else -1)

In [17]:
posts[posts.poll_data.isnull()==False]

Unnamed: 0,original_index,subreddit,subreddit_id,subreddit_type,id,name,media,is_video,created_utc,num_comments,score,ups,selftext,author_flair_text,link_flair_text,poll_data,created,is_poll,other_platform,poll_votes
6437,6438,Depop,t5_2y4mt,public,fr5ey3,,,False,1585486194,9,5,-1,I see consistently negative responses towards ...,,,"{'options': [{'id': '83067', 'text': 'Only dro...",2020-03-29 08:49:54,True,False,175
6506,6507,Depop,t5_2y4mt,public,ft0jwq,,,False,1585750493,9,4,-1,"If so or if not, why? I’m considering deleting...",,,"{'options': [{'id': '132565', 'text': 'Yes I d...",2020-04-01 10:14:53,True,False,128
6550,6551,Depop,t5_2y4mt,public,ftsfpj,,,False,1585854657,7,1,-1,Just wondering if anyone has tried setting the...,,,"{'options': [{'id': '152597', 'text': 'Never t...",2020-04-02 15:10:57,True,False,36
6592,6593,Depop,t5_2y4mt,public,fuiiw4,,,False,1585954865,12,4,-1,Like when you post a plain t-shirt and tag it ...,,,"{'options': [{'id': '170461', 'text': 'it’s fi...",2020-04-03 19:01:05,True,False,155
6637,6638,Depop,t5_2y4mt,public,fvgavp,,,False,1586103351,2,3,-1,I keep trying to log onto my page and it keeps...,,,"{'options': [{'id': '193337', 'text': 'Yes', '...",2020-04-05 12:15:51,True,False,71
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185954,8474,stockx,t5_3m7es,public,tslmcn,t3_tslmcn,,False,1648681021,3,0,-1,.\n\n[View Poll](https://www.reddit.com/poll/t...,,Question,"{'is_prediction': False, 'options': [{'id': '1...",2022-03-30 18:57:01,True,False,57
186842,9362,stockx,t5_3m7es,public,wqhv1v,t3_wqhv1v,,False,1660718739,11,0,-1,Thinking of buying some shoes off stockX. Dont...,,Question,"{'is_prediction': False, 'options': [{'id': '1...",2022-08-17 02:45:39,True,False,280
186872,9392,stockx,t5_3m7es,public,wsazfr,t3_wsazfr,,False,1660906576,5,1,-1,Ordered some New Balances off of StockX like 5...,,Question,"{'is_prediction': False, 'options': [{'id': '1...",2022-08-19 06:56:16,True,False,39
187460,9980,stockx,t5_3m7es,public,yqkre5,t3_yqkre5,,False,1668004699,2,0,-1,.\n\n[View Poll](https://www.reddit.com/poll/y...,,Question,"{'is_prediction': False, 'options': [{'id': '1...",2022-11-09 09:38:19,True,False,88


In [18]:
sample_poll_1 = {
    'is_prediction': False, 
    'options': [
        {'id': '20094950', 'text': 'Goat', 'vote_count': 50},
        {'id': '20094951', 'text': 'Stockx', 'vote_count': 51}
        ],
    'prediction_status': None,
    'resolved_option_id': None,
    'total_stake_amount': None, 
    'total_vote_count': 101, 
    'tournament_id': None, 
    'user_selection': None, 
    'user_won_amount': None, 
    'vote_updates_remained': None, 
    'voting_end_timestamp': 1669935211731
    }

sample_poll_2 = {
    'options': [
        {'id': '83067', 
         'text': 'Only dropshipping is bad', 
         'vote_count': 44},
        {'id': '83068', 
         'text': 'Both are bad, but dropshipping significantly worse', 
         'vote_count': 74},
        {'id': '83069',
         'text': 'Both are equally bad',
         'vote_count': 50},
        {'id': '83070',
         'text': 'Neither are bad', 
         'vote_count': 7}
        ], 
    'total_vote_count': 175, 
    'user_selection': None, 
    'voting_end_timestamp': 1585745394498
    }


In [19]:
sample_poll_1['total_vote_count'], sample_poll_2['total_vote_count']

(101, 175)

In [20]:
posts.loc[0,'poll_data'] is np.nan

True

In [21]:
def poll_options(poll_data: str) -> str:
    if isinstance(poll_data, str):
        poll_data = eval(str(poll_data))
        options = [i['text'] for i in poll_data['options']]
        return ', '.join(options)

In [22]:
# posts['poll_options'] = posts['poll_data'].apply(lambda x: poll_options(x) if isinstance(x, str) else None)
posts['poll_options'] = posts['poll_data'].apply(poll_options)

In [23]:
posts['poll_options'].value_counts()

poll_options
Yes, No                                                                       79
yes, no                                                                       12
$1-10, $10-15, $16-20, $21-25, $26-30, $30+                                    5
Goat, Stockx                                                                   4
yes, no, results                                                               4
                                                                              ..
below 50, 50-100, 100-200, above 200                                           1
90’s, 80’s, 70’s-60’s                                                          1
modeled pictures, flat lays, honestly doesn’t matter                           1
Free shipping, % off, Doesn’t matter. If I like the item then I’ll buy it.     1
Or don’t, Reorder                                                              1
Name: count, Length: 1079, dtype: int64

In [24]:
posts['poll_data'].isnull()

0         True
1         True
2         True
3         True
4         True
          ... 
187796    True
187797    True
187798    True
187799    True
187800    True
Name: poll_data, Length: 187801, dtype: bool

### Topic Generation
- Creating list of general topics with simple text processing and `CVEC`

In [25]:
def remove_stopwords(sentence: str) -> str:
    stopwords = nltk.corpus.stopwords.words('english')
    sentence = sentence.split(' ')
    output = [word for word in sentence if word.lower() not in stopwords]
    return ' '.join(output).replace('\n', ' ').strip()

In [27]:
posts['processed_text'] = posts['selftext'].apply(remove_stopwords)

In [30]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

tokenizer = RegexpTokenizer(r'[A-Za-z]+')
lemmatizer = WordNetLemmatizer()
all_stopwords = stopwords.words('english') + [
    'anyone','someone','everyone','everybody',
    'also','you','your','etsy','depop','grailed','ebay',
    'http','www','com','reddit']

def post_processor(sentence: str, tokenizer, lemmatizer, stopwords) -> str:
    tokenized = tokenizer.tokenize(sentence.lower())
    lemmatized = [lemmatizer.lemmatize(i) for i in tokenized]
    no_stopwords = [w for w in lemmatized if w not in all_stopwords]
    return ' '.join(no_stopwords)
    
j = 0
total_posts = len(posts)
cleaned_posts = []
for post in posts['selftext']:
    cleaned_posts.append(post_processor(post, tokenizer, lemmatizer, all_stopwords))

In [60]:
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=2000
)
output = vectorizer.fit_transform(cleaned_posts)
topics_counts = pd.DataFrame(output.toarray(),
                             columns=vectorizer.get_feature_names_out()).sum().sort_values(
                                 ascending=False)

In [63]:
topics_counts.drop(index = [i for i in topics_counts.index if i in topics])[0:20]

wa        121294
like       75056
would      71185
get        68513
seller     58185
know       57267
one        56560
ha         54154
time       53776
day        47516
want       44700
make       42341
new        37224
people     36873
amp        34730
thing      33554
really     33383
even       31720
got        31670
back       31234
dtype: int64

In [65]:
vectorizer_phrases = CountVectorizer(
    analyzer='word',
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=2000,
    ngram_range=(2,3)
)
output_phrases = vectorizer_phrases.fit_transform(cleaned_posts)
phrases_counts = pd.DataFrame(output_phrases.toarray(),
                             columns=vectorizer_phrases.get_feature_names_out()).sum().sort_values(
                                 ascending=False)


In [68]:
phrases_counts[0:20]

feel like          7786
post office        6396
wa wondering       6380
free shipping      5563
would like         5524
shipping label     5220
let know           5026
item wa            4773
week ago           4625
day ago            4432
tracking number    4418
even though        4408
star review        4196
first time         4173
thanks advance     3982
make sure          3979
look like          3830
sold item          3632
month ago          3468
doe know           3371
dtype: int64

In [None]:
topics = [
    'buyer','customer',
    'item','product',
    'sale','sell','selling','sold','purchase','refund','return'
    'money','paypal','offer','price','free','pay','fee','cost',
    'listing','post','order',
    'flipping',
    'account','review','store','shop','business',
    'photo',
    'shipping','ship','sent','package','shipped','label','tracking','send', 'post office', 'free shipping'
    'received',
    'case', 'issue', # perhaps contested shipments or returns?
    'advice','look','looking','search'
]

In [None]:
flair_tags = []
for n in posts['author_flair_text']:
    [flair_tags.append(i) for i in n['link_flair_text'].unique() if i is not np.nan]

In [None]:
flair_tags

In [188]:
posts

Unnamed: 0,original_index,subreddit,subreddit_id,subreddit_type,id,name,media,is_video,created_utc,num_comments,...,ups,selftext,author_flair_text,link_flair_text,poll_data,created,is_poll,other_platform,poll_votes,poll_options
0,0,Depop,t5_2y4mt,,3hpz75,t3_3hpz75,,True,1440080171,0,...,1,I'm new to Depop and someone PMed me in regard...,,,,2015-08-20 10:16:11,0,False,-1,
1,1,Depop,t5_2y4mt,,4d397o,t3_4d397o,,True,1459630313,0,...,1,,,,,2016-04-02 16:51:53,0,False,-1,
2,2,Depop,t5_2y4mt,,53rq3m,t3_53rq3m,,True,1474436117,6,...,2,Basically what payment method would be the eas...,,,,2016-09-21 01:35:17,0,False,-1,
3,3,Depop,t5_2y4mt,,5tyzae,,,True,1487059455,0,...,-1,I accidentally forgot to put in a suite # on t...,,,,2017-02-14 03:04:15,0,False,-1,
4,4,Depop,t5_2y4mt,,5u0w82,,,True,1487087024,0,...,-1,@justinburns901\n\n,,,,2017-02-14 10:43:44,0,False,-1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187803,10316,stockx,t5_3m7es,public,10008cv,t3_10008cv,,False,1672510848,7,...,-1,"I was looking at the prices for Chicago 2s, in...",,Problem,,2022-12-31 13:20:48,0,False,-1,
187804,10317,stockx,t5_3m7es,public,1001h9n,t3_1001h9n,,False,1672514414,7,...,-1,&amp;#x200B;\n\n[wtf is this](https://preview....,,Problem,,2022-12-31 14:20:14,0,False,-1,
187805,10318,stockx,t5_3m7es,public,1002wz2,t3_1002wz2,,False,1672518573,5,...,-1,&amp;#x200B;\n\n[My Bid is 125$ and StockX is ...,,Question,,2022-12-31 15:29:33,0,False,-1,
187806,10319,stockx,t5_3m7es,public,1003i0e,t3_1003i0e,,False,1672520271,3,...,-1,My bid is 125. Other bids are as high as 140 y...,,Problem,,2022-12-31 15:57:51,0,False,-1,


In [193]:
comments

Unnamed: 0,original_index,subreddit_id,subreddit,subreddit_type,id,parent_id,link_id,author,created_utc,score,controversiality,ups,likes,body,author_flair_text,created
0,0,t5_2y4mt,Depop,,d5wpc6e,t3_4d397o,t3_4d397o,yung117,1469843685,1,False,1.0,-1,just got duped here. $150,,2016-07-29 21:54:45
1,1,t5_2y4mt,Depop,,d6155q3,t1_d5wpc6e,t3_4d397o,rowland_thunder,1470153328,1,False,1.0,-1,"Yo, I'm a redditor and work for Depop in the U...",,2016-08-02 11:55:28
2,2,t5_2y4mt,Depop,,d623bws,t1_d6155q3,t3_4d397o,yung117,1470199498,1,False,1.0,-1,Well im not sure what theyre doing rn but they...,,2016-08-03 00:44:58
3,3,t5_2y4mt,Depop,,d64vtsa,t1_d623bws,t3_4d397o,rowland_thunder,1470366856,1,False,1.0,-1,Shoot me an email: meg@depop.com,,2016-08-04 23:14:16
4,4,t5_2y4mt,Depop,,d7wgx1d,t3_53rq3m,t3_53rq3m,MightyHarambe,1474487357,1,False,1.0,-1,Only pay through Depop if you think you might ...,,2016-09-21 15:49:17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3745594,126588,t5_3m7es,stockx,public,j2fpyax,t3_zzjxit,t3_zzjxit,WowJerryIsntThatGood,1672527678,1,False,,-1,It looks like someone test fit trucks onto the...,,2022-12-31 18:01:18
3745595,126589,t5_3m7es,stockx,public,j2ftn03,t1_j2fms09,t3_1003i0e,Siebo_,1672529401,0,False,,-1,So is my bid actually the highest or should I ...,,2022-12-31 18:30:01
3745596,126590,t5_3m7es,stockx,public,j2fuzpe,t3_zztvvl,t3_zztvvl,Niximusprime949,1672530007,3,False,,-1,"You stop using StockX, that will fix it",,2022-12-31 18:40:07
3745597,126591,t5_3m7es,stockx,public,j2fv1f0,t1_j2fuzpe,t3_zztvvl,Brilliant_Pen4959,1672530029,1,False,,-1,using it for prices,,2022-12-31 18:40:29


In [194]:
# checking for subfolder, if not create
if not os.path.exists(f"./Data/preprocessed/"):
    os.makedirs("./Data/preprocessed/") 
# exporting CSV
posts.to_csv('./Data/preprocessed/posts.csv')
comments.to_csv('./Data/preprocessed/comments.csv')

In [201]:
posts[['id','selftext']].to_csv('./Data/preprocessed/posts_sentiment_data.csv', index=False)

In [205]:
pd.read_csv('./Data/preprocessed/posts_sentiment_data.csv', index_col='id')

Unnamed: 0_level_0,selftext
id,Unnamed: 1_level_1
3hpz75,I'm new to Depop and someone PMed me in regard...
4d397o,
53rq3m,Basically what payment method would be the eas...
5tyzae,I accidentally forgot to put in a suite # on t...
5u0w82,@justinburns901\n\n
...,...
10008cv,"I was looking at the prices for Chicago 2s, in..."
1001h9n,&amp;#x200B;\n\n[wtf is this](https://preview....
1002wz2,&amp;#x200B;\n\n[My Bid is 125$ and StockX is ...
1003i0e,My bid is 125. Other bids are as high as 140 y...


In [207]:
comments

Unnamed: 0,original_index,subreddit_id,subreddit,subreddit_type,id,parent_id,link_id,author,created_utc,score,controversiality,ups,likes,body,author_flair_text,created
0,0,t5_2y4mt,Depop,,d5wpc6e,t3_4d397o,t3_4d397o,yung117,1469843685,1,False,1.0,-1,just got duped here. $150,,2016-07-29 21:54:45
1,1,t5_2y4mt,Depop,,d6155q3,t1_d5wpc6e,t3_4d397o,rowland_thunder,1470153328,1,False,1.0,-1,"Yo, I'm a redditor and work for Depop in the U...",,2016-08-02 11:55:28
2,2,t5_2y4mt,Depop,,d623bws,t1_d6155q3,t3_4d397o,yung117,1470199498,1,False,1.0,-1,Well im not sure what theyre doing rn but they...,,2016-08-03 00:44:58
3,3,t5_2y4mt,Depop,,d64vtsa,t1_d623bws,t3_4d397o,rowland_thunder,1470366856,1,False,1.0,-1,Shoot me an email: meg@depop.com,,2016-08-04 23:14:16
4,4,t5_2y4mt,Depop,,d7wgx1d,t3_53rq3m,t3_53rq3m,MightyHarambe,1474487357,1,False,1.0,-1,Only pay through Depop if you think you might ...,,2016-09-21 15:49:17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3745594,126588,t5_3m7es,stockx,public,j2fpyax,t3_zzjxit,t3_zzjxit,WowJerryIsntThatGood,1672527678,1,False,,-1,It looks like someone test fit trucks onto the...,,2022-12-31 18:01:18
3745595,126589,t5_3m7es,stockx,public,j2ftn03,t1_j2fms09,t3_1003i0e,Siebo_,1672529401,0,False,,-1,So is my bid actually the highest or should I ...,,2022-12-31 18:30:01
3745596,126590,t5_3m7es,stockx,public,j2fuzpe,t3_zztvvl,t3_zztvvl,Niximusprime949,1672530007,3,False,,-1,"You stop using StockX, that will fix it",,2022-12-31 18:40:07
3745597,126591,t5_3m7es,stockx,public,j2fv1f0,t1_j2fuzpe,t3_zztvvl,Brilliant_Pen4959,1672530029,1,False,,-1,using it for prices,,2022-12-31 18:40:29


In [210]:
posts[posts['selftext'].isnull()]

Unnamed: 0,original_index,subreddit,subreddit_id,subreddit_type,id,name,media,is_video,created_utc,num_comments,...,ups,selftext,author_flair_text,link_flair_text,poll_data,created,is_poll,other_platform,poll_votes,poll_options
1,1,Depop,t5_2y4mt,,4d397o,t3_4d397o,,True,1459630313,0,...,1,,,,,2016-04-02 16:51:53,0,False,-1,
39180,13,EtsySellers,t5_2wthr,,1bo6p1,,,True,1365095433,2,...,3,,,,,2013-04-04 13:10:33,0,False,-1,
64145,0,Etsy,t5_2qq24,public,8543m,,,False,1237230610,0,...,-1,,,,,2009-03-16 15:10:10,0,False,-1,
107418,3,Flipping,t5_2y9q6,,1kvmco,,,True,1377184474,1,...,16,,,,,2013-08-22 11:14:34,0,False,-1,
149726,1,Grailed,t5_31k6k,,3qsz1q,t3_3qsz1q,,True,1446182058,0,...,3,,,,,2015-10-30 01:14:18,0,False,-1,
154606,0,poshmark,t5_34hq6,,2ly1gi,t3_2ly1gi,,True,1415689743,0,...,1,,,,,2014-11-11 02:09:03,0,False,-1,
177493,6,stockx,t5_3m7es,public,7ud3if,,,False,1517431520,2,...,-1,,,,,2018-01-31 15:45:20,0,False,-1,


AttributeError: 'float' object has no attribute 'split'