In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from transformers import pipeline, BertTokenizer
from tqdm import tqdm

# Load the Reddit comment dataset and print a summary of its columns and dtypes.
reddit_json = 'dataframes/reddit_data.json'
df = pd.read_json(reddit_json)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 14770 entries, c1e4o to c0i10ti
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   score             14770 non-null  int64         
 1   controversiality  14770 non-null  int64         
 2   subreddit         14770 non-null  object        
 3   body              14770 non-null  object        
 4   month             14770 non-null  int64         
 5   year              14770 non-null  int64         
 6   original_size     14770 non-null  int64         
 7   PS                14770 non-null  int64         
 8   XBOX              14770 non-null  int64         
 9   PS_Count          14770 non-null  float64       
 10  XBOX_Count        14770 non-null  float64       
 11  date              14770 non-null  datetime64[ns]
 12  naive_sentiment   14770 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(7), object(2)
memory usage: 1.6+ MB

To start off, we need a value to predict.  I'm going to run a second sentiment analysis with a BERT classifier that rates the sentiment of each post on a scale of 1-5 stars and also returns a confidence score for that rating.

In [None]:
#Since this is a lot of data to keep in memory at once, I'm splitting up the work so I can save progress over time.
#Each yearly subset is taken as an explicit copy: the cells below add 'bert_labels' and 'bert_scores'
#columns to these frames, and assigning a column onto a boolean-filtered selection of df
#triggers pandas' SettingWithCopyWarning.
df06 = df[df['year']==2006].copy()
df07 = df[df['year']==2007].copy()
df08 = df[df['year']==2008].copy()
df09 = df[df['year']==2009].copy()

#sentiment-analysis pipeline built on the nlptown multilingual BERT model
classifier = pipeline('sentiment-analysis', model='nlptown/bert-base-multilingual-uncased-sentiment')
max_length = 512 #handed to every classifier call together with truncation=True

In [None]:
#2006
#Each classifier call returns a 1-element list holding a dictionary; keep just that dictionary.
predictions = [
    classifier(post, max_length=max_length, truncation=True)[0]
    for post in tqdm(df06['body'])
]
#'label' runs from 1 star to 5 stars, so its first character is the number
labels = [prediction['label'][0] for prediction in predictions]
#confidence score of said label
scores = [prediction['score'] for prediction in predictions]
df06['bert_labels'] = labels
df06['bert_scores'] = scores

print(df06.head())
df06.to_json('dataframes/df06_bert.json')

In [None]:
#2007
#Each classifier call returns a 1-element list holding a dictionary; keep just that dictionary.
predictions = [
    classifier(post, max_length=max_length, truncation=True)[0]
    for post in tqdm(df07['body'])
]
#first character of 'label' is the star count
labels = [prediction['label'][0] for prediction in predictions]
#confidence score of said label
scores = [prediction['score'] for prediction in predictions]
df07['bert_labels'] = labels
df07['bert_scores'] = scores

print(df07.head())
df07.to_json('dataframes/df07_bert.json')

In [None]:
#2008
#Each classifier call returns a 1-element list holding a dictionary; keep just that dictionary.
predictions = [
    classifier(post, max_length=max_length, truncation=True)[0]
    for post in tqdm(df08['body'])
]
#first character of 'label' is the star count
labels = [prediction['label'][0] for prediction in predictions]
#confidence score of said label
scores = [prediction['score'] for prediction in predictions]
df08['bert_labels'] = labels
df08['bert_scores'] = scores

print(df08.head())
df08.to_json('dataframes/df08_bert.json')

In [None]:
#2009
#Each classifier call returns a 1-element list holding a dictionary; keep just that dictionary.
predictions = [
    classifier(post, max_length=max_length, truncation=True)[0]
    for post in tqdm(df09['body'])
]
#first character of 'label' is the star count
labels = [prediction['label'][0] for prediction in predictions]
#confidence score of said label
scores = [prediction['score'] for prediction in predictions]
df09['bert_labels'] = labels
df09['bert_scores'] = scores

print(df09.head())
df09.to_json('dataframes/df09_bert.json')

In [23]:
#recombining data after the split
df06 = pd.read_json('dataframes/df06_bert.json')
df07 = pd.read_json('dataframes/df07_bert.json')
df08 = pd.read_json('dataframes/df08_bert.json')
df09 = pd.read_json('dataframes/df09_bert.json')

#DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so the yearly frames
#are stacked with pd.concat; the rows stay in 2006 -> 2009 order and keep their index labels.
df = pd.concat([df06, df07, df08, df09])
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 14770 entries, c1e4o to c0i10ti
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   score             14770 non-null  int64         
 1   controversiality  14770 non-null  int64         
 2   subreddit         14770 non-null  object        
 3   body              14770 non-null  object        
 4   month             14770 non-null  int64         
 5   year              14770 non-null  int64         
 6   original_size     14770 non-null  int64         
 7   PS                14770 non-null  int64         
 8   XBOX              14770 non-null  int64         
 9   PS_Count          14770 non-null  float64       
 10  XBOX_Count        14770 non-null  float64       
 11  date              14770 non-null  datetime64[ns]
 12  naive_sentiment   14770 non-null  float64       
 13  bert_labels       14770 non-null  int64         
 14  bert_scores       147

Next thing I need to do is decide which columns will be useful for my model.

score should have an impact, though controversiality might not since it's effectively a boolean value with a large majority of values being equal to 0.
body will definitely make an impact, although it should have dummies for popular terms.
month, year, and date are mostly for the time series I made previously, and will not add much.
original_size, PS_Count, and XBOX_Count were columns I added for plotting, and have no purpose here.
naive_sentiment is definitely important.
PS, XBOX, and subreddit are categoricals that should aid in predictions.
bert_labels are going to be the prediction value and bert_scores will supplement that.

In [24]:
# Columns that will not be used as model features.
unused_columns = [
    'PS_Count',
    'controversiality',
    'XBOX_Count',
    'original_size',
    'month',
    'year',
    'date',
]
df.drop(columns=unused_columns, inplace=True)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 14770 entries, c1e4o to c0i10ti
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   score            14770 non-null  int64  
 1   subreddit        14770 non-null  object 
 2   body             14770 non-null  object 
 3   PS               14770 non-null  int64  
 4   XBOX             14770 non-null  int64  
 5   naive_sentiment  14770 non-null  float64
 6   bert_labels      14770 non-null  int64  
 7   bert_scores      14770 non-null  float64
dtypes: float64(2), int64(4), object(2)
memory usage: 1.0+ MB
None


Next, we need to address the score column's outliers.  Half of all values are between 0 and 3, but the overall range is from -45 to 398.  The graph from the EDA shows that the variance has been increasing over time and has a non-normal distribution.  If we only keep rows with a score between -30 and 30, we only lose 161 rows in the process, and in doing so make score a more reliable predictor.

In [25]:
# Keep only rows whose score lies within [-SCORE_LIMIT, SCORE_LIMIT].
# The rows are selected with a boolean mask instead of being dropped by index label:
# a label-based drop removes every row that shares a label with an outlier, so in-range
# rows would be discarded as well if the index ever contained duplicate labels.
SCORE_LIMIT = 30
is_outlier = (df['score'] > SCORE_LIMIT) | (df['score'] < -SCORE_LIMIT)
df = df[~is_outlier]
print(df.describe())

              score            PS          XBOX  naive_sentiment  \
count  14609.000000  14609.000000  14609.000000     14609.000000   
mean       2.132453      0.448080      0.610446         0.077496   
std        4.001580      0.497314      0.487666         0.221732   
min      -30.000000      0.000000      0.000000        -1.000000   
25%        1.000000      0.000000      0.000000        -0.006667   
50%        1.000000      0.000000      1.000000         0.056481   
75%        3.000000      1.000000      1.000000         0.194924   
max       30.000000      1.000000      1.000000         1.000000   

        bert_labels   bert_scores  
count  14609.000000  14609.000000  
mean       2.486412      0.436582  
std        1.465876      0.145641  
min        1.000000      0.208104  
25%        1.000000      0.329384  
50%        2.000000      0.401645  
75%        4.000000      0.508862  
max        5.000000      0.980305  


Excellent, this dropped the std of score from 11.1 to 4.001, while maintaining the percentiles and only changing the mean by about 0.8.  The last thing we need to address is the body and subreddit columns.  We should be able to create dummies for subreddit just fine, but the sheer number of unique words in body would add almost 20,000 columns to the dataframe.

In [26]:
# One-hot encode subreddit, then count for every column how many rows compare equal
# to True and collect the columns with at most 10 such rows.
# The comparison runs over all columns, not only the new dummies; numeric values of 1
# compare equal to True, so non-dummy columns get a count as well.
df = pd.get_dummies(data=df, columns=['subreddit'])
small_col = []
total_col = 0
for total_col, col in enumerate(df.columns, start=1):
    column = df[col]
    num_posts = column[column == True].count()
    print(col, num_posts)
    if num_posts <= 10:
        small_col.append(col)

score 5842
body 0
PS 6546
XBOX 8918
naive_sentiment 51
bert_labels 5671
bert_scores 0
subreddit_4chan 1
subreddit_AmericanGovernment 1
subreddit_AmericanPolitics 1
subreddit_Anarchism 5
subreddit_Android 41
subreddit_Art 1
subreddit_AskReddit 1250
subreddit_Astronomy 1
subreddit_BDSMcommunity 1
subreddit_Baking 1
subreddit_Borderlands 6
subreddit_Christianity 2
subreddit_CommonLaw 2
subreddit_DAE 2
subreddit_DIY 2
subreddit_Design 1
subreddit_DoesAnybodyElse 41
subreddit_Drugs 1
subreddit_Economics 27
subreddit_Equality 4
subreddit_Eve 2
subreddit_FashionTechnology 1
subreddit_Favors 4
subreddit_FreeMicrosoftPoints 1
subreddit_Frugal 20
subreddit_GameDeals 3
subreddit_Games 1
subreddit_HappyBirthday 1
subreddit_Health 1
subreddit_Homebrewing 1
subreddit_IAmA 224
subreddit_ILiveIn 1
subreddit_IndieGaming 4
subreddit_Israel 1
subreddit_JRPG 4
subreddit_Libertarian 11
subreddit_MW2 11
subreddit_MapleLinks 8
subreddit_Marijuana 74
subreddit_MensRights 5
subreddit_Music 12
subreddit_NSFW_no

Creating dummies for subreddit alone brought us from 6 columns up to 222 columns, 169 of which only have 10 mentions or less.  To avoid overfitting, I will be merging these columns.

In [27]:
#leave out the non-subreddit columns so we don't delete them
#Filtering on the 'subreddit_' prefix (rather than removing 'body' and 'bert_scores' by name)
#protects every non-subreddit column the <= 10 test may have caught, and does not raise
#ValueError when one of those names is missing from the list.
small_col = [col for col in small_col if col.startswith('subreddit_')]

#row-wise sum of the rare subreddit indicators, collapsed into one catch-all column
df['subreddit_other'] = df[small_col].sum(axis=1)
df = df.drop(columns=small_col)
print(df.columns)

Index(['score', 'body', 'PS', 'XBOX', 'naive_sentiment', 'bert_labels',
       'bert_scores', 'subreddit_Android', 'subreddit_AskReddit',
       'subreddit_DoesAnybodyElse', 'subreddit_Economics', 'subreddit_Frugal',
       'subreddit_IAmA', 'subreddit_Libertarian', 'subreddit_MW2',
       'subreddit_Marijuana', 'subreddit_Music', 'subreddit_PS3',
       'subreddit_WTF', 'subreddit_apple', 'subreddit_atheism',
       'subreddit_bestof', 'subreddit_business', 'subreddit_canada',
       'subreddit_comics', 'subreddit_entertainment', 'subreddit_environment',
       'subreddit_fffffffuuuuuuuuuuuu', 'subreddit_funny', 'subreddit_gadgets',
       'subreddit_gaming', 'subreddit_geek', 'subreddit_hardware',
       'subreddit_linux', 'subreddit_movies', 'subreddit_nsfw',
       'subreddit_offbeat', 'subreddit_pics', 'subreddit_politics',
       'subreddit_programming', 'subreddit_promos', 'subreddit_reddit.com',
       'subreddit_science', 'subreddit_scifi', 'subreddit_secretsanta',
       'sub

54 columns is a lot more manageable.  On to working with the body column.

In [None]:
# Build one corpus string out of every post body.
# The posts are joined with a space so the last word of one post and the first word of
# the next remain separate tokens (back-to-back concatenation fuses them into one word),
# and str.join avoids rebuilding the growing string on every iteration.
superstring = ' '.join(df['body'])

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
df_vocabulary = tokenizer.tokenize(superstring)

In [None]:
#going to tokenize and create dummies for bodies based off of most frequent words that aren't stopwords
from nltk.corpus import stopwords

sw = stopwords.words('english')  # NLTK's English stopword list

# Put the token list in a Series, then show the raw tokens followed by how often each occurs.
vocab_series = pd.Series(df_vocabulary)
token_counts = vocab_series.value_counts()
print(vocab_series)
print(token_counts)