In [92]:
import pandas as pd
import io
import requests
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [93]:
#Read in the data from github
#Put the data in dicts
csvs = ['askmen','askwomen','aww','conspiracy','fitness','knitting']
csv_dict = dict()
for page in csvs:
    url = "https://raw.githubusercontent.com/rer145/cis572-project/master/data/"+str(page)+".csv"
    raw_data = requests.get(url).content
    df = pd.read_csv(io.StringIO(raw_data.decode('utf-8')))
    csv_dict[page] = df

In [94]:
#List of dataframes
csv_dict.keys()

dict_keys(['askmen', 'askwomen', 'aww', 'conspiracy', 'fitness', 'knitting'])

In [95]:
#Look at data one frame at a time
for key in csv_dict.keys():
    print(key)
    for column in list(csv_dict[key]):
        print(column)
        print(csv_dict[key][column].unique())
        print()

askmen
post_score
[    1     0     2     3     6    24     4    11     9     5    22   145
    13    10     7    14    37    25    19    12    33    30    39  1897
    48    34   250    31    16    17   441     8    32    55   252    45
   189    38    27    79   166    61 14124    59    43    21    35   149
   273    20    52    78    28  9193    46    36    23   267   119   167
    53  1308    15    73   160    82    49 14819   289   460    18   199
  2033    29    58   365    41  6338    26    99   142   102   382    44
    57 14608    81   185   100   151  9885   144   126    96  5721   122
   162    40    86  5954   129    42   169    95    70 10259   292   205
  7937   197   107    51    72   191    66   109  9424   186   117   528
  4855    91   355    64   615 12151    93   201    68    69    85   223
  7456    90   115   213    76    92    54  8478   221    74   156    60
    77   291   774 11299    94   158   275   173]

post_archived
[False]

post_distinguished
[nan 'moderat

In [96]:
#Remove columns that only have a singular value
for key in csv_dict.keys():
    unique = [c for c in csv_dict[key].columns if len(set(csv_dict[key][c])) == 1]
    print(key)
    print(unique)
    print()
    csv_dict[key].drop(unique, axis=1, inplace=True)

askmen
['post_archived', 'post_downs', 'post_is_original_content', 'post_is_video', 'post_pinned', 'subreddit', 'subreddit_subscribers', 'day_of_week']

askwomen
['post_archived', 'post_downs', 'post_gilded_platinum', 'post_is_original_content', 'post_is_video', 'post_pinned', 'subreddit', 'day_of_week']

aww
['post_archived', 'post_downs', 'post_edited', 'post_gilded_platinum', 'post_over_18', 'post_pinned', 'subreddit', 'posted_len_minutes', 'post_text_sentiment', 'post_text_characters', 'post_text_words', 'post_text_uniq_words', 'post_text_stop_words', 'post_text_non_stop_words', 'day_of_week']

conspiracy
['post_archived', 'post_downs', 'post_gilded_platinum', 'post_is_original_content', 'post_is_video', 'post_pinned', 'subreddit', 'subreddit_subscribers', 'day_of_week']

fitness
['post_archived', 'post_downs', 'post_gilded_platinum', 'post_is_original_content', 'post_is_video', 'post_over_18', 'post_pinned', 'subreddit', 'day_of_week']

knitting
['post_archived', 'post_downs', 'po

In [97]:
#Look at data one frame at a time
for key in csv_dict.keys():
    print(key)
    for column in list(csv_dict[key]):
        print(column)
        print(csv_dict[key][column].unique())
        print()

"""for key in csv_dict.keys():
    print(key)
    print(csv_dict[key]['post_num_reports'].unique())
    print()"""

askmen
post_score
[    1     0     2     3     6    24     4    11     9     5    22   145
    13    10     7    14    37    25    19    12    33    30    39  1897
    48    34   250    31    16    17   441     8    32    55   252    45
   189    38    27    79   166    61 14124    59    43    21    35   149
   273    20    52    78    28  9193    46    36    23   267   119   167
    53  1308    15    73   160    82    49 14819   289   460    18   199
  2033    29    58   365    41  6338    26    99   142   102   382    44
    57 14608    81   185   100   151  9885   144   126    96  5721   122
   162    40    86  5954   129    42   169    95    70 10259   292   205
  7937   197   107    51    72   191    66   109  9424   186   117   528
  4855    91   355    64   615 12151    93   201    68    69    85   223
  7456    90   115   213    76    92    54  8478   221    74   156    60
    77   291   774 11299    94   158   275   173]

post_distinguished
[nan 'moderator']

post_edited
['Fal

"for key in csv_dict.keys():\n    print(key)\n    print(csv_dict[key]['post_num_reports'].unique())\n    print()"

In [98]:
list(csv_dict['aww'])

['post_score',
 'post_distinguished',
 'post_gilded',
 'post_gilded_silver',
 'post_gilded_gold',
 'post_is_original_content',
 'post_is_video',
 'post_likes',
 'post_num_comments',
 'post_num_crossposts',
 'post_num_reports',
 'subreddit_subscribers',
 'post_ups',
 'post_title_sentiment',
 'post_title_characters',
 'post_title_words',
 'post_title_uniq_words',
 'post_title_stop_words',
 'post_title_non_stop_words']

In [99]:
for key in csv_dict.keys():
    #Change post_distinguished to numerical values, only None and Moderator values
    if 'post_distinguished' in csv_dict[key]:
        csv_dict[key]['post_distinguished'] = csv_dict[key]['post_distinguished'].map({None: 0, 'moderator': 1})
        
    #Mark edited posts with a 1
    if 'post_edited' in csv_dict[key]:
        csv_dict[key]['post_edited'] = (csv_dict[key]['post_edited'] == 'False')*1
        
    #Drop post_num_reports since they are all NaN
    #***Might already be gone***
    #csv_dict[key].drop('post_num_reports', axis=1, inplace=True)
    
    #Remove post_likes and post_ups since they are not predictive
    if 'post_likes' in csv_dict[key]:
        csv_dict[key].drop('post_likes', axis=1, inplace=True)
    if 'post_ups' in csv_dict[key]:
        csv_dict[key].drop('post_ups', axis=1, inplace=True)
        
    #Make post_over_18 binary
    if 'post_over_18' in csv_dict[key]:
        csv_dict[key]['post_over_18'] = csv_dict[key]['post_over_18'].astype('category').cat.codes
        
    print('iter')

iter
iter
iter
iter
iter
iter


In [100]:
for key in csv_dict.keys():
    csv_dict[key] = csv_dict[key].sort_values('post_score', ascending=False)
    cutoff_pos = round(csv_dict[key].shape[0]*.25)
    cutoff_score = csv_dict[key].iloc[cutoff_pos]['post_score']
    print('Subreddit:',key,'\t Number of popular posts:',cutoff_pos,'\t Popular score cutoff:',int(cutoff_score))
    
    csv_dict[key].loc[csv_dict[key]['post_score'] >= cutoff_score, 'popular'] = 1
    csv_dict[key].loc[csv_dict[key]['post_score'] < cutoff_score, 'popular'] = 0

Subreddit: askmen 	 Number of popular posts: 248 	 Popular score cutoff: 19
Subreddit: askwomen 	 Number of popular posts: 248 	 Popular score cutoff: 21
Subreddit: aww 	 Number of popular posts: 247 	 Popular score cutoff: 36
Subreddit: conspiracy 	 Number of popular posts: 247 	 Popular score cutoff: 39
Subreddit: fitness 	 Number of popular posts: 239 	 Popular score cutoff: 14
Subreddit: knitting 	 Number of popular posts: 249 	 Popular score cutoff: 112


In [101]:
def prepare_data(data):
    #remove post_score
    data.drop('post_score', axis=1, inplace=True)
    
    #split into test and training
    X = data.drop('popular', axis=1)
    y = data['popular']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)
    
    #split into validation and training
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=50)
    
    #scale the data
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)
    
    #return data
    return X_train, X_val, X_test, y_train, y_val, y_test

In [102]:
X_train, X_val, X_test, y_train, y_val, y_test = prepare_data(csv_dict['askmen'])

  return self.partial_fit(X, y)
  updated_mean = (last_sum + new_sum) / updated_sample_count
  new_unnormalized_variance = np.nanvar(X, axis=0) * new_sample_count
  return self.fit(X, **fit_params).transform(X)
  from ipykernel import kernelapp as app
  app.launch_new_instance()


In [104]:
X_train

array([[-0.03977786,  0.27331829, -0.03977786, ..., -0.36725723,
        -0.3601445 , -0.36406274],
       [-0.03977786,  0.27331829, -0.03977786, ..., -0.66603935,
        -0.73214038, -0.61955244],
       [-0.03977786, -3.65873798, -0.03977786, ...,  1.46811863,
         1.37583624,  1.48823761],
       ...,
       [-0.03977786,  0.27331829, -0.03977786, ...,  0.230307  ,
         0.32184793,  0.17885288],
       [-0.03977786,  0.27331829, -0.03977786, ...,  0.69982176,
         0.6938438 ,  0.68983229],
       [-0.03977786,  0.27331829, -0.03977786, ..., -0.66603935,
        -0.73214038, -0.61955244]])