Notes:

* run 1 - tf-idf features with ridge regression (alpha = 0.5), all r-squared values negative
* run 2 - log-counts with ridge regression (alpha = 0.5), results are now positive but still low
* run 3 - log-counts with random forest (max_depth=2, random_state=0), results are worse 
* run 4 - log-counts with random forest (max_depth=2), results are even worse
* run 5 - log-counts with random forest (max_depth=5, random_state=0), still bad
* run 6 - log-counts with ridge regression (alpha = 0.75), no noticeable difference from run 2
* run 7 - log-counts with ridge regression (alpha = 0.25), no noticeable difference from run 2
* run 8 - log-counts with kernel ridge regression (kernel="rbf", gamma=0.1, alpha=0.5), hideous performance
* run 9 - log-counts with SVR with defaults, all r-squared values negative
* run 10 - log-counts with SVR (kernel="linear"), similar to run 2
* run 11 - log-counts with SVR (kernel="poly"), maybe a slight improvement but not enough to be useful
* run 12 - log-counts with lasso regression (alpha = 0.5), all r-squared values are negative

## Common Resources

In [None]:
#placeholder: replace with your own user agent string; it is sent as the
#User-agent header on every request to reddit
user_agent_string = "None"

In [None]:
from nltk.stem.porter import *
#single shared stemmer instance; tokenize() calls stemmer.stem() on every token
stemmer = PorterStemmer()

def get_comment_text(r):
    """Pull the comment bodies out of a decoded comments-listing response.

    r is the parsed JSON dict; its entries are read from r['data']['children'].

    Returns a tuple (bodies, last_name): the list of every entry's 'body' in
    listing order, and the 'name' of the final entry, which callers pass back
    as the `after` value to request the next page. last_name is None when the
    listing is empty.
    """
    bodies = []
    last_name = None

    for child in r['data']['children']:
        payload = child['data']
        bodies.append(payload['body'])
        last_name = payload['name'] #after id

    return bodies, last_name

def load_csv(filepath):
    """Read a comma-separated file into a list of rows.

    Each line is stripped of surrounding whitespace and split on ','; no
    quoting or escaping is interpreted. A blank line yields [''].

    The file is opened in a with-block so the handle is closed as soon as the
    rows have been read instead of being left to the garbage collector.
    """
    with open(filepath, "r") as fh:
        return [line.strip().split(',') for line in fh]

#translation table for str.translate(): every punctuation character listed in
#the string below is mapped to a single space
punc_list = str.maketrans(dict.fromkeys("'\",.?!-/;:{}_*^~`", " "))

def tokenize(string):
    """Turn a piece of text into a list of stemmed, lower-case tokens.

    The text is stripped and lower-cased, every character in punc_list is
    replaced by a space, the result is split on whitespace, and each token is
    passed through stemmer.stem().

    Returns a list of tokens; empty or punctuation-only input returns [].
    """
    string = string.strip().lower()
    string = string.translate(punc_list) #depunctuate
    #str.split() with no separator splits on runs of whitespace and never
    #produces empty strings. Splitting with a regex here produced a '' token
    #whenever the text ended in punctuation (already turned into a space
    #above) or was blank, and '' then got counted as a word.
    return [stemmer.stem(token) for token in string.split()]

## Verify usernames

In [None]:
#requests and time are otherwise only imported by the scraping cell further
#down; import them here so this cell also works when run top to bottom
import requests, time

data = load_csv("big5.csv")

#get list of names (first column of every row after the header)
names = [row[0] for row in data[1:]]

for name in names:

    #ask for a single comment: enough to see whether the account can be read
    r = requests.get("https://www.reddit.com/user/{}/comments.json?limit=1".format(name), headers = {'User-agent':user_agent_string})
    r_data = r.json()

    #report every username whose response carries an 'error' entry
    if 'error' in r_data:
        print(r_data)
        print(name)

    #pause between requests
    time.sleep(5)

## Scrape user posts

In [None]:
import requests, time, os

def get_comment_text(r):
    """Extract comment bodies from a decoded comments-listing response.

    r is the parsed JSON dict; entries are read from r['data']['children'].

    Returns (texts, after_id): texts is the list of each entry's 'body' in
    listing order; after_id is the 'name' of the last entry (used by the
    caller as the `after` value for the next page), or None for an empty
    listing.
    """
    texts = []
    after_id = None

    for entry in r['data']['children']:
        fields = entry['data']
        texts.append(fields['body'])
        after_id = fields['name'] #after id

    return texts, after_id

def load_csv(filepath):
    """Read a comma-separated file into a list of rows.

    Each line is stripped of surrounding whitespace and split on ','; no
    quoting or escaping is interpreted. A blank line yields [''].

    The file is opened in a with-block so the handle is closed as soon as the
    rows have been read instead of being left to the garbage collector.
    """
    with open(filepath, "r") as fh:
        return [line.strip().split(',') for line in fh]

#open big5.csv
data = load_csv("big5.csv")

#get list of names (first column of every row after the header)
names = [row[0] for row in data[1:]]
#to resume an interrupted scrape, uncomment the next two lines: they drop
#every user that already has a file in comments/
#filterlist = [ID[:-4] for ID in os.listdir("comments")]
#names = [name for name in names if name not in filterlist]
#print(names)

#the per-user files are opened in append mode below, which fails if the
#folder does not exist yet
os.makedirs("comments", exist_ok=True)

for name in names:

    last_id = None
    comment_text = None

    #keep requesting pages until one comes back with no comments
    while comment_text != []:
        url = "https://www.reddit.com/user/{}/comments.json?limit=100".format(name)
        if last_id is not None:
            #continue after the last comment of the previous page
            url += "&after={}".format(last_id)
        r = requests.get(url, headers = {'User-agent':user_agent_string})

        r_data = r.json()

        #an error payload has no 'data' listing to parse, whatever its code,
        #so stop working on this user instead of failing on the missing key
        if 'error' in r_data:
            print("skipping {}: {}".format(name, r_data))
            break

        #parse through float() so decimal-formatted values such as "598.0" do
        #not raise, and tolerate responses that lack the headers altogether
        till_reset = float(r.headers.get('x-ratelimit-reset', 0))
        req_remaining = float(r.headers.get('x-ratelimit-remaining', 1))

        #no requests left: wait for the window to reset, plus a margin
        if req_remaining < 1:
            time.sleep(till_reset+5)

        comment_text, last_id = get_comment_text(r_data)

        with open("comments/{}.txt".format(name), "a") as fh:
            for comment in comment_text:
                #drop every non-ascii character before writing
                comment = comment.encode("ascii", errors="ignore").decode()
                fh.write(comment+"\n")
                #separator line between comments; skipped again when the
                #files are read back
                fh.write("*BREAK*\n")

        #pause between requests
        time.sleep(5)

## Process data

In [None]:
import re, os, math

class WordFeatures:
    """Per-document word counts over one shared, growing vocabulary.

    counts   -- dict mapping a document key to a dict of word -> count
    wordlist -- the vocabulary, in the order words were first seen

    add_words() and __update_wordlist__() keep every per-document dict in
    counts holding an entry for every word in wordlist (0 when unseen).
    """
    def __init__(self):
        self.counts = {}
        self.wordlist = []

    def add_words(self, key, new_words):
        """Add the tokens in new_words to the counts stored under key.

        A key seen for the first time starts with a zero count for every
        known word; calling again with the same key accumulates.
        """
        if key not in self.counts:
            self.counts[key] = dict.fromkeys(self.wordlist, 0)

        doc_counts = self.counts[key]
        for word in new_words:
            #each per-document dict has exactly the vocabulary as its keys,
            #so this constant-time lookup answers "is the word new?" without
            #scanning the whole wordlist for every token
            if word not in doc_counts:
                self.__update_wordlist__(word)

            doc_counts[word] += 1

    def __update_wordlist__(self, word):
        """Append word to the vocabulary and give every document a 0 count for it."""
        self.wordlist.append(word)
        for doc_counts in self.counts.values():
            doc_counts[word] = 0

    def generate_tfidf(self):
        """Return tf-idf weighted counts as {key: {word: str(weight)}}.

        Weights are converted to strings so they can be joined into csv rows.
        """
        #tf-idf(w, d) = bow(w, d) * log (N / # documents in which word w appears)
        #generate dict of scaling factors for each word
        scaling_factors = {}
        n = len(self.counts)
        for word in self.wordlist:
            doc_count = sum(1 for doc_counts in self.counts.values() if doc_counts[word] > 0)
            #use the formula exactly as stated above: dividing by doc_count+1
            #made the factor negative for any word present in every document.
            #doc_count is only 0 for a word that was registered but never
            #counted; its weighted counts are 0 whatever the factor is
            scaling_factors[word] = math.log(n / doc_count) if doc_count else 0.0

        #generate weighted counts
        tfidf = {}
        for k, doc_counts in self.counts.items():
            tfidf[k] = {word: str(doc_counts[word] * scaling_factors[word]) for word in self.wordlist}

        return tfidf

    def generate_lbow(self):
        """Return log-scaled counts as {key: {word: str(log(count))}}.

        A tiny constant is added to every count so that log() stays defined
        for words a document never uses.
        """
        lbow = {}
        for k, doc_counts in self.counts.items():
            lbow[k] = {word: str(math.log(doc_counts[word]+0.000000001)) for word in self.wordlist}

        return lbow
        
#count the tokens of every scraped comment file in the comments folder
words = WordFeatures()
filterlist = []

for filename in os.listdir("comments"):

    with open("comments/"+filename, "r") as fh:
        #tokenize every line except the *BREAK* separator lines
        per_line = [tokenize(line) for line in fh if line.strip() != "*BREAK*"]

    tokens = [token for line_tokens in per_line for token in line_tokens]

    #the file name minus its last four characters is the user name
    name = filename[:-4]
    if len(tokens) < 1000:
        #fewer than 1000 tokens: mark the user for exclusion from the output
        filterlist.append(name)
    words.add_words(name, tokens)

#populate processed csv
#write out the word features and OCEAN values to new csv
csv = load_csv("big5.csv")
#tfidf = words.generate_tfidf()
#NOTE: the variable is still called tfidf but currently holds the log-counts
tfidf = words.generate_lbow()

for i, row in enumerate(csv): #for each row
    if i == 0:
        #header row: one extra column per vocabulary word
        row.extend(words.wordlist)
        continue

    name = row[0]
    try:
        features = tfidf[name]
    except KeyError:
        #no comment file was processed for this user, so there are no
        #features to append; leave the row out of the output
        filterlist.append(name)
        continue

    row.extend(features[word] for word in words.wordlist)

#set lookup instead of scanning the filter list once per row
excluded = set(filterlist)

with open("processed_big5.csv", "w") as fh:
    for row in csv:
        if row[0] not in excluded:
            print(','.join(row), file=fh)

## Build model

In [None]:
#ridge regression, SVR, or regression tree

from sklearn import linear_model
#from sklearn.ensemble import RandomForestRegressor
#from sklearn.svm import SVR
#from sklearn.kernel_ridge import KernelRidge
import pickle

data = load_csv("processed_big5.csv")

#extract out inputs and outputs
#row 0 is the header; the first 80% of the file is used for training and the
#remainder for testing. Columns 6 onwards are the word features.
split_index = int(0.8 * len(data))
train_x = [[float(value) for value in row[6:]] for row in data[1:split_index]]
test_x = [[float(value) for value in row[6:]] for row in data[split_index:]]

#columns 1-5 are the targets; the header cell of each column is its key
outputs = {}
for column in range(1,6):
    key = data[0][column]
    y_train = [float(row[column]) for row in data[1:split_index]]
    y_test = [float(row[column]) for row in data[split_index:]]

    outputs[key] = {"train":y_train, "test":y_test}

#build and evaluate models: one independent model per target column
for Y_key, targets in outputs.items():

    model = linear_model.Ridge(alpha=0.25)
    #model = linear_model.Lasso(alpha=0.5)
    #model = RandomForestRegressor(max_depth=5, random_state=0)
    #model = KernelRidge(kernel="rbf", gamma=0.1, alpha=0.5)
    #model = SVR(kernel="poly")
    model.fit(train_x, targets["train"])

    r2 = model.score(test_x, targets["test"])
    print(Y_key + " r-squared: " + str(r2))

    #pickle model; the with-block closes the file even if dumping fails
    with open(Y_key+".model", "wb") as fh:
        pickle.dump(model, fh)

## Apply model

In [None]:
from sklearn import linear_model
import pickle, requests, time

def retrieve_comments(username):
    """Download a user's comment history and return it as one token list.

    Pages through https://www.reddit.com/user/<username>/comments.json, 100
    comments per request, until a page comes back empty. Each comment has its
    non-ascii characters dropped and is passed through tokenize().

    Returns the tokens of all retrieved comments, in retrieval order. If the
    response contains an 'error' entry, paging stops and whatever was
    collected so far (possibly nothing) is returned.
    """
    comment_text = None
    last_id = None
    usertext = []

    while comment_text != []:
        url = "https://www.reddit.com/user/{}/comments.json?limit=100".format(username)
        if last_id is not None:
            #continue after the last comment of the previous page
            url += "&after={}".format(last_id)
        r = requests.get(url, headers = {'User-agent':user_agent_string})

        r_data = r.json()

        #an error payload has no 'data' listing to parse, whatever its code,
        #so stop here instead of failing on the missing key
        if 'error' in r_data:
            print("stopping at error response for {}: {}".format(username, r_data))
            break

        #parse through float() so decimal-formatted values such as "598.0" do
        #not raise, and tolerate responses that lack the headers altogether
        till_reset = float(r.headers.get('x-ratelimit-reset', 0))
        req_remaining = float(r.headers.get('x-ratelimit-remaining', 1))

        #no requests left: wait for the window to reset, plus a margin
        if req_remaining < 1:
            time.sleep(till_reset+5)

        comment_text, last_id = get_comment_text(r_data)

        for comment in comment_text:
            comment = comment.encode("ascii", errors="ignore").decode()
            #extend in place; rebuilding the list with + for every comment
            #copies everything collected so far each time
            usertext.extend(tokenize(comment))

        #pause between requests
        time.sleep(5)

    return usertext
                
#math is otherwise only imported by the processing cell; import it here so
#this cell does not depend on that one having been run
import math
from collections import Counter

#get target user
target_user = None #replace with the username to score

#get wordlist from processed_big5.csv (header row, feature columns only)
csv = load_csv("processed_big5.csv")
wordlist = csv[0][6:]

#retrieve user comment history
usertext = retrieve_comments(target_user)

#process and count words from comment history
#the tf-idf weighting scheme seems like it won't work on single-document corpora
#count all tokens in one pass instead of rescanning usertext for every word
token_counts = Counter(usertext)
counts = [[math.log(token_counts[word]+0.000000001) for word in wordlist]]

#load models
#NOTE: unpickling can execute arbitrary code - only load .model files that
#were written by the build step above
models = {}
for trait in "OCEAN":
    with open(trait+".model", "rb") as fh:
        models[trait] = pickle.load(fh)

#keep the individual names available as well
O_model, C_model, E_model, A_model, N_model = (models[trait] for trait in "OCEAN")

#run model.predict() for each model
for trait in "OCEAN":
    print("{}: {}".format(trait, models[trait].predict(counts)))