# Data Preprocessing Version with one row per author:

### Features (as in paper):
1. 11.140 ngram features: tf and tf-idf weighted word and character ngrams stemmed with Porter's stemmer
2. type-token ratio
3. ratio of comments in English
4. ratio of British english vs. American English words
5. 93 features from LIWC 
6. 26 PSYCH features (Preotiuc: Paraphrase Database and MRC Psycholinguistics Database)

### Columns (from the description of the dataset):
1. 'global':[7,10], #subreddits_commented, subreddits_commented_mbti, num_comments
2. 'liwc':[10,103], #liwc
3. 'word':[103,3938], #top1000 word ngram (1,2,3) per dimension based on chi2
4. 'char':[3938,7243], #top1000 char ngrams (2,3) per dimension based on chi2
5. 'sub':[7243,12228], #number of comments in each subreddit
6. 'ent':[12228,12229], #entropy
7. 'subtf':[12229,17214], #tf-idf on subreddits
8. 'subcat':[17214,17249], #manually crafted subreddit categories
9. 'lda50':[17249,17299], #50 LDA topics
10. 'posts':[17299,17319], #posts statistics
11. 'lda100':[17319,17419], #100 LDA topics
12. 'psy':[17419,17443], #psycholinguistic features
13. 'en':[17443,17444], #ratio of english comments
14. 'ttr':[17444,17445], #type token ratio
15. 'meaning':[17445,17447], #additional pyscholinguistic features
16. 'time_diffs':[17447,17453], #commenting time diffs
17. 'month':[17453,17465], #monthly distribution
18. 'hour':[17465,17489], #hourly distribution
19. 'day_of_week':[17489,17496], #daily distribution
20. 'word_an':[17496,21496], #word ngrams selected by F-score
21. 'word_an_tf':[21496,25496], #tf-idf ngrams selected by F-score
22. 'char_an':[25496,29496], #char ngrams selected by F-score
23. 'char_an_tf':[29496,33496], #tf-idf char ngrams selected by F-score
24. 'brit_amer':[33496,33499], #british vs american english ratio


## Import packages

In [1]:
import nltk
# import ssl

# try:
#     _create_unverified_https_context = ssl._create_unverified_context
# except AttributeError:
#     pass
# else:
#     ssl._create_default_https_context = _create_unverified_https_context
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.util import bigrams, ngrams
import re
import string
from string import punctuation
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_extraction.text import CountVectorizer 
from collections import Counter
from num2words import num2words 
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import datetime
import random
random.seed(32)

# close nltk download window to continue

[nltk_data] Downloading package punkt to /home/sophia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/sophia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/sophia/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to /home/sophia/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


## Import data

In [11]:
df = pd.read_csv('/home/sophia/ma_py/pandora_bigfive1000.csv')
# print(pandora.info(verbose=True))
df.head()

Unnamed: 0,author,author_flair_text,body,downs,created_utc,subreddit_id,link_id,parent_id,score,controversiality,gilded,id,subreddit,ups,word_count,word_count_quoteless,lang
0,Sabata11792,,Not seeing any break or signal lights and no p...,,1534890968,t5_3fqup,t3_995l9s,t1_e4lbrls,1.0,0,0,e4lkg2l,ATBGE,,19,19,en
1,Swarels,INTP,"Multiverses, matrix theory, consciousness. Scr...",,1499749893,t5_2qhvl,t3_6mjw62,t1_dk26jre,7.0,0,0,dk26vpo,INTP,,51,46,en
2,pearlz176,Manchester United,Hope you've enjoyed the ride :D,,1485613795,t5_2qi58,t3_5qnd1v,t1_dd0mdqg,2.0,0,0,dd0nxif,soccer,,6,6,en
3,rainbowhotpocket,Colts,"Idk, in the AFC if i recall correctly since 20...",,1466965660,t5_2qmg3,t3_4pypuh,t1_d4oulvo,11.0,0,0,d4our0i,nfl,11.0,62,61,en
4,amathyx,http://myanimelist.net/profile/amathy,22 hours later and the music is still going[co...,,1495851189,t5_2qh22,t3_6ddiow,t3_6ddiow,6.0,0,0,di3inz7,anime,,32,29,en


In [18]:
# change language to numeric representation
def adjust(df):
    # change lang to numerical representation
    language = df['lang'].values.tolist()
    language = set(language)
    language
    df['language']= np.select([df.lang == 'en', df.lang == 'es', df.lang == 'nl'], 
                            [0, 1, 2], 
                            default=3)
    # print(gramsdf['language'])
    df = df.drop(columns=['lang'])

    return df

df = adjust(df)

In [12]:
def create_timecolumns(df):
    readable = []
    weekday = []
    month = []
    year = []
    for row in df['created_utc']:
        item = datetime.datetime.fromtimestamp(row)
        weekday_item = item.strftime('%A')
        readable_item = datetime.datetime.fromtimestamp(row).isoformat()
        month.append(str(readable_item[5:7]))
        year.append(str(readable_item[0:4]))
        readable.append(readable_item)
        weekday.append(weekday_item)
    df['time'] = readable
    df['weekday'] = weekday
    df['month'] = month
    df['year'] = year
    return df

# pandora = create_timecolumns(pandora)
# pandora.head()
# test = pandora.iloc[0]['time']
# print(test)
# print(test[0:4])
# lst = pandora.weekday.tolist()
# lstset = set(lst)
# print(lstset)

In [13]:
def timecounter(lst, vocablst):
    if vocablst == 'weekday':
        vocab = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    elif vocablst == 'month':
        vocab = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
    elif vocablst == 'year':
        vocab = ['2015', '2016', '2017', '2018', '2019']
    else:
        print("No valid input: vocab list")
    vectorizer = CountVectorizer(analyzer="word", vocabulary=vocab)
    vectors = vectorizer.fit_transform(lst)
    v = vectors.toarray()
#     is_all_zero = np.all((v == 0))
#     names = vectorizer.get_feature_names()
    return v

# item = ['Sunday Tuesday']
# print(item)
# test = timecounter(item, 'weekday')
# print(test)

In [14]:
def subredditcounter(df, lst):
    lst = df['subreddit'].tolist()
    subredditset = set(lst)
    subredditlist = list(subredditset)
    vectorizer = CountVectorizer(analyzer="word", vocabulary=subredditlist)
    vectors = vectorizer.fit_transform(lst)
    v = vectors.toarray()
    return v

In [32]:
def create_authordf(df):    
    # body
    df['complete_body'] = df.groupby(['author'])['body'].transform(lambda x : ' '. join(x))
    # language
    df['lang'] = df['language'].apply(lambda x: str(x))
    df['all_lang'] = df.groupby(['author'])['lang'].transform(lambda x : ' '. join(x))
    # created_utc
    df['utc_lst'] = df['created_utc'].apply(lambda x: str(x))
    df['all_utc'] = df.groupby(['author'])['utc_lst'].transform(lambda x : ' '. join(x))
    df['all_utc'] = df['all_utc'].apply(lambda x: x.split())
    # controversiality
    df['mean_controversiality'] = df.groupby(['author']).agg({'controversiality': ['mean']})
    df['mean_controversiality'] = df['mean_controversiality'].fillna(0)
    # gilded
    df['mean_gilded'] = df.groupby(['author']).agg({'gilded': ['mean']})
    df['mean_gilded'] = df['mean_gilded'].fillna(0)
    # number of subreddits
    df['num_subreddits'] = df.groupby(['author'])['subreddit'].transform(lambda x : ' '. join(x))
    df['num_subreddits'] = df['num_subreddits'].apply(lambda x: len(set(x.split())))
    # number of comments per subreddit
    df['subreddit_dist'] = df.groupby(['author'])['subreddit'].transform(lambda x : ' '. join(x))
    subreddit = subredditcounter(df, df['subreddit_dist'])
    df['subreddit_dist'] = subreddit.tolist()
    # time
    df = create_timecolumns(df)
    df['weekday_dist'] = df.groupby(['author'])['weekday'].transform(lambda x : ' '. join(x))
    weekday = timecounter(df['weekday_dist'], 'weekday')
    df['weekday_dist'] = weekday.tolist()
    df['month_dist'] = df.groupby(['author'])['month'].transform(lambda x : ' '. join(x))
    month = timecounter(df['month_dist'], 'month')
    df['month_dist'] = month.tolist()
    df['year_dist'] = df.groupby(['author'])['year'].transform(lambda x : ' '. join(x))
    year = timecounter(df['year_dist'], 'year')
    df['year_dist'] = year.tolist()
    
    newdf = df[['author', 'complete_body', 'all_utc', 'mean_controversiality', 
                'mean_gilded', 'num_subreddits', 'subreddit_dist', 'weekday_dist', 
                'month_dist', 'year_dist', 'all_lang']]
    newdf = newdf.sort_values(by='author')
    newdf = newdf.drop_duplicates(subset=['author'])
    return newdf


pandora = create_authordf(df)
pandora
pandora['body'] = pandora['complete_body']
pandora
# print(type(newdf.iloc[428]['weekday_dist']))

Unnamed: 0,author,complete_body,all_utc,mean_controversiality,mean_gilded,num_subreddits,subreddit_dist,weekday_dist,month_dist,year_dist,all_lang,body
906,-BigSexy-,Oooh i see,[1510236798],0.0,0.0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]","[0, 0, 1, 0, 0]",0,Oooh i see
145,-BlitzN9ne,**Quality** material right here,[1549708109],0.0,0.0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0]","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 1]",0,**Quality** material right here
367,-CrestiaBell,A slidewhistle or a meow-meow board That's bec...,"[1538664591, 1475867279, 1505862626, 151267621...",0.0,0.0,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 2]","[0, 2, 3, 1, 1]",0 0 0 0 0 0 0,A slidewhistle or a meow-meow board That's bec...
295,-tactical-throw-away,Sorry for your feelings. Kek &lt;------- This ...,"[1498536785, 1486701409, 1506834463]",0.0,0.0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0]","[0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]","[0, 0, 3, 0, 0]",0 0 0,Sorry for your feelings. Kek &lt;------- This ...
791,137288,Carly's so glad to get your .0000003 cents Exc...,"[1536611153, 1550537879, 1516548513, 1523299682]",0.0,0.0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0]","[1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0]","[0, 0, 0, 3, 1]",0 0 0 0,Carly's so glad to get your .0000003 cents Exc...
...,...,...,...,...,...,...,...,...,...,...,...,...
324,xanthraxoid,I'd really like this video to include some inf...,"[1469892161, 1486826547, 1498046590, 1550346594]",0.0,0.0,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0]","[0, 2, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0]","[0, 1, 2, 0, 1]",0 0 3 0,I'd really like this video to include some inf...
954,xenomouse,"You're a guy, aren't you? I can definitely see...","[1506710219, 1502740906, 1517847908, 1506874589]",0.0,0.0,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0]","[0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0]","[0, 0, 3, 1, 0]",0 0 0 0,"You're a guy, aren't you? I can definitely see..."
208,xeroctr3,man even the thought of it makes me depressed....,[1521414051],0.0,0.0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 1, 0]",0,man even the thought of it makes me depressed....
990,xzack18,Not all of us are out to kill,[1533749569],0.0,0.0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]","[0, 0, 0, 1, 0]",0,Not all of us are out to kill


In [34]:
authors = pd.read_csv('/home/sophia/ma_py/author_profiles.csv')
# find missing data in big five traits
authorslst = authors['author'].tolist()
print("Author search: ", 'DarthHedonist' in authorslst)
print("Author search: ", 'FonsoTheWhitesican' in authorslst)
print("Author search: ", 'chaosking121' in authorslst)

bigfive = authors[['author','agreeableness','openness','conscientiousness','extraversion','neuroticism']]
bigfive = bigfive.dropna()
# print(bigfive[bigfive['author'] == "DarthHedonist"])

# pandoradf = pd.merge(pandora, bigfive, how='left', on='author')
pandoradf = pandora.merge(bigfive, how='left', on=['author'])
# pandoradf = pandoradf.dropna()
pandoradf = pandoradf.sort_values(by='author')
pandoradf = pandoradf[pandoradf['agreeableness'].notna()]
pandoradf = pandoradf.reset_index()


print("Length of dataframe: ", len(pandoradf))
print("NaN in df? ", pandoradf.isnull().any().any())
print("Sum of NaN in agreeableness", pandoradf['agreeableness'].isnull().values.sum())
print("Sum of NaN in openness", pandoradf['openness'].isnull().values.sum())
print("Sum of NaN in conscientiousness", pandoradf['conscientiousness'].isnull().values.sum())
print("Sum of NaN in extraversion", pandoradf['extraversion'].isnull().values.sum())
print("Sum of NaN in neuroticism", pandoradf['neuroticism'].isnull().values.sum())
# nan_values = pandoradf[pandoradf['neuroticism'].isna()]
# nan_values
pandoradf.head()
# pandoradf[pandoradf.isnull().any(axis=1)]

# number of entries does not fit

Author search:  True
Author search:  True
Author search:  True
Length of dataframe:  422
NaN in df?  False
Sum of NaN in agreeableness 0
Sum of NaN in openness 0
Sum of NaN in conscientiousness 0
Sum of NaN in extraversion 0
Sum of NaN in neuroticism 0


Unnamed: 0,index,author,complete_body,all_utc,mean_controversiality,mean_gilded,num_subreddits,subreddit_dist,weekday_dist,month_dist,year_dist,all_lang,body,agreeableness,openness,conscientiousness,extraversion,neuroticism
0,0,-BigSexy-,Oooh i see,[1510236798],0.0,0.0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]","[0, 0, 1, 0, 0]",0,Oooh i see,39.0,92.0,1.0,18.0,4.0
1,1,-BlitzN9ne,**Quality** material right here,[1549708109],0.0,0.0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0]","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 1]",0,**Quality** material right here,50.0,85.0,15.0,50.0,30.0
2,2,-CrestiaBell,A slidewhistle or a meow-meow board That's bec...,"[1538664591, 1475867279, 1505862626, 151267621...",0.0,0.0,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 2]","[0, 2, 3, 1, 1]",0 0 0 0 0 0 0,A slidewhistle or a meow-meow board That's bec...,50.0,85.0,50.0,85.0,50.0
3,3,-tactical-throw-away,Sorry for your feelings. Kek &lt;------- This ...,"[1498536785, 1486701409, 1506834463]",0.0,0.0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0]","[0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]","[0, 0, 3, 0, 0]",0 0 0,Sorry for your feelings. Kek &lt;------- This ...,2.0,92.0,31.0,60.0,53.0
4,4,137288,Carly's so glad to get your .0000003 cents Exc...,"[1536611153, 1550537879, 1516548513, 1523299682]",0.0,0.0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0]","[1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0]","[0, 0, 0, 3, 1]",0 0 0 0,Carly's so glad to get your .0000003 cents Exc...,10.0,87.0,49.0,7.0,87.0


In [23]:
def bigfive_cat(df):
    # change big five to binary representation
    df['agree'] = df['agreeableness'].apply(lambda x: 0 if x<50 else 1)
    df['openn'] = df['openness'].apply(lambda x: 0 if x<50 else 1)
    df['consc'] = df['conscientiousness'].apply(lambda x: 0 if x<50 else 1)
    df['extra'] = df['extraversion'].apply(lambda x: 0 if x<50 else 1)
    df['neuro'] = df['neuroticism'].apply(lambda x: 0 if x<50 else 1)
    return df

## Adjust representations of some columns

## Feature extraction

In [24]:
def choose_stopwordlist(df, mode):
    if mode == 'NLTK':
        stopwordList = stopwords.words('english')
    if mode == 'NLTK-neg':
        stopwordList = stopwords.words('english')
        stopwordList.remove('no')
        stopwordList.remove('nor')
        stopwordList.remove('not')
    return stopwordList

# stopwordList = choose_stopwordlist(pandoradf, mode='NLTK-neg')

# print(stopwordList)

### Preprocessing

1. lower 
2. tokenize
3. numbers to words
4. delete special tokens

In [25]:
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

# featuredf['probody'] = featuredf['body'].apply(lambda x:(decontracted(''.join(x))))
# print(featuredf.iloc[5]['probody'])

def senttokenize(df):
    sentbody = []
    for row in df['body']:
        sentences = sent_tokenize(row)
        sentbody.append(sentences)
    df['senttokens'] = sentbody
    return df

In [26]:
def low_stop_num_token(workdf, stopwordList):
    # lower, remove special characters, remove stopwords
    workdf['probody'] = workdf['probody'].apply(lambda x: ' '.join([x.lower() for x in x.split() if x.isalnum()]))
    workdf['probody'] = workdf['probody'].apply(lambda x: ' '.join([x for x in x.split() if (x not in stopwordList)]))
    newbody = []
    newprobody = []
    # num2words
    for sentence in tqdm(workdf['probody']):
        # string to list
        inputtext = sentence.split()
        numlist = []
        for i in range(len(inputtext)):
            if inputtext[i].isnumeric():
                numlist.append(i)
        for number in numlist:
            inputtext[number] = num2words(inputtext[number])
        
        # list to string
        celltext = ' '.join(inputtext)
        newprobody.append(celltext)
        # tokenize
        words = word_tokenize(celltext)
        newbody.append(words)
    workdf['probody'] = newprobody
    workdf['tokens'] = newbody
    return workdf

# preprocesseddf = preprocessing(featuredf)
# print(preprocesseddf.iloc[2]['body'])
# preprocesseddf.head()
# preprocesseddf.info()

In [27]:
tqdm.pandas()
# Porter Stemmer
def stemming(df):
    ps = PorterStemmer()
    df['tokens'] = df['tokens'].progress_apply(lambda x:([ps.stem(word) for word in x]))
    return df

# stemmeddf = stemming(preprocesseddf)
# print(stemmeddf.iloc[1]['tokens'])
# stemmeddf.head()

In [28]:
## Sort dataframe

In [29]:
# gramsdf.info(verbose=True)

In [36]:
def ordering(df):
    lst = []
    for i in range(len(df)):
        lst.append(df.author[i] + str(i))
    df['ident'] = lst
    
    cols_tomove = ['index', 'author', 'ident', 'body', 'probody', 'tokens', 'senttokens', 'agreeableness', 'openness', 'conscientiousness', 'extraversion', 'neuroticism', 'agree', 'openn', 'consc', 'extra', 'neuro']
    orderdf  = df[cols_tomove + [col for col in df.columns if col not in cols_tomove]]
#     orderdf.info(verbose=True)
    return orderdf

# Wrapper

In [37]:
def preprocess(df):
    # adjust some column representations
    df = bigfive_cat(df)
    # choose stopwordlist with or without negation
    stopwordList = choose_stopwordlist(df, mode='NLTK-neg')
    # decontract abbreviations (e.g., n't to not)
    df['probody'] = df['body'].apply(lambda x:(decontracted(''.join(x))))
    # create sentence tokens
    df = senttokenize(df)
    # lower, remove stopwords, num2words, tokenize
    df = low_stop_num_token(df, stopwordList)
    # porters stemmer
    df = stemming(df)
    df = ordering(df)
    return df

predf = preprocess(pandoradf)
print(predf.ident)


  0%|          | 0/422 [00:00<?, ?it/s]

  0%|          | 0/422 [00:00<?, ?it/s]

0                 -BigSexy-0
1                -BlitzN9ne1
2              -CrestiaBell2
3      -tactical-throw-away3
4                    1372884
               ...          
417           xanthraxoid417
418             xenomouse418
419              xeroctr3419
420               xzack18420
421           zugzwang_03421
Name: ident, Length: 422, dtype: object


In [38]:
predf.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 422 entries, 0 to 421
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   index                  422 non-null    int64  
 1   author                 422 non-null    object 
 2   ident                  422 non-null    object 
 3   body                   422 non-null    object 
 4   probody                422 non-null    object 
 5   tokens                 422 non-null    object 
 6   senttokens             422 non-null    object 
 7   agreeableness          422 non-null    float64
 8   openness               422 non-null    float64
 9   conscientiousness      422 non-null    float64
 10  extraversion           422 non-null    float64
 11  neuroticism            422 non-null    float64
 12  agree                  422 non-null    int64  
 13  openn                  422 non-null    int64  
 14  consc                  422 non-null    int64  
 15  extra 

In [39]:
predf['probody']

0                                               oooh see
1                                         material right
2      slidewhistle board watch cartoon school martin...
3                   sorry kek onekek kek kek kek nothing
4      carly glad get cents except uk debuted modern ...
                             ...                        
417    would really like video include information cr...
418    not definitely see would make boy scene feel n...
419    man even thought makes loving someone cant kno...
420                                          not us kill
421    institutions accommodate religious serious med...
Name: probody, Length: 422, dtype: object

## Export dataframe

In [40]:
predf.to_pickle("preprocessed_author.pkl")