### Here is the notebook to go from the raw corpus to F4 format.  The notebook works backwards, as the full dataset is much larger than what we were able to analyze.  We first process more than half of the original dataset, before taking a much smaller sample from that original dataset for our tfidf and analysis/prediction in other notebooks.

In [1]:
#import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import re
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0
%matplotlib inline
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('tagsets')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package tagsets to /home/ec2-user/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
PARA_PAT = r'\n\n+'
SENT_PAT = r'([.;?!"“”]+)'
TOKEN_PAT = r'(\W+)'

### Read data, detect language, and store which posts were in english

In [2]:
df = pd.read_csv('blogtext.csv')

In [3]:
def findlang(text):
    try: 
        lang = detect(text)
    except:
        lang = "none"
    return lang

#df['lang'] = df['text'].apply(lambda x: findlang(x))

In [10]:
df2 = df.iloc[:376000, :]
df2['lang'] = df2['text'].apply(lambda x: findlang(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [13]:
english = df2.index[df2['lang'] == 'en'].tolist()

In [20]:
englishdf = pd.DataFrame(english)
englishdf
englishdf.to_csv('english.csv', index = False)

In [21]:
englishtest = pd.read_csv('english.csv')
englishtest.head()

Unnamed: 0,0
0,0
1,2
2,3
3,4
4,5


### raw text to paragraphs

In [5]:
df.index.names = ['doc_id']

In [8]:
paras = df.text.str.split(PARA_PAT, expand=True)\
    .stack()\
    .to_frame()\
    .rename(columns={0:'para_str'})
paras.index.names = ['doc_id', 'para_num']

In [10]:
paras.para_str = paras.para_str.str.strip()
paras.para_str = paras.para_str.str.replace(r'\n', ' ')
paras.para_str = paras.para_str.str.replace(r'\s+', ' ')
paras = paras[~paras.para_str.str.match(r'^\s*$')]

Note: It appears there are no paragraphs in our dataset.  All posts have been combined to a single paragraph. Still leaving that column in our token table

In [15]:
paras1 = paras.iloc[:100000,:]
paras1.shape

(100000, 1)

### Going from paragraphs to setences was too big for our memory, so wrote function to process in chunks

In [45]:
def processdata(parasinp):

    sents = parasinp.para_str.str.split(SENT_PAT, expand=True)\
        .stack()\
        .to_frame()\
        .rename(columns={0:'sent_str'})
    sents.index.names = ['doc_id', 'para_num', 'sent_num']
    tokens = sents.sent_str\
        .apply(lambda x: pd.Series(nltk.pos_tag(nltk.word_tokenize(x))))\
        .stack()\
        .to_frame()\
        .rename(columns={0:'pos_tuple'})
    tokens.index.names = ['chap_num', 'para_num', 'sent_num', 'token_num']
    tokens['pos'] = tokens.pos_tuple.apply(lambda x: x[1])
    tokens['token_str'] = tokens.pos_tuple.apply(lambda x: x[0])
    tokens = tokens.drop('pos_tuple', 1)
    tokens['punc'] = tokens.token_str.str.match(r'^[\W_]*$').astype('int')
    tokens['num'] = tokens.token_str.str.match(r'^.*\d.*$').astype('int')
    return tokens
    

In [62]:
paras1 = paras.iloc[:1000,:]
tok = processdata(paras1)

numiter = 677
start = 1000
end = 2000
count = 0

while count < numiter:
    paras2 = paras.iloc[start:end,:]
    tok2 = processdata(paras2)
    tok = pd.concat([tok,tok2], axis = 0, sort = False)
    
    count += 1
    start +=1000
    end +=1000
    print('batch: ',count)

tok.shape

batch:  1
batch:  2
batch:  3
batch:  4
batch:  5
batch:  6
batch:  7
batch:  8
batch:  9
batch:  10
batch:  11
batch:  12
batch:  13
batch:  14
batch:  15
batch:  16
batch:  17
batch:  18
batch:  19
batch:  20
batch:  21
batch:  22
batch:  23
batch:  24
batch:  25
batch:  26
batch:  27
batch:  28
batch:  29
batch:  30
batch:  31
batch:  32
batch:  33
batch:  34
batch:  35
batch:  36
batch:  37
batch:  38
batch:  39
batch:  40
batch:  41
batch:  42
batch:  43
batch:  44
batch:  45
batch:  46
batch:  47
batch:  48
batch:  49
batch:  50
batch:  51
batch:  52
batch:  53
batch:  54
batch:  55
batch:  56
batch:  57
batch:  58
batch:  59
batch:  60
batch:  61
batch:  62
batch:  63
batch:  64
batch:  65
batch:  66
batch:  67
batch:  68
batch:  69
batch:  70
batch:  71
batch:  72
batch:  73
batch:  74
batch:  75
batch:  76
batch:  77
batch:  78
batch:  79
batch:  80
batch:  81
batch:  82
batch:  83
batch:  84
batch:  85
batch:  86
batch:  87
batch:  88
batch:  89
batch:  90
batch:  91
batch:  

KeyboardInterrupt: 

We processed over half the data, this took about 4 hrs on a large AWS machine so we halted the processing and decided to work with what had already been processed

In [64]:
#write large token table to csv
#tok.to_csv('token.csv')

### read data back in and join with other values we want such as topic, gender, and id

In [22]:
tokenstest = pd.read_csv('token.csv', index_col=[0,1,2,3])

  mask |= (ar1 == a)


In [23]:
df.index.names = ['chap_num']

In [24]:
dfjoin = df[['gender', 'topic','id']]

In [25]:
joined = tokenstest.join(dfjoin, on='chap_num', how = 'left')

In [26]:
joined.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,pos,token_str,punc,num,gender,topic,id
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,0,0,NNP,Info,0,0,male,Student,2059027
0,0,0,1,VBZ,has,0,0,male,Student,2059027
0,0,0,2,VBN,been,0,0,male,Student,2059027
0,0,0,3,VBN,found,0,0,male,Student,2059027
0,0,0,4,(,(,1,0,male,Student,2059027


### filter out non english posts

In [32]:
testjoined = joined.loc[joined.index.isin(english, level='chap_num')]

In [35]:
testjoined.head(1000)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,pos,token_str,punc,num,gender,topic,id
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,0,0,NNP,Info,0,0,male,Student,2059027
0,0,0,1,VBZ,has,0,0,male,Student,2059027
0,0,0,2,VBN,been,0,0,male,Student,2059027
0,0,0,3,VBN,found,0,0,male,Student,2059027
0,0,0,4,(,(,1,0,male,Student,2059027
0,0,0,5,JJ,+/-,1,0,male,Student,2059027
0,0,0,6,CD,100,0,1,male,Student,2059027
0,0,0,7,NNS,pages,0,0,male,Student,2059027
0,0,0,8,",",",",1,0,male,Student,2059027
0,0,0,9,CC,and,0,0,male,Student,2059027


### We realized our data was still way too big to be able to be analyzed, so decided to take sample of 10k, split evenly between posts written by male and female authors

In [16]:
joined['gender'].value_counts()

male      44909430
female    44685067
Name: gender, dtype: int64

In [36]:
male = testjoined[testjoined['gender'] == 'male']

In [37]:
subindex = male.index.get_level_values('chap_num')
ind = subindex.unique()

In [38]:
sample_ids = np.random.choice(ind, 5000, replace = False)

In [39]:
sample = male[subindex.isin(sample_ids)].copy()

In [40]:
female = testjoined[testjoined['gender'] == 'female']

In [41]:
female.shape

(44264177, 7)

In [42]:
fsubindex = female.index.get_level_values('chap_num')
find = fsubindex.unique()

In [43]:
fsample_ids = np.random.choice(find, 5000, replace = False)

In [44]:
fsample = female[fsubindex.isin(fsample_ids)].copy()

In [45]:
fsample.shape

(1302064, 7)

In [46]:
combinedsample = pd.concat([sample, fsample], axis=0, sort=False)

In [47]:
combinedsample.shape

(2506161, 7)

In [48]:
len(combinedsample.index.get_level_values('chap_num').unique())

10000

In [49]:
#write sample of data to csv

#combinedsample.to_csv('sample3.csv')

In [3]:
combinedsample = pd.read_csv('sample3.csv', index_col=[0,1,2,3])
#subindex = combinedsample.index.get_level_values('chap_num')
#print(subindex)
#ind = subindex.unique()
#print(ind)
#sample_ids = np.random.choice(ind, 15000,replace = False)
#print(len(sample_ids))
#combinedsample = combinedsample[subindex.isin(sample_ids)].copy()

In [7]:
combinedsample.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,pos,token_str,punc,num,gender,topic,id
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
141,0,0,0,WDT,whatever,0,0,male,Non-Profit,3705830
141,0,0,1,PRP,it,0,0,male,Non-Profit,3705830
141,0,0,2,VBZ,is,0,0,male,Non-Profit,3705830
141,0,0,3,PRP,they,0,0,male,Non-Profit,3705830
141,0,0,4,VBD,put,0,0,male,Non-Profit,3705830


In [8]:
female = combinedsample[combinedsample['gender']=='female']
male = combinedsample[combinedsample['gender']=='male']


5000


### Here we create the tfidf for all posts, only male authors, and only female authors.  We reused the processing code but left the three different results below

In [42]:
WORDS = (male.punc == 0) & (combinedsample.num == 0)
male.loc[WORDS, 'term_str'] = male.token_str.str.lower()\
    .str.replace(r'["_*.]', '')

vocab = male[male.punc == 0].term_str.value_counts().to_frame()\
    .reset_index()\
    .rename(columns={'index':'term_str', 'term_str':'n'})
vocab = vocab.sort_values('term_str').reset_index(drop=True)
vocab.index.name = 'term_id'

# Get priors for V
vocab['p'] = vocab.n / vocab.n.sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [43]:
# Add stems
stemmer = nltk.stem.porter.PorterStemmer()
vocab['port_stem'] = vocab.term_str.apply(lambda x: stemmer.stem(x))

# Define stopwords
sw = pd.DataFrame({'x':1}, index=nltk.corpus.stopwords.words('english'))
vocab['stop'] = vocab.term_str.map(sw.x).fillna(0).astype('int')
del(sw)

# Add term_ids to tokens 
male['term_id'] = male['term_str'].map(vocab.reset_index()\
    .set_index('term_str').term_id).fillna(-1).astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [44]:
vocab.size

240625

In [45]:
WORDS = (male.punc == 0) & (male.num == 0) & male.term_id.isin(vocab[vocab.stop==0].index)

In [46]:
def createbow(tokens, OHCO, countitem):
    BOW = tokens[WORDS].groupby(OHCO+[countitem])[countitem].count()
    return BOW

In [47]:
bag = createbow(male, ['chap_num'], 'term_id')

In [49]:
#bagsmall = pd.DataFrame(bag)

#bagsmall = bagsmall[bagsmall['term_id'] > 1]

In [50]:
#bagsmall = pd.Series(bagsmall['term_id'])

In [51]:
#bagsmall

In [52]:
DTM = bag.unstack().fillna(0)

In [53]:
DTM.shape

(4999, 47974)

In [54]:
alpha = .000001 # We introduce an arbitrary smoothing value
alpha_sum = alpha * vocab.shape[0]
TF = DTM.apply(lambda x: (x + alpha) / (x.sum() + alpha_sum), axis=1)
TF.head()

term_id,0,1,2,3,4,5,6,7,8,9,...,48115,48116,48117,48118,48119,48120,48121,48122,48123,48124
chap_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
141,9.896275e-09,9.896275e-09,9.896275e-09,9.896275e-09,9.896275e-09,9.896275e-09,9.896275e-09,9.896275e-09,9.896275e-09,9.896275e-09,...,9.896275e-09,9.896275e-09,9.896275e-09,9.896275e-09,9.896275e-09,9.896275e-09,9.896275e-09,9.896275e-09,9.896275e-09,9.896275e-09
145,3.471642e-09,3.471642e-09,3.471642e-09,3.471642e-09,3.471642e-09,3.471642e-09,3.471642e-09,3.471642e-09,3.471642e-09,3.471642e-09,...,3.471642e-09,3.471642e-09,3.471642e-09,3.471642e-09,3.471642e-09,3.471642e-09,3.471642e-09,3.471642e-09,3.471642e-09,3.471642e-09
220,3.697114e-08,3.697114e-08,3.697114e-08,3.697114e-08,3.697114e-08,3.697114e-08,3.697114e-08,3.697114e-08,3.697114e-08,3.697114e-08,...,3.697114e-08,3.697114e-08,3.697114e-08,3.697114e-08,3.697114e-08,3.697114e-08,3.697114e-08,3.697114e-08,3.697114e-08,3.697114e-08
226,1.387961e-08,1.387961e-08,1.387961e-08,1.387961e-08,1.387961e-08,1.387961e-08,1.387961e-08,1.387961e-08,1.387961e-08,1.387961e-08,...,1.387961e-08,1.387961e-08,1.387961e-08,1.387961e-08,1.387961e-08,1.387961e-08,1.387961e-08,1.387961e-08,1.387961e-08,1.387961e-08
238,2.470279e-07,2.470279e-07,2.470279e-07,2.470279e-07,2.470279e-07,2.470279e-07,2.470279e-07,2.470279e-07,2.470279e-07,2.470279e-07,...,2.470279e-07,2.470279e-07,2.470279e-07,2.470279e-07,2.470279e-07,2.470279e-07,2.470279e-07,2.470279e-07,2.470279e-07,2.470279e-07


In [55]:
DTM.shape

(4999, 47974)

In [56]:
vocab.shape

(48125, 5)

In [57]:
N_docs = DTM.shape[0]
vocab['df'] = DTM[DTM > 0].count()
TFIDF = TF * np.log2(N_docs / vocab[vocab.stop==0]['df'])
TFIDF.head()

term_id,0,1,2,3,4,5,6,7,8,9,...,48115,48116,48117,48118,48119,48120,48121,48122,48123,48124
chap_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
141,1.215997e-07,1.215997e-07,1.215997e-07,1.215997e-07,1.215997e-07,1.215997e-07,1.215997e-07,1.215997e-07,1.215997e-07,1.215997e-07,...,1.215997e-07,1.215997e-07,1.215997e-07,1.215997e-07,1.215997e-07,1.215997e-07,1.215997e-07,1.215997e-07,1.215997e-07,1.215997e-07
145,4.265754e-08,4.265754e-08,4.265754e-08,4.265754e-08,4.265754e-08,4.265754e-08,4.265754e-08,4.265754e-08,4.265754e-08,4.265754e-08,...,4.265754e-08,4.265754e-08,4.265754e-08,4.265754e-08,4.265754e-08,4.265754e-08,4.265754e-08,4.265754e-08,4.265754e-08,4.265754e-08
220,4.542801e-07,4.542801e-07,4.542801e-07,4.542801e-07,4.542801e-07,4.542801e-07,4.542801e-07,4.542801e-07,4.542801e-07,4.542801e-07,...,4.542801e-07,4.542801e-07,4.542801e-07,4.542801e-07,4.542801e-07,4.542801e-07,4.542801e-07,4.542801e-07,4.542801e-07,4.542801e-07
226,1.705447e-07,1.705447e-07,1.705447e-07,1.705447e-07,1.705447e-07,1.705447e-07,1.705447e-07,1.705447e-07,1.705447e-07,1.705447e-07,...,1.705447e-07,1.705447e-07,1.705447e-07,1.705447e-07,1.705447e-07,1.705447e-07,1.705447e-07,1.705447e-07,1.705447e-07,1.705447e-07
238,3.035337e-06,3.035337e-06,3.035337e-06,3.035337e-06,3.035337e-06,3.035337e-06,3.035337e-06,3.035337e-06,3.035337e-06,3.035337e-06,...,3.035337e-06,3.035337e-06,3.035337e-06,3.035337e-06,3.035337e-06,3.035337e-06,3.035337e-06,3.035337e-06,3.035337e-06,3.035337e-06


In [58]:
vocab['tfidf_sum'] = TFIDF.sum()
vocab['tfidf_mean'] = TFIDF.mean()
vocab['tfidf_max'] = TFIDF.max()

In [59]:
TOPSM = pd.DataFrame(index=range(10))

In [60]:
for m in ['tfidf']:
    for n in ['mean','max']:
        key = '{}_{}'.format(m,n)
        TOPSM[key] = vocab.sort_values(key, ascending=False).term_str.head(10).tolist()

### TFIDF for both male and female authors

In [19]:
#COMBINED TFIDF
#TOPS

Unnamed: 0,tfidf_mean,tfidf_max
0,urllink,asshole/bitch
1,nbsp,yo-fish
2,'s,yayness
3,n't,whoops
4,'m,emo
5,like,overwhelmed
6,know,profit
7,one,testing
8,get,tsk
9,blog,bandwagon


### tfidf for female authors

In [30]:
#TOPSF

Unnamed: 0,tfidf_mean,tfidf_max
0,urllink,yo-fish
1,nbsp,asshole/bitch
2,'s,whoops
3,n't,testing
4,'m,tsk
5,like,smarterchild
6,know,sharonland
7,get,tittiez
8,today,mouth
9,really,internet


### tfidf for male authors

In [61]:
TOPSM

Unnamed: 0,tfidf_mean,tfidf_max
0,urllink,yayness
1,nbsp,emo
2,'s,overwhelmed
3,n't,profit
4,'m,testing
5,like,bandwagon
6,one,swirly
7,think,prodigals
8,get,parvez
9,blog,'chelsea
