### This notebook creates a topic model using only nouns

In [81]:
import pandas as pd
import numpy as np
import glob

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity

import scipy.sparse
from scipy.sparse import csr_matrix, hstack

import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
from sklearn.preprocessing import StandardScaler
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

In [118]:
data = pd.read_csv('sample3.csv')

### process text

In [119]:
#only keep nouns

db = data[(data["pos"]=='NN') | (data["pos"]=='NNS') | (data["pos"]=='NNP')| (data["pos"]=='NNPS')]


#retaining labels
db_mapping = db[['chap_num','gender','id']].copy()
db_mapping.drop_duplicates(inplace=True)
print(len(db_mapping))
db_mapping.head()

#Aggregating corpus
db.token_str = db.token_str.astype('str')
cdb = db.groupby('chap_num')\
    .apply(lambda x: ' '.join(x.token_str))\
    .to_frame()\
    .rename(columns={0:'text'})
cdb.head()

#Merginig db
dbm = pd.merge(cdb,db_mapping,left_index=True,right_on='chap_num')

dbm.text = dbm.text.str.lower()
dbm.text = dbm.text.str.replace('[^a-zA-Z]'," ")
dbm.text = dbm.text.str.replace('urllink'," ")
dbm.text = dbm.text.str.replace('nbsp'," ")
dbm.text = dbm.text.str.replace(r'\n',' ')
dbm.text = dbm.text.str.replace(r'\s+',' ')
dbm.text = dbm.text.str.replace('([ ]{2,})',' ')
dbm = dbm[~dbm.text.str.match(r'^\s*$')]
dbm = dbm[dbm.text.apply(lambda x:len(x))>10]

9975


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [120]:
joined = dbm.groupby('id')\
.apply(lambda x: ' '.join(x.text))\
    .to_frame()\
    .rename(columns={0:'text1'})

In [121]:
joined.reset_index(inplace = True)

In [122]:
cols = ['id', 'gender']

gendid = dbm[cols]
gendid.drop_duplicates(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [123]:
joined = joined.merge(gendid, on='id', how = 'left')

### create tfidf and topic models

In [124]:
vect = TfidfVectorizer(min_df = .005,lowercase=True,stop_words='english',norm='l1',ngram_range=(1,2),max_features=1500).fit(joined['text1'])
tfidf = vect.transform(joined['text1'])
tfidf = tfidf.toarray()

In [188]:
lda = LatentDirichletAllocation(n_components=50, max_iter=50,learning_method='online', random_state=0,doc_topic_prior = .35)
lda.fit(tfidf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=0.35,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=50, mean_change_tol=0.001,
             n_components=50, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [189]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = f"Topic #{topic_idx}: "
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [190]:
#inspect topics

tf_feature_names = vect.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words=10)

Topic #0: look peace break bar huh ben let winter sites socks
Topic #1: person word end problem monday nights stop students york try
Topic #2: world friends friend business feeling care information pool field form
Topic #3: post yeah cool john bye experience yea dreams wind view
Topic #4: lol kids page shot wan links son collection answer joy
Topic #5: job team group water dinner order film beach cute alex
Topic #6: guy computer thanks gon company reality went male brain david
Topic #7: day pictures songs june plan just hehehe remember em television
Topic #8: heart eyes fact rest pain games truth question children floor
Topic #9: time week mind mouth ca market update butt heh tape
Topic #10: food boy feelings books movies august middle moments matt boyfriend
Topic #11: moment brother hair sorry new article crap road gmail service
Topic #12: im http case note list video star sucks pages left
Topic #13: weeks thats sort hmmm looks le corner window amp dogs
Topic #14: morning couple city 

In [192]:
#find max topic for each document
tr1['max'] = tr1.idxmax(axis = 1)
finalDf = pd.concat([tr1, joined[['gender']]], axis = 1)

In [193]:
#split into different dfs for each gender
male = finalDf[finalDf['gender'] == 'male']

female = finalDf[finalDf['gender'] == 'female']


In [194]:
#male topic assignments

male['max'].value_counts()

42    67
5     67
3     63
11    61
40    58
24    57
38    55
49    55
30    52
44    52
39    49
2     47
29    47
26    44
41    43
6     42
45    42
43    40
21    40
12    39
25    36
46    34
7     33
48    33
17    32
18    32
20    32
0     32
14    31
15    30
37    29
8     28
32    28
22    28
19    28
4     27
1     27
23    26
47    26
27    26
35    25
9     23
28    22
34    21
31    20
36    20
10    19
33    18
16    14
13    14
Name: max, dtype: int64

In [195]:
#female assigments

female['max'].value_counts()

24    80
3     63
2     58
26    57
44    52
23    50
38    50
45    49
40    48
41    46
30    46
19    45
5     45
22    42
15    41
8     39
35    39
6     39
42    39
43    39
27    38
47    38
21    38
16    37
7     35
12    35
20    35
48    35
4     34
49    34
46    34
29    33
39    32
17    31
31    31
11    30
0     29
34    29
14    29
10    29
28    27
13    27
18    27
9     25
36    20
1     20
25    20
37    19
32    18
33    15
Name: max, dtype: int64