***

# A Jupyter Notebook for the history of sociology project
### Analyzing journal articles using LDA

***


***

## Part I: run the model

***

In [1]:
# Imports and notebook-wide configuration.

from __future__ import print_function
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()  # lets pyLDAvis visualisations display automatically in the notebook

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import os.path
import pandas as pd
from glob import glob

import warnings
# Silences all warnings for the rest of the session. Only use this when you know the
# script and want to suppress unnecessary warnings.
# NOTE(review): a blanket 'ignore' also hides library deprecation warnings - consider
# narrowing the filter to the specific categories that are known to be noise.
warnings.filterwarnings('ignore')

# Main corpus path, intended to be used throughout the script.
# NOTE(review): machine-specific absolute path, and no cell visible in this notebook
# reads MACHINE_LEARNING - confirm it is still needed.
MACHINE_LEARNING = "C:/Users/renswilderom/Documents/Machine learning"

In [2]:
# Load the prepared corpus spreadsheet; the 'TEXT' column of this frame feeds the vectorizers below.
df = pd.read_excel(
    io="C:/Users/renswilderom/Documents/Dan Silver Projects/history of sociology/stm_prep_clean_9.xlsx"
)

In [3]:
# Preview the first five rows of the corpus table.
df.head(5)

Unnamed: 0,JOURNAL,YEAR,TITLE,TEXT
0,American_Journal_of_Sociology,1895,Christian Sociology I,while many considerations would advisable ...
1,American_Journal_of_Sociology,1895,Christian Sociology II,speak jesus anticipating regenerate society...
2,American_Journal_of_Sociology,1895,Christian Sociology,the term christian sociology unfortunate cer...
3,American_Journal_of_Sociology,1895,Contributions of the United States,number the student social science ...
4,American_Journal_of_Sociology,1895,Contributions to Social Philosophy II,this not chance world but world law both ...


In [4]:
%%time
# Build the term-frequency (raw count) document-term matrix for the corpus.

tf_vectorizer_original = CountVectorizer(
    strip_accents='unicode',
    stop_words='english',
    lowercase=True,
    token_pattern=r'\b[a-zA-Z]{3,}\b',  # keeps words of 3 or more characters
    max_df=0.5,  # float: drop terms that occur in more than half of the documents
    min_df=10,   # int: drop terms that occur in fewer than 10 documents
)

# Replace missing texts with '' before the unicode cast: astype('U') would otherwise
# turn NaN into the literal string 'nan', which the token pattern accepts as a word.
dtm_tf_original = tf_vectorizer_original.fit_transform(
    df['TEXT'].fillna('').values.astype('U')  # articles in column 'TEXT' as unicode strings
)
print(dtm_tf_original.shape)


(12907, 50345)
Wall time: 1min 26s


***

### A 50 topic model with TF vectorizer

***

In [5]:
%%time
# Fit a 50-topic LDA model on the term-frequency document-term matrix.
# `n_topics` was renamed to `n_components` in scikit-learn 0.19 and removed in 0.21, so the
# current keyword is tried first; releases older than 0.19 reject it with a TypeError and
# are given the old keyword instead.
try:
    lda_tf_original = LatentDirichletAllocation(n_components=50, random_state=0)
except TypeError:
    lda_tf_original = LatentDirichletAllocation(n_topics=50, random_state=0)
lda_tf_original.fit(dtm_tf_original)

Wall time: 11min 9s


In [6]:
%%time
# Prepare the pyLDAvis visualisation of the TF model from the fitted LDA model, its
# document-term matrix and the vectorizer that produced that matrix.
# Kept as a single bare expression so the prepared object is the cell's result.
# NOTE(review): newer pyLDAvis releases appear to ship this module as `pyLDAvis.lda_model`
# - confirm against the installed version before upgrading.
pyLDAvis.sklearn.prepare(lda_tf_original, dtm_tf_original, tf_vectorizer_original)

Wall time: 1min 3s


In [7]:
# Print the topics of the TF model: how many words to list per topic, and the helper that lists them.

n_top_words = 30


def print_top_words(model, feature_names, n_top_words):
    """Print a header line and the highest-weighted words for every topic of ``model``.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Iterating ``components_`` must yield one weight vector per topic, each
        supporting ``argsort()``.
    feature_names : sequence
        Maps a column index of ``components_`` to the word it stands for.
    n_top_words : int
        Number of words printed per topic.

    Returns
    -------
    None. Output goes to stdout; a single blank line is printed after the last topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # argsort() is ascending, so walk it backwards from the end for n_top_words entries
        ranked = weights.argsort()[:-n_top_words - 1:-1]
        top_terms = []
        for term_idx in ranked:
            top_terms.append(feature_names[term_idx])
        print(" ".join(top_terms))
    print()

# Vocabulary of the TF vectorizer, indexed like the columns of the document-term matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2; use
# get_feature_names_out() when the installed release has it and fall back otherwise.
if hasattr(tf_vectorizer_original, "get_feature_names_out"):
    tf_feature_names = tf_vectorizer_original.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer_original.get_feature_names()
print_top_words(lda_tf_original, tf_feature_names, n_top_words)

Topic #0:
white black racial race whites blacks negro segregation negroes south discrimination african southern minority percent color nonwhite prejudice north inequality races composition americans interracial census minorities skin nonwhites northern puerto
Topic #1:
model structural parameters estimates log fit origin latent destination tilly overall observed covariates estimate migration logit coded loglinear turnout function predictors fixed association sociodemographic continuous restrictions selection freedom lin treiman
Topic #2:
science empirical theoretical sociological scientific sociologists approach causal theories method models scientists methods sciences concept concepts definition explanation assumptions methodological basic term phenomena historical assumption function processes argument interpretation systems
Topic #3:
organizations government movement party organizational local leaders policy movements democratic action participation parties protest politics civil le

***

### A 50 topic model with TF-IDF vectorizer

***

In [8]:
%%time
# Build the TF-IDF document-term matrix, reusing every parameter of the count vectorizer
# so both matrices are produced with the same tokenisation and frequency cut-offs.
tfidf_vectorizer_original = TfidfVectorizer(**tf_vectorizer_original.get_params())

# Replace missing texts with '' before the unicode cast: astype('U') would otherwise
# turn NaN into the literal string 'nan', which the token pattern accepts as a word.
dtm_tfidf_original = tfidf_vectorizer_original.fit_transform(
    df['TEXT'].fillna('').values.astype('U')
)
print(dtm_tfidf_original.shape)

(12907, 50345)
Wall time: 1min 17s


In [9]:
%%time
# Fit a 50-topic LDA model on the TF-IDF document-term matrix.
# `n_topics` was renamed to `n_components` in scikit-learn 0.19 and removed in 0.21, so the
# current keyword is tried first; releases older than 0.19 reject it with a TypeError and
# are given the old keyword instead.
try:
    lda_tfidf_original = LatentDirichletAllocation(n_components=50, random_state=0)
except TypeError:
    lda_tfidf_original = LatentDirichletAllocation(n_topics=50, random_state=0)
lda_tfidf_original.fit(dtm_tfidf_original)

Wall time: 6min 50s


In [10]:
# Print the topics of the TF-IDF model: how many words to list per topic, and the helper that lists them.
# NOTE(review): this repeats the n_top_words / print_top_words definitions from the TF
# section earlier in the notebook; consider keeping a single definition.

n_top_words = 30


def print_top_words(model, feature_names, n_top_words):
    """Print a header line and the highest-weighted words for every topic of ``model``.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Iterating ``components_`` must yield one weight vector per topic, each
        supporting ``argsort()``.
    feature_names : sequence
        Maps a column index of ``components_`` to the word it stands for.
    n_top_words : int
        Number of words printed per topic.

    Returns
    -------
    None. Output goes to stdout; a single blank line is printed after the last topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # argsort() is ascending, so walk it backwards from the end for n_top_words entries
        ranked = weights.argsort()[:-n_top_words - 1:-1]
        top_terms = []
        for term_idx in ranked:
            top_terms.append(feature_names[term_idx])
        print(" ".join(top_terms))
    print()

# Vocabulary of the TF-IDF vectorizer, indexed like the columns of the document-term matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2; use
# get_feature_names_out() when the installed release has it and fall back otherwise.
if hasattr(tfidf_vectorizer_original, "get_feature_names_out"):
    tfidf_feature_names = tfidf_vectorizer_original.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vectorizer_original.get_feature_names()
print_top_words(lda_tfidf_original, tfidf_feature_names, n_top_words)

Topic #0:
colonel employes census establishments wages wright children cent average eleventh punishments censuses law manufacturing females punishment statistics inspector products offender sentiment steuart reported earnings wrights industries manufactures employed factory industry
Topic #1:
kidd intellect kentucky mountain telic mountains progress pleasures man pleasure mountaineers cabin civilservice psychic corn feeling county region logs mind cabins race evolution faculty force animal rivalry organism conative physical
Topic #2:
lifestory census university manufactures enumeration congress constitution man government facts scholar science conservatism schedule hamilton schedules contributions inquiry fourvolume scholasticism inquiries framers country deaf statistics truth dumb independence law censustaking
Topic #3:
mission church inner workingmen trades clergy schools asylums religious municipal socialistic socialism unions charity associations philanthropy movement evangelical c

***

## End of script

***

Some more draft code:

In [None]:
# create a doctopic matrix ORIGINAL


# filenames = sorted([os.path.join(path, fn).replace('\\', '/') for fn in os.listdir(path)])

# print(filenames[0])

# dtm_transformed = tf_vectorizer_original.fit_transform(articles_original)

# doctopic = lda_tf_original.fit_transform(dtm_transformed)

# doctopic = doctopic / np.sum(doctopic, axis=1, keepdims=True)

# # Write doctopic to a csv file
# os.chdir("C:/Users/renswilderom/Documents/Dan Silver Projects/history of sociology") 

# filenamesclean = [fn.split('/')[-1] for fn in filenames ] 

# i=0
# with open('doctopic_original_history of sociolgy.csv',mode='w') as fo:
#     for rij in doctopic:
#         fo.write('"'+filenamesclean[i]+'"')
#         fo.write(',')
#         for kolom in rij:
#             fo.write(str(kolom))
#             fo.write(',')
#         fo.write('\n')
#         i+=1

In [None]:
# open doctopic.csv and create a new row with variable names

# csv_file = pd.read_csv("C:/Users/renswilderom/Documents/Dan Silver projects/history of sociology/doctopic_original.csv", header=None, index_col=False,
#                   names = ["file", "t_0", "t_1", "t_2", "t_3", "t_4", "t_5", "t_6", "t_7", "t_8", "t_9", 
#                             "t_10", "t_11", "t_12", "t_13", "t_14", "t_15", "t_16", "t_17", "t_18", "t_19", 
#                             "t_20", "t_21", "t_22", "t_23", "t_24", "t_25", "t_26", "t_27", "t_28", "t_29", 
#                              "t_30", "t_31", "t_32", "t_33", "t_34", "t_35", "t_36", "t_37", "t_38", "t_39",
#                               "t_40", "t_41", "t_42", "t_43", "t_44", "t_45", "t_46", "t_47", "t_48", "t_49"])

# # Load the xls file as a dataframe
# df = csv_file
# df[0:5]

In [10]:
# What is the topic of interest? Specify yourself below.

# topic_of_interest = "t_0" # e.g. t_0, the racial topic

In [None]:
# rank texts in descending order
# And print the top ten articles related to that topic
# df1 = df[['file', topic_of_interest]] 
# df2 = df1.sort(topic_of_interest, ascending=False)
# df3 = df2.head(10)
# df3


In [12]:
# # Make a variable of each file
# file1 = (df3['file'].iloc[0])
# file2 = (df3['file'].iloc[1])
# file3 = (df3['file'].iloc[2])
# file4 = (df3['file'].iloc[3])
# file5 = (df3['file'].iloc[4])
# file6 = (df3['file'].iloc[5])
# file7 = (df3['file'].iloc[6])
# file8 = (df3['file'].iloc[7])
# file9 = (df3['file'].iloc[8])
# file10 = (df3['file'].iloc[9])

In [None]:
# open file in web browser
# import os
# import webbrowser
# os.chdir(path)
# webbrowser.open('file://' + os.path.realpath(file1)) # Just change the file name here to open other files