***

# From fringe revelry to growth industry - online Jupyter Notebook (Python)

***

***

# The first part of the analyis
### Exploring the initial corpus

***

In [37]:
# Import all necessary packages and such

from __future__ import print_function
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import os.path
import pandas as pd
from glob import glob

import warnings
warnings.filterwarnings('ignore') # only use this when you know the script and want to supress unnecessary warnings

# specify the main corpus path. This will be used throughout the script
CORPUS_PATH = "P:/My documents/Collected_data/Newspaper articles/Newspaper articles DJ/DJ Guardian (original batch)/"
HOME = "P:/My documents/Project 2 From fringe revelry to growth industry/"

In [2]:
# Import dataset consisting of seperate txt file
import os, os.path, glob
os.chdir(CORPUS_PATH)
files = glob.glob("*.txt")

articles_original=[]
print("Constructing dataset, total number of documents included:")
for file in files: 
    with open(file, errors="ignore") as fi:
        articles_original.append(fi.read())
length=len(articles_original)
print(length)

Constructing dataset, total number of documents included:
5244


In [3]:
# https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/stop_words.py
english_stopwords = [
"length", "words", "reviewed", "www", "section", "byline", "author",    # this is where my stop word list begins
"page", "features", "caption", "dateline", "said", "say", "says", "just", 
"year", "years", "day", "guardian", "new york times", "nytimes", "nyt", 
"parentheses", "python", "http", "url", "com", "correction", "appended", 
"type", "article", "date", "corrections", "mr", "mrs", "ms", "miss", "sir", 
"snp", "ind", "bnp", "rev","freeman", "hhh", "hhhh", "hhhhh", "pizazz", 
"org", "xfm", "cmp", "stx", "indl", "xxx", "dir", "est", "don", "est", 
"tel", "nnm", "mos", "tha", "ama", "der", "das", "bez", "les", "des", 
"pas", "thu", "mon", "mel", "sur", "moi", "rai", "che", "dab", "gus", 
"taj", "nyse", "dab", "tope", "taj", "smg", "ant", "january", "february", 
"march","april", "may", "june", "july", "august", "september", "october", 
"november", "december", "jan", "feb", "mar", "apr", "may", "june", "july", 
"aug", "sept", "oct", "nov", "dec", "monday", "tuesday", "wednesday", 
"thursday", "friday", "saturday", "sunday", "mondays", "tuesdays", 
"wednesdays", "thursdays", "fridays", "saturdays", "sundays",     
"a", "about", "above", "across", "after", "afterwards", "again", "against",    # this is where the original stop word list begins
"all", "almost", "alone", "along", "already", "also", "although", "always",
"am", "among", "amongst", "amoungst", "amount", "an", "and", "another",
"any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are",
"around", "as", "at", "back", "be", "became", "because", "become",
"becomes", "becoming", "been", "before", "beforehand", "behind", "being",
"below", "beside", "besides", "between", "beyond", "bill", "both",
"bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con",
"could", "couldnt", "cry", "de", "describe", "detail", "do", "done",
"down", "due", "during", "each", "eg", "eight", "either", "eleven", "else",
"elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
"everything", "everywhere", "except", "few", "fifteen", "fifty", "fill",
"find", "fire", "first", "five", "for", "former", "formerly", "forty",
"found", "four", "from", "front", "full", "further", "get", "give", "go",
"had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter",
"hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his",
"how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed",
"interest", "into", "is", "it", "its", "itself", "keep", "last", "latter",
"latterly", "least", "less", "ltd", "made", "many", "may", "me",
"meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly",
"move", "much", "must", "my", "myself", "name", "namely", "neither",
"never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone",
"nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
"once", "one", "only", "onto", "or", "other", "others", "otherwise", "our",
"ours", "ourselves", "out", "over", "own", "part", "per", "perhaps",
"please", "put", "rather", "re", "same", "see", "seem", "seemed",
"seeming", "seems", "serious", "several", "she", "should", "show", "side",
"since", "sincere", "six", "sixty", "so", "some", "somehow", "someone",
"something", "sometime", "sometimes", "somewhere", "still", "such",
"system", "take", "ten", "than", "that", "the", "their", "them",
"themselves", "then", "thence", "there", "thereafter", "thereby",
"therefore", "therein", "thereupon", "these", "they", "thick", "thin",
"third", "this", "those", "though", "three", "through", "throughout",
"thru", "thus", "to", "together", "too", "top", "toward", "towards",
"twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us",
"very", "via", "was", "we", "well", "were", "what", "whatever", "when",
"whence", "whenever", "where", "whereafter", "whereas", "whereby",
"wherein", "whereupon", "wherever", "whether", "which", "while", "whither",
"who", "whoever", "whole", "whom", "whose", "why", "will", "with",
"within", "without", "would", "yet", "you", "your", "yours", "yourself",
"yourselves"]

In [4]:
# original vectorizer
tf_vectorizer_original = CountVectorizer(lowercase = True,
                                         strip_accents = 'unicode',
                                         stop_words = english_stopwords,
                                         token_pattern = r'\b[a-zA-Z]{3,}\b', # keeps words of 3 or more characters
                                         max_df = 0.5, # ignore words occuring in >50% of the corpus (i.e. corpus specific stop words)
                                         min_df = 10) # ignore words in <10 documents of the corpus
dtm_tf_original = tf_vectorizer_original.fit_transform(articles_original) 
print(dtm_tf_original.shape)

# https://mimno.infosci.cornell.edu/papers/schofield_tacl_2016.pdf
# no stemming and no lematization

(5244, 17072)


In [5]:
# for TF DTM
lda_tf_original = LatentDirichletAllocation(n_topics=20, random_state=0)
lda_tf_original.fit(dtm_tf_original)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1, n_topics=20,
             perp_tol=0.1, random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [6]:
# Conventional topics ORIGINAL

n_top_words = 30

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

tf_feature_names = tf_vectorizer_original.get_feature_names() 
print_top_words(lda_tf_original, tf_feature_names, n_top_words)



Topic #0:
people home time world night old way week young good work police going place make local city away pages got children head come big left school street best end days
Topic #1:
radio station bbc music breakfast audience listeners media capital news programme evans stations chris london virgin channel presenter commercial live television million broadcasting week broadcast programmes air itv listening shows
Topic #2:
otis dido estelle redding harvest stax renault replica funereal maid kit market menus engine gmtv lab chris way markets information reporter aloof cent neptunes week rates street pounds washington magic
Topic #3:
album music hop hip pop rap songs sound soul band rock artists musical song reggae track singer record lyrics albums records tracks debut york sounds label single guitar black style
Topic #4:
dollars baker bush german germany franz alex budget house gold white germans taxes washington president deficit rapist reagan karl west tighter democratic election admi

In [38]:
# create a doctopic matrix ORIGINAL

filenames = sorted([os.path.join(CORPUS_PATH, fn) for fn in os.listdir(CORPUS_PATH)])

dtm_transformed = tf_vectorizer_original.fit_transform(articles_original)

doctopic = lda_tf_original.fit_transform(dtm_transformed)

doctopic = doctopic / np.sum(doctopic, axis=1, keepdims=True)

# Write doctopic to a csv file

os.chdir(HOME) 

filenamesclean = [fn.split('/')[-1] for fn in filenames]
i=0
with open('doctopic_original_guardian_dj.csv',mode='w') as fo:
    for rij in doctopic:
        fo.write('"'+filenamesclean[i]+'"')
        fo.write(',')
        for kolom in rij:
            fo.write(str(kolom))
            fo.write(',')
        fo.write('\n')
        i+=1
print("finsihed with creating doctopic matrix")

finsihed with creating doctopic matrix


***

# The second part of the analysis
### Analizing a subcorpus (identifying 'dance music' newspaper articles).

***

In [39]:
# open doctopic.csv and create a new row with variable names

os.chdir(HOME) 

csv_file = pd.read_csv("doctopic_original_guardian_dj.csv", header=None, index_col=False,
                  names = ["file", "t_0", "t_1", "t_2", "t_3", "t_4", "t_5", "t_6", "t_7", "t_8", "t_9", 
                           "t_10", "t_11", "t_12", "t_13", "t_14", "t_15", "t_16", "t_17", "t_18", "t_19"])

# Load the xls file as a dataframe
df = csv_file
df

Unnamed: 0,file,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_10,t_11,t_12,t_13,t_14,t_15,t_16,t_17,t_18,t_19
0,1985-01-10_1141_Guardian.txt,0.284783,0.014993,0.001503,0.105858,0.000072,0.000072,0.000072,0.009660,0.227794,...,0.000072,0.000072,0.010711,0.000072,0.000072,0.088976,0.000072,0.000072,0.000072,0.000072
1,1985-01-10_1711_Guardian.txt,0.000065,0.000065,0.000065,0.000065,0.000065,0.000065,0.005538,0.007506,0.918171,...,0.063042,0.000065,0.000065,0.000065,0.000065,0.000065,0.000065,0.000065,0.000065,0.000065
2,1985-01-23_633_Guardian.txt,0.013525,0.000164,0.000164,0.000164,0.000164,0.000164,0.000164,0.000164,0.000164,...,0.000164,0.509944,0.389824,0.000164,0.000164,0.072743,0.000164,0.011504,0.000164,0.000164
3,1985-01-25_386_Guardian.txt,0.154473,0.000282,0.000282,0.000282,0.000282,0.000282,0.000282,0.144199,0.237030,...,0.000282,0.022069,0.000282,0.000282,0.000282,0.406780,0.031495,0.000282,0.000282,0.000282
4,1985-02-18_1271_Guardian.txt,0.524537,0.226302,0.000089,0.000089,0.003576,0.000089,0.003077,0.000089,0.000089,...,0.002521,0.000089,0.016282,0.000089,0.000089,0.190433,0.000089,0.000089,0.000089,0.000089
5,1985-03-09_359_Guardian.txt,0.000296,0.000296,0.000296,0.038704,0.000296,0.000296,0.000296,0.881181,0.026223,...,0.043296,0.000296,0.000296,0.000296,0.000296,0.000296,0.000296,0.000296,0.006159,0.000296
6,1985-03-14_1311_Guardian.txt,0.372414,0.007512,0.000097,0.000097,0.003351,0.000097,0.000097,0.000097,0.100564,...,0.000097,0.000097,0.016364,0.000097,0.000097,0.498443,0.000097,0.000097,0.000097,0.000097
7,1985-03-16_1432_Guardian.txt,0.084231,0.034051,0.000084,0.000084,0.000084,0.000084,0.000084,0.000084,0.051666,...,0.000084,0.000084,0.000084,0.000084,0.000084,0.000084,0.000084,0.001576,0.000084,0.000084
8,1985-04-08_958_Guardian.txt,0.000111,0.373708,0.000111,0.092446,0.000111,0.000111,0.013545,0.000111,0.164414,...,0.000111,0.000111,0.030291,0.000111,0.000111,0.204636,0.016175,0.000111,0.000111,0.000111
9,1985-04-11_872_Guardian.txt,0.460217,0.000164,0.000164,0.000164,0.000164,0.000164,0.003867,0.000164,0.529192,...,0.000164,0.000164,0.000164,0.000164,0.000164,0.000164,0.000164,0.004093,0.000164,0.000164


In [40]:
# calculate mean, std, cutoff high, and cutoff low

df_1 = df.describe().loc[['mean','std']]
df2 = df_1.transpose()
df2['cutoff_high'] = df2['mean'] + 2*df2['std'] 
df2['cutoff_low'] = df2['mean'] + df2['std'] 
df2

Unnamed: 0,mean,std,cutoff_high,cutoff_low
t_0,0.220256,0.201155,0.622566,0.421411
t_1,0.050823,0.116718,0.28426,0.167541
t_2,0.00058,0.001923,0.004426,0.002503
t_3,0.05338,0.096916,0.247212,0.150296
t_4,0.002243,0.011311,0.024865,0.013554
t_5,0.00064,0.003694,0.008029,0.004334
t_6,0.00833,0.027923,0.064177,0.036254
t_7,0.007354,0.036478,0.08031,0.043832
t_8,0.199202,0.20233,0.603862,0.401532
t_9,0.171927,0.214948,0.601823,0.386875


In [41]:
# Select the appropriate cutoff point per topic from the table above

# 9

t_9_cutoff_high = df2.get_value('t_9', 'cutoff_high')
t_9_cutoff_low = df2.get_value('t_9', 'cutoff_low')

In [42]:
# These values are used to create new 'dance_high' and 'dance_low' dummies in the original df 

df['dance_high'] = '0'
df['dance_high'][
    (df['t_9'] > t_9_cutoff_high)
    ] = '1' 

df['dance_low'] = '0'
df['dance_low'][
    (df['t_9'] > t_9_cutoff_low)
    ] = '1' 

df

Unnamed: 0,file,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_12,t_13,t_14,t_15,t_16,t_17,t_18,t_19,dance_high,dance_low
0,1985-01-10_1141_Guardian.txt,0.284783,0.014993,0.001503,0.105858,0.000072,0.000072,0.000072,0.009660,0.227794,...,0.010711,0.000072,0.000072,0.088976,0.000072,0.000072,0.000072,0.000072,0,0
1,1985-01-10_1711_Guardian.txt,0.000065,0.000065,0.000065,0.000065,0.000065,0.000065,0.005538,0.007506,0.918171,...,0.000065,0.000065,0.000065,0.000065,0.000065,0.000065,0.000065,0.000065,0,0
2,1985-01-23_633_Guardian.txt,0.013525,0.000164,0.000164,0.000164,0.000164,0.000164,0.000164,0.000164,0.000164,...,0.389824,0.000164,0.000164,0.072743,0.000164,0.011504,0.000164,0.000164,0,0
3,1985-01-25_386_Guardian.txt,0.154473,0.000282,0.000282,0.000282,0.000282,0.000282,0.000282,0.144199,0.237030,...,0.000282,0.000282,0.000282,0.406780,0.031495,0.000282,0.000282,0.000282,0,0
4,1985-02-18_1271_Guardian.txt,0.524537,0.226302,0.000089,0.000089,0.003576,0.000089,0.003077,0.000089,0.000089,...,0.016282,0.000089,0.000089,0.190433,0.000089,0.000089,0.000089,0.000089,0,0
5,1985-03-09_359_Guardian.txt,0.000296,0.000296,0.000296,0.038704,0.000296,0.000296,0.000296,0.881181,0.026223,...,0.000296,0.000296,0.000296,0.000296,0.000296,0.000296,0.006159,0.000296,0,0
6,1985-03-14_1311_Guardian.txt,0.372414,0.007512,0.000097,0.000097,0.003351,0.000097,0.000097,0.000097,0.100564,...,0.016364,0.000097,0.000097,0.498443,0.000097,0.000097,0.000097,0.000097,0,0
7,1985-03-16_1432_Guardian.txt,0.084231,0.034051,0.000084,0.000084,0.000084,0.000084,0.000084,0.000084,0.051666,...,0.000084,0.000084,0.000084,0.000084,0.000084,0.001576,0.000084,0.000084,1,1
8,1985-04-08_958_Guardian.txt,0.000111,0.373708,0.000111,0.092446,0.000111,0.000111,0.013545,0.000111,0.164414,...,0.030291,0.000111,0.000111,0.204636,0.016175,0.000111,0.000111,0.000111,0,0
9,1985-04-11_872_Guardian.txt,0.460217,0.000164,0.000164,0.000164,0.000164,0.000164,0.003867,0.000164,0.529192,...,0.000164,0.000164,0.000164,0.000164,0.000164,0.004093,0.000164,0.000164,0,0


In [43]:
# How many dance articles do I have according to the high criterion?

df3 = df[df.dance_high != '0']
df4 = df3[['file']]
df4.shape

(317, 1)

In [44]:
# How many dance articles do I have according to the low criterion?

df3 = df[df.dance_low != '0']
df5 = df3[['file']]
df5.shape

(917, 1)

In [45]:
# Create lists of file names beloning to the subcorpus 'high' and 'low'
# Probably this can be done in a more straightforward fashion... (but this works)

os.chdir(HOME) 

# Create a Pandas Excel writer using XlsxWriter as the engine.
writer = pd.ExcelWriter('list_low_guardian_dj.xlsx', engine='xlsxwriter')

# Convert the dataframe to an XlsxWriter Excel object.
df5.to_excel(writer, sheet_name='Sheet1')

# Close the Pandas Excel writer and output the Excel file.
writer.save()

***

### Working with selection of the original corpus (i.e. dance low articles)

***

In [46]:
# Copy subcorpus from its original folder to a new destination folder. 

import shutil
import os

# Create A folder for dance articles, if the folder does not exists.
if not os.path.exists("P:/My documents/Collected_data/Newspaper articles/Newspaper articles DJ/DJ Guardian/dance articles low"):
    os.makedirs("P:/My documents/Collected_data/Newspaper articles/Newspaper articles DJ/DJ Guardian/dance articles low")  
CORPUS_PATH_LOW = "P:/My documents/Collected_data/Newspaper articles/Newspaper articles DJ/DJ Guardian/dance articles low"

os.chdir(CORPUS_PATH)

# the following list of articles are dance articles:
files_tocopy = pd.read_excel("C:/Users/renswilderom/Documents/Machine learning/list_low_guardian_dj.xlsx") 
files_tocopy = files_tocopy['file'].apply(lambda x: x.replace('"', "")).tolist()


for f in files_tocopy:
    shutil.copy(f, CORPUS_PATH_LOW)   
     
        
print ("Done with copying files")   

Done with copying files


In [47]:
# Import dataset consisting of seperate txt file
import os, os.path, glob
os.chdir(CORPUS_PATH_LOW)
files = glob.glob("*.txt")

articles_low=[]
print("Constructing dataset, total number of documents included:")
for file in files: 
    with open(file, errors="ignore") as fi:
        articles_low.append(fi.read())
length=len(articles_low)
print(length)

Constructing dataset, total number of documents included:
917


In [48]:
# original vectorizer
tf_vectorizer_low = CountVectorizer(strip_accents = 'unicode',
                                stop_words = english_stopwords,
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b', # keeps words of 3 or more characters
                                max_df = 0.5, # ignore words occuring in >50% of the corpus (i.e. corpus specific stop words)
                                min_df = 10) # ignore words in <10 documents of the corpus
dtm_tf_low = tf_vectorizer_low.fit_transform(articles_low) 
print(dtm_tf_low.shape)

# https://mimno.infosci.cornell.edu/papers/schofield_tacl_2016.pdf
# no stemming and no lematization

(917, 3817)


In [20]:
# LDA TF DTM
lda_tf_low = LatentDirichletAllocation(n_topics=20, random_state=0)
lda_tf_low.fit(dtm_tf_low)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1, n_topics=20,
             perp_tol=0.1, random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [21]:
# LDA tf visualization
pyLDAvis.sklearn.prepare(lda_tf_low, dtm_tf_low, tf_vectorizer_low)

In [28]:
# Conventional topics LOW

n_top_words = 30

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

tf_feature_names = tf_vectorizer_low.get_feature_names() 
print_top_words(lda_tf_low, tf_feature_names, n_top_words)

# create a doctopic matrix LOW

filenames = sorted([os.path.join(CORPUS_PATH_LOW, fn) for fn in os.listdir(CORPUS_PATH_LOW)])

print(filenames[0])

dtm_transformed = tf_vectorizer_low.fit_transform(articles_low)

doctopic = lda_tf_low.fit_transform(dtm_transformed)

doctopic = doctopic / np.sum(doctopic, axis=1, keepdims=True)

# Write doctopic to a csv file

os.chdir(HOME) 

filenamesclean = [fn.split('/')[-1] for fn in filenames]
i=0
with open('doctopic_low.csv',mode='w') as fo:
    for rij in doctopic:
        fo.write('"'+filenamesclean[i]+'"')
        fo.write(',')
        for kolom in rij:
            fo.write(str(kolom))
            fo.write(',')
        fo.write('\n')
        i+=1

Topic #0:
dance madonna pop song chart album remix number best cook hit record band week singles british disco single charts love albums rock norman time remixes weatherall punk producer david version
Topic #1:
women male men djs female rap judge woman diy parker label andy boys rave technology richard steve need pounds techno decks equipment control culture dance involved courses man age heaven
Topic #2:
noel bands people house band scene british dance manchester radio sound america press going white stage went record pop good guitar britain black got later london style rock trance time
Topic #3:
ibiza summer manumission holiday island season clubbers clubbing pounds beach compilation san balearic djs pacha antonio napa pool spanish travel del ayia tracks compilations cafe sunset carl promoters party mixed
Topic #4:
foot easy dancers line left right step dancing listening country steps forward dance beat hip big feet north physical lounge time watching british greater women seventies 

***

# The third part of the analysis
### Exploring the top articles per topic

***

In [49]:
# Open the CSV file produced in the cell above in order to explore the top articles related to the topic of interest

# IMPORTANT: choose the appripriate CSV _original, _high, or _low

os.chdir(HOME)

import pandas as pd
csv_file = pd.read_csv("doctopic_original_guardian_dj.csv", header=None, index_col=False,
                  names = ["file", "t_0", "t_1", "t_2", "t_3", "t_4", "t_5", "t_6", "t_7", "t_8", "t_9", "t_10", 
                           "t_11", "t_12", "t_13", "t_14", "t_15", "t_16", "t_17", "t_18", "t_19"])

# When creating a row with new names, be careful not to overwrite the original first row.
# Load the xls file a dataframe
df = csv_file

print(df.shape)

(5244, 21)


In [62]:
# What is the topic of interest?

topic_of_interest = "t_9"

# Set the directory, this is based on on the same location as the doctopic matrix

os.chdir(CORPUS_PATH)

In [63]:
# rank texts in decending order

df1 = df[['file', topic_of_interest]] 
df2 = df1.sort_values(topic_of_interest, ascending=False)
df3 = df2.head(100)
df3

Unnamed: 0,file,t_9
1555,1997-02-21_387_Guardian.txt,0.949603
4382,2004-04-24_173_Guardian.txt,0.941108
1211,1994-12-09_367_Guardian.txt,0.911818
1257,1995-03-24_974_Guardian.txt,0.910585
1931,1998-06-12_136_Guardian.txt,0.907750
1229,1995-01-27_680_Guardian.txt,0.898128
4831,2005-04-23_149_Guardian.txt,0.896927
5147,2005-11-12_146_Guardian.txt,0.893726
1364,1995-11-10_667_Guardian.txt,0.889503
5014,2005-08-20_147_Guardian.txt,0.882449


In [64]:
# print text in rank 1

interest1 = df3['file'].iloc[0]
file  = open(interest1, 'r+')
file.read().splitlines()

['February 21, 1997',
 'MUSIC: PAUL OAKENFOLD;',
 'Spin doctor: the dance tracks DJs will be playing this weekend',
 'SECTION: THE GUARDIAN FEATURES PAGE; Pg. T18',
 'LENGTH: 387 words',
 "1. CJ Bolland, The Prophet 'Originally a B-side, I have been playing it for the",
 "past six months.'",
 "2. BBE, Flash 'An underground melodic sound that proves BBE is more than a one",
 "-hit wonder.'",
 "3. Underworld, Dark And Long 'An old track that is becoming an anthem.'",
 "4. Grace, Hand In Hand 'This is Grace's strongest single to date, with brilliant",
 "remixes from Germans Jam El Mar and Legend B.'",
 "5. Legend B, Lost In Love 'The original version was a bit too fast for the UK,",
 "but I play at a tempo that works really well.'",
 "6. HHC, We're Not Alone 'The legendary Colin Hudd returns . . . be prepared!'",
 "7. Three 'N' One, Reflect 'A club floor filler from an independent.'",
 "8. Energy S2, Cafe Del Mar 'Red Jerry's new tune on his own label. Jerry has",
 "mixed a couple of tune

In [65]:
# print text in rank 2

interest2 = df3['file'].iloc[1]
file  = open(interest2, 'r+')
file.read().splitlines()

['April 24, 2004',
 'The Guide: Clubs: * The Magnet LIVERPOOL',
 'BYLINE: marc rowlands',
 'SECTION: The Guide, Pg. 33',
 'LENGTH: 173 words',
 "Liverpool's clubbing renaissance is in full swing. The local live scene has",
 'always been healthy, but the DJ-based nights are looking rosy too. Wanton rave',
 'monthly Chibuku Shake Shake continues to go from strength to strength and',
 'newcomers T-Funkshun and Lost Souls look set to make their underground sounds',
 'regular fixtures (the latter holding another hush hush party this Fri 30 with',
 "Daniel Wang). The refurbished Garlands is busy and Yousef's Circus continues to",
 "trade off the hottest names in house, tonight's party with Sneak and Tom",
 'Middleton being no exception. The rampaging scouse house scene is also',
 'frighteningly popular. Pick of the bunch for many though, not least because you',
 'can count on a good night there almost every day of the week, is the',
 'sophisticated Magnet club/bar. A hangout for students, bo

In [67]:
# print text in rank 3

interest3 = df3['file'].iloc[2]
file  = open(interest3, 'r+')
file.read().splitlines()

['December 9, 1994',
 'IN THE MIX: MISS DJAX',
 'BYLINE: Ben Turner',
 'SECTION: THE GUARDIAN FEATURES PAGE; Pg. T10',
 'LENGTH: 367 words',
 "Date and place of birth: 21/6/62. Eindhoven, The Netherlands (I'm an old",
 'hippie).',
 'How did you break into DJ-ing? When I was 17 I started playing in a really',
 'chaotic club called the Vox in Eindhoven.',
 'What was the inspiration?  It was always my passion to work with music, and ever',
 'since I was a kid I have dreamt about being a musician or a DJ.',
 'What other jobs have you done? I worked at a record shop for eight years and at',
 'a distribution company for two years. I also played bass guitar in a new wave',
 'band.',
 'What are your favourite clubs? Cave Club (Austria), The Omen (Germany), Pure',
 '(Scotland).',
 'Who are your favourite DJs? Richie Hawtin, Jeff Mills, Laurent Garnier, Oliver',
 'Bondzio, Roland Casper.',
 'What are your favourite records of the moment?',
 'Plastikan: Musik, Random XS: Encounter, Robert Armani:

In [68]:
# print text in rank 4

interest4 = df3['file'].iloc[3]
file  = open(interest4, 'r+')
file.read().splitlines()

['March 24, 1995',
 'IN THE MIX: SISTER BLISS',
 'BYLINE: Ben Turner',
 'SECTION: THE GUARDIAN FEATURES PAGE; Pg. T10',
 'LENGTH: 974 words',
 'Date and place of birth',
 'London. 30/12/70.',
 'How did you break into DJing?',
 'I started DJing in 1991 at student parties in Birmingham. I returned to London',
 'after two years, just as Fruit Machine at Heaven was starting up. They were',
 'looking for DJs so I sent them a tape.',
 'What was the inspiration to start DJ-ing?',
 'I once hosted a big Christmas house party but, the week before, I went down with',
 'tonsilitis. The DJ for the party dropped the decks off at my house a few days',
 'before and I fiddled around with the decks and really enjoyed it. When I started',
 'to spin I was playing breakbeat, and that was inspired by Grooverider and Fabio.',
 'How would you describe your DJ sound?',
 'The biggest inspiration on my current sound was Trade, so I try to do what Trade',
 'do within a two-hour set. I play from one end of the dan

In [69]:
# print text in rank 5

interest5 = df3['file'].iloc[4]
file  = open(interest5, 'r+')
file.read().splitlines()

['June 12, 1998',
 'Music: Today Japan . . . tomorrow the world;',
 'The hot new records and gigs of the Japanese invasion:',
 'SECTION: The Guardian Features Page; Pg. 16',
 'LENGTH: 136 words',
 '7 Ignitions by the Boom Boom Satellites - in the vein of a zippier Chemical',
 'Brothers - is out on Monday on R&S.',
 'Fantasma is out now on Matador; Cornelius plays here in September.',
 'Happy End Of You, an album of Pizzicato 5 songs with remixes by 808 State, Saint',
 'Etienne, Dimitri From Paris and others, is out now.',
 'The Fantastic Plastic Machine by Fantastic Plastic Machine was released in',
 'April.',
 'Merzbow supports Extreme Noise Terror in the Meltdown festival in the Queen',
 'Elizabeth Hall on July 2.',
 'Encliptic by DJ Tsuyoshi Suzuki, music for the summer show in Paris of Issey',
 'Miyake, is out in July.',
 'Sushi 3003, a compilation of Japanese ClubPop, is currently available while',
 'Sushi 4004 is out in August. Pacific State, a collection of Japanese electronica'

In [70]:
# print text in rank 6

interest5 = df3['file'].iloc[5]
file  = open(interest5, 'r+')
file.read().splitlines()

['January 27, 1995',
 'IN THE MIX: PHIL MISON',
 'BYLINE: Ben Turner',
 'SECTION: THE GUARDIAN FEATURES PAGE; Pg. T10',
 'LENGTH: 680 words',
 'Date and place of birth: 30.9.70. Dagenham, Essex.',
 'How did you break into DJing?',
 "At the Recession Session at London's Milk Bar in 1991. I did the warm-up for my",
 'friend, Darren Emerson, when Nicky Holloway was on holiday.',
 'What was the inspiration to start DJ-ing?',
 'Jose Padilla at Cafe Del Mar, Ibiza, 1991. Hearing warm-up sets by Emerson, and',
 "one particular set by Danny Rampling at Boy's Own was also inspiring. I was very",
 'sick of walking into clubs and hearing anthems being played as warm-up tunes.',
 'What are your favourite clubs?',
 'The Full Circle in Slough, Cafe Del Mar and various one-off parties in Ibiza.',
 'Who are your favourite DJs?',
 'Darren Emerson, Pete Heller, James Holyrod, Dave Henley, Jose Paddila.',
 'What are your favourite records of the moment? Various Artists: Deep And Slow',
 '(Strictly Rhythm

In [71]:
# print text in rank 7

interest1 = df3['file'].iloc[6]
file  = open(interest1, 'r+')
file.read().splitlines()

['April 23, 2005',
 'The Guide: PREVIEW clubs: * Carl Cox And Friends LONDON',
 'BYLINE: Nick Green',
 'SECTION: The Guide, Pg. 32',
 'LENGTH: 149 words',
 "It's rare these days that a single DJ can carry a night. Punters want more from",
 'their clubbing than some jumped-up twerp in headphones hogging the limelight for',
 '10 hours. Carl Cox, on the other hand, has such a globally stratospheric',
 "reputation that you'd be happy to listen to him for a week. Even if you were",
 'deaf. Cox makes his UK comeback here at Heaven, an event that bisects a European',
 'tour that began a fortnight ago with a 10,000-capacity gig in Rotterdam and',
 'twists its way on through Glasgow, Dublin, Strasbourg and Paris. Cox, having',
 'been locked away in the studio for the winter, has energy to burn and his Second',
 'Sign album to promote, hence the eight-hour live show with dancers, guest',
 "vocalists like Republica's Saffron and Reprazent's Onnallee and DJ friends such",
 'as Michael de Hey, Jori

In [72]:
# print text in rank 8

interest1 = df3['file'].iloc[7]
file  = open(interest1, 'r+')
file.read().splitlines()

['November 12, 2005',
 'The Guide: Preview: Clubs: Inner City Acid GLASGOW',
 'BYLINE: Patric Baird',
 'SECTION: The Guide, Pg. 33',
 'LENGTH: 146 words',
 "Since exploding onto Glasgow's club scene a year ago, the guys behind Inner City",
 'Acid aim to celebrate their first birthday tonight with exactly the kind of',
 'party you would expect from a club responsible for featuring guest appearances',
 'by Mark Moore, Mekon, Scratch Massive and DJQ. In keeping with tradition, DJ duo',
 'FC Kahuna do the birthday honours with one of their acid tinged, psychedelic,',
 'beat driven sets which have made them a firm favourite with Glasgow clubbers.',
 'Responsible for the legendary London night The Big Kahuna Burger, Dan and Jon',
 'have since launched their Headstart nights and released an album, Machine Says',
 'Yes, an electrifying fusion of futurist acid house, dark techno and funky',
 'vocals. Support comes from house demon Nick Ferrara and the residents Jon',
 'Virtue, Monsieur DeLarge 

In [73]:
# print text in rank 9

interest1 = df3['file'].iloc[8]
file  = open(interest1, 'r+')
file.read().splitlines()

['November 10, 1995',
 'JOIN THE CLUB;',
 'Kate Herbert wades through the Ibiza compilations',
 'BYLINE: Kate Herbert',
 'SECTION: THE GUARDIAN FEATURES PAGE; Pg. T15',
 'LENGTH: 667 words',
 "WITH frostbite setting in you'd hope this summer's Ibiza hype would finally die",
 "down as the island's two and a half million visiting clubbers hung up their",
 'sequinned bikinis until 1996. But that would be underestimating Ibiza hype.',
 "Friday saw the release of yet another compilation, Ibiza '95 - not to be",
 "confused with Kiss FM's Ibiza '95 or Spiritually Ibiza, Club Ibiza, Havin' It In",
 "Ibiza, or this year's Cafe Del Mar compilation. Every week it seems another",
 "tribute to the Balearic party paradise is released. What's surprising is how",
 'varied they are in style - DJs, tracks and tempo.',
 "Ibiza '95 (21st Century/17 tracks mixed) is brought to us via Nicky Holloway,",
 'one of the old school DJs who reliably offers excellent tunes. Less respected',
 "for his mixing, he doe

In [74]:
# print text in rank 10

interest1 = df3['file'].iloc[9]
file  = open(interest1, 'r+')
file.read().splitlines()

['August 20, 2005',
 'The Guide: Preview clubs: * Stompa Phunk BRIGHTON',
 'BYLINE: John Mitchell',
 'SECTION: The Guide, Pg. 32',
 'LENGTH: 147 words',
 "Stompa Phunk's weekly sessions at Audio always seem to offer a more flavoursome",
 'beats combination than many a Brighton gaff. Everything from dirty underground',
 "stuff to tech house and raw techno seems to get a look in here - hence Friday's",
 'rather appropriate booking of Josh Wink. His style takes in everything from old',
 'school, Chicago house to Latin and tribal house to hard-edged techno, but this',
 'Philadelphia-born DJ, producer and remixer adds a dancefloor-friendly edge to',
 "his sets which would make the City of Brotherly Love's funk forefathers proud. A",
 'matchless mixing ability obviously helps in this department but Wink never takes',
 'his eye off the floor and is a master of responding to the crowd and building up',
 "the atmosphere. Warming things up are Stompa Phunk's posse of resident DJs,",
 'including 

***

# End of script

***