In [45]:
# Extract sentences from ten xml files using BeautifulSoup and form a list of sentences
from bs4 import BeautifulSoup as b
import glob
for filename in glob.iglob('*.xml'):
#     print(filename)
    lst = []
    
    with open(filename, "rb") as f: # opening xml file
        content = f.read()
        soup = b(content, "lxml")

        for word in soup.find_all('sentence'):
            sentences=word.string.strip()
#             print ((sentences))
            lst.append(sentences)
   

In [46]:
# #Cleaning data using regex
# import re
# lst = re.sub(r"http\S+", "", str(lst))
# lst = re.sub(r"http", "", str(lst))
# lst = re.sub(r"@\S+", "",str(lst))
# lst = re.sub(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ", str(lst))
# lst = lst.lower()


In [47]:
# Count Vectorizer is used on list of sentences with stop words
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd
import numpy as np
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

%matplotlib inline

In [48]:
vect=CountVectorizer(ngram_range=(1,1),stop_words='english')

In [50]:
# Transform data into document term matrix
dtm=vect.fit_transform(lst)

In [51]:
dtm

<5450x6150 sparse matrix of type '<class 'numpy.int64'>'
	with 84268 stored elements in Compressed Sparse Row format>

In [24]:
# Document term matrix converted to pandas dataframe. 

pd.DataFrame(dtm.toarray(),columns=vect.get_feature_names())

Unnamed: 0,00,000,011984,021,03,04,047,053,054,055,...,younger,youth,yuk,yusuf,zalewski,zatkoska,zealand,zeqiri,zitter,zoned
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [52]:
# Topic modeling using LDA algorithm from sklearn

lda=LatentDirichletAllocation(n_components=5)

In [61]:
lda.fit_transform(dtm)



array([[ 0.83654032,  0.14318299,  0.00681115,  0.0066669 ,  0.00679864],
       [ 0.57067067,  0.03363659,  0.03340823,  0.03356937,  0.32871515],
       [ 0.42762016,  0.01149814,  0.01086181,  0.01062451,  0.53939538],
       ..., 
       [ 0.88437666,  0.02869452,  0.02863927,  0.02932203,  0.02896752],
       [ 0.01549187,  0.01541641,  0.01539375,  0.9382798 ,  0.01541818],
       [ 0.00379107,  0.98482357,  0.00379564,  0.00379702,  0.0037927 ]])

In [62]:
lda_dtf=lda.fit_transform(dtm)



In [63]:
# Display the top 10 words within each specific topic model using mglearn library
import numpy as np
sorting=np.argsort(lda.components_)[:,::-1]
features=np.array(vect.get_feature_names())

In [64]:
import mglearn
mglearn.tools.print_topics(topics=range(5), feature_names=features,
sorting=sorting, topics_per_chunk=5, n_words=10)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
applicant     applicant     decision      business      act           
court         government    tribunal      telephone     tribunal      
evidence      licence       police        number        applicant     
ddungu        award         authority     reasons       documents     
order         site          appellant     assessment    legal         
george        act           act           2005          information   
minister      uganda        duty          protection    2004          
claim         state         assistance    judgment      decision      
sharman       authority     did           sharman       applicants    
hca           lra           law           court         section       




In [65]:
# Visualization of topic models using pyLDAvis library
from __future__ import  print_function
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [66]:
graph=pyLDAvis.sklearn.prepare(lda,dtm,vect)
pyLDAvis.display(graph)

In [None]:
%matplotlib inline
from os import path
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud,STOPWORDS

In [None]:
d = path.dirname(__name__)

In [None]:
# use a few regular expressions to clean up pour data, and save it back to disk for future use
def standardize_text(df, text_field):
    df[text_field] = df[text_field].str.replace(r"http\S+", "")
    df[text_field] = df[text_field].str.replace(r"http", "")
    df[text_field] = df[text_field].str.replace(r"@\S+", "")
    df[text_field] = df[text_field].str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ")
    df[text_field] = df[text_field].str.replace(r"@", "at")
    df[text_field] = df[text_field].str.lower()
    return df

questions = standardize_text(questions, "text")

questions.to_csv("clean_data.csv")
questions.head()