In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt

pd.set_option('display.mpl_style', 'default')
plt.rcParams['figure.figsize'] = (15, 5)

import os
import sys
import traceback

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import MDS
from gensim import corpora, models, similarities, matutils

In [3]:
def data_cleanse(docs_to_clean):
    import re
    D=len(docs_to_clean)
    for d in range(0, D):
        docs_to_clean[d] = docs_to_clean[d].lower()
        docs_to_clean[d] = re.sub(r'-', ' ', docs_to_clean[d])
        docs_to_clean[d] = re.sub(r'[^a-zA-Z0-9 ]', '', docs_to_clean[d])
        docs_to_clean[d] = re.sub(r' +', ' ', docs_to_clean[d])
        docs_to_clean[d] = re.sub(r'\s\w\s', ' ', docs_to_clean[d]) #eliminate single letters
    return docs_to_clean

In [4]:
readme_df = pd.read_json('readmes.txt')

In [5]:
our_texts, names = readme_df[(readme_df.language== 'Python') & (readme_df.readme.notnull())]['readme'], readme_df[(readme_df.language== 'Python') & (readme_df.readme.notnull())]['name']

In [6]:
readme_list = []
for i in our_texts:
    readme_list.append(i)

In [7]:
our_texts = data_cleanse(readme_list)

In [8]:
stop=[]
with open('stoplist-multilingual.txt') as f:
    stop=f.readlines()
    stop=[word.strip('\n') for word in stop]

In [9]:
stop_str = "please found find two first would one may png rss custom titles hn metadata demo services meta jewelry ejercicios statistics licensed daily week schedule manager history core board uk top country get see writes name writing live stack nginx column columns wish published social media wanted learn learning plano world official mechanize argument years tags tag comment wrote old term apps como sources archive place pandas records grasa abdomen incinerador according across action actual actually adding addresses admin age agent allow along already always amount analyze another answer answers anything apache apis appear appears append appropriate around array aspx associated attempt attribute attributes authentication auto average avoid back background bad bajar bank bar barriga base bash basically basis big bit box br branch brand break brew budget bug bugs building bunch bus business ca cache caching calendar call calls card cards cases catalog categories category celery certain cfg changed character characters checking checks chicago child choice choose chrome city classes clean cleaning cli click client close co codes collect collected collects come comes commands common complete completed computer con conditions conf configure connect connection console contained contents context control convert cookie cookies copies correct correctly corresponding could count couple crawled crawling crawls creates creating creation credentials cron crontab crummy css csstarget curl dataset datasets dates datetime datos days dc dd de deal debug decided def defaults define defined definition del delay delete depending deploy der design designed desired detail detailed determine developed developer di dict dictionaries dictionary die dir directly dirty disclaimer display distribute distributed div doc docker document documents dom domain domains done downloader due dump dumps duplicate duplicates easier either el element elements else empty enabled engine english enjoy enough ensure entire entries entry env errors es etc even event events eventually every everything ex exactly excel except exchange execution exist existing exists exit express extension external extra extracted extracting extracts false far fast faster feature fetch fetcher fetching field figure filename fill filter final finally finding finds fine finished firefox fix flag flask follows foo fork formats formatted formatting frequency front full fun functionality gather generated generates gets give gives global glossary gnu go goal goes going graph great group groups gui guide handle handling hard head header headers high hit host hosted hour hours house however hr href idea ids il imdb imgur implement implementation implemented implied important improve inc incident individual ini initial initialize insert inside instance int integer intended interactive interest interested interesting internet intro introduction io ip ipython item javascript join js keep keys keyword keywords kind known la label language large last layout least left length less let lets li liability liable lib likely limit limited lines linux listed lists loaded loading localhost locally located locations logging logic login logs long looking looks los mac machine made making manage manually many master matches matching max maximum maybe means meant members memory menu merge methods might mine minimum minute minutes missing mkdir mkvirtualenv mlb mm mode model models modified moment mongo mongodb monitor month mostly move much multi must mysql navigate necessary network never next nice night node nodes none notebook nothing notice null numbers objects obtain often ones online original os others otherwise overview para parameter parameters params parsed parses part parts party pass passed past paste pattern pdf pdfs per perform performance period permission phantomjs php pickle pipeline pipelines plain platform plugin popular populate por port position posted postgres postgresql pprint pre prediction prerequisites present press pretty previous primary prints problem problems processed processing programming programs progress prompt proper properly property proxies proxy pulls purposes push put pyllage pypi pyquery quality que quemar queries question queue quickly quotes random range rank rankings rate rather raw re reading readme readthedocs ready real reason rebajar received recent recommend recommended refer reference regex register regular release released releases relevant remove rename replace report reporting reports require research resource resources response rest resulting retrieve retrieved returned row rows ruby saves saving schema scrap scrapper scrapyd screen se searches searching seconds seems selector selectors self sell sends sent sentence separate separated serve servers session sets setting several shall share shell short show shows similar since size sleep something sometimes soon sort span special specifically speed spiders split spreadsheet sqlalchemy ssh standard starting statement static stdout step steps still stock stop storing strings structured stuff style sub subject submit summary supported supports system tab tables take taken target task tasks tell template templates testing thanks thing think third though thread threads three timestamp tmp together token tor total track tracker tree tried tries trying turn tutorial types ubuntu ul un una und unicode unique unit unix updates upload upon urllib us usr usually utf utility valid var varchar variable variables venv verbose versions via view virtual virtualenvwrapper vs wait warning webpage webscraper webscraping welcome well wget whatever whether whole window within without word words wrapper xpath yet yield zip"
more_stops= stop_str.split()

In [10]:
stop = set(more_stops + stop)

In [11]:
stop_rev = []
for i in stop:
    stop_rev.append(unicode(i))

In [12]:
stop = stop_rev

In [24]:
texts = [[word for word in document.lower().split() if word not in stop] for document in our_texts]

In [25]:
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

In [26]:
number_topics=40
model = models.LdaMulticore(corpus, id2word=dictionary, num_topics=number_topics, passes=10)

In [27]:
topics_indexed=[[b for (a,b) in topics] for topics in model.show_topics(number_topics,10,formatted=False)]
topics_indexed=pd.DataFrame(topics_indexed)
topics_indexed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,charades,returns,ideas,current,gathering,game,snadalivegmailcom,form,source,library
1,class,amazon,reviews,python3,table,returns,print,downloading,image,single
2,team,django,class,articles,clone,worldcup,cd,installation,teams,easy
3,lxml,directory,source,environment,job,download,git,movie,clone,read
4,hodor,logical,illogical,episode,spock,season,star,trek,mudd,hodorhodor
5,email,usage,csv,directory,council,clone,requires,scrapyscraper,awesome,websites
6,mortgage,table,things,scrapesequencepy,default,stores,link,details,object,inserts
7,ncbi,nsw,takes,tool,record,address,usage,friend,check,government
8,craigslist,listings,appartment,appraisal,okapi,status,preflabel,nivel,analysis,aurelia
9,stats,collection,facebook,tool,deals,usage,save,scroll,automating,nytimes


In [17]:
# this topics matrix is still plagued with stopwords
#I'll go through each word in a sublime text file and create an additional list of stopwords
topics_1 = []
for i in topics_indexed:
    for n in i:
        topics_1.append(n)
        

In [18]:
topics_1 = set(topics_1)

In [19]:
print list(topics_1)

[u'code', u'help', u'lyrics', u'aggregating', u'text', u'actions', u'subreddit', u'course', u'london', u'ris', u'query', u'headlines', u'mayor', u'issues', u'bvl', u'web', u'27', u'radios', u'ncbi', u'title', u'crawl', u'selenium', u'wwoofscrape', u'simple', u'add', u'program', u'listing', u'usage', u'2014', u'folder', u'indicating', u'scraperwiki', u'started', u'freeaudiobooks', u'exports', u'return', u'finance', u'format', u'python', u'jazz', u'db', u'citenet', u'game', u'facebook', u'decimal102', u'scripts', u'dokku', u'fab', u'projetslisterprojetsaction', u'projects', u'username', u'articles', u'runs', u'scrapers', u'socialscrape', u'university', u'companies', u'email', u'magnet', u'timeout', u'team', u'movie', u'bamboo', u'restaurants', u'user', u'profile', u'set', u'list', u'twitter', u'scrapes', u'scraper', u'pyproxy', u'hodor', u'video', u'torrents', u'images', u'scraped', u'informacin', u'snadalivegmailcom', u'hacker', u'reviews', u'bclayseasupennedu', u'illogical', u'mortgage

In [22]:
further_stops = [u'code', u'help', u'aggregating', u'text', u'actions', u'ris', u'query', u'headlines', u'issues', u'web', u'27', u'radios', u'title', u'crawl', u'selenium', u'wwoofscrape', u'simple', u'add', u'program', u'2014', u'folder', u'indicating', u'scraperwiki', u'started', u'exports', u'return', u'format', u'python', u'db', u'decimal102', u'scripts', u'username', u'runs', u'scrapers', u'socialscrape', u'magnet', u'timeout', u'bamboo', u'user', u'profile', u'set', u'list', u'scrapes', u'scraper', u'pyproxy', u'scraped', u'05', u'res', u'beautifulsoup', u'content', u'waiting', u'score', u'import', u'scraperpy', u'public', u'retry10', u'modify', u'wi', u'dependencies', u'news', u'search', u'output', u'license', u'website', u'times', u'requests', u'software', u'warranty', u'app', u'scores', u'number', u'follower', u'api', u'path', u'semanal', u'counts', u'froriz50gatechedu', u'risk', u'httpwwwgooglecom', u'script', u'create', u'json', u'heroku', u'basic', u'free', u'formato', u'function', u'xlsx', u'documentation', u'worker', u'spider', u'export', u'ward', u'nicholasgonzalezyaleedu', u'true', u'11', u'06', u'morph', u'scraping', u'originally', u'work', u'jobid', u'project', u'soup', u'sqllite3', u'2013', u'install', u'error', u'virtualenv', u'example', u'lbd', u'beautiful', u'files', u'budscraper', u'links', u'sudo', u'site', u'general', u'sample', u'rent', u'file', u'pip', u'dumpxlsx', u'vagrant', u'scrape', u'note', u'scrapy', u'html', u'sqlite3', u'android', u'config', u'contacting', u'development', u'feedscraper', u'bruce', u'umbria', u'running', u'mapa', u'portal', u'plan', u'sql', u'requirements', u'beautifulsoup4', u'data', u'wwooffr', u'helpdescription', u'2009', u'database', u'url', u'httpwwwafdfrbase', u'resolving', u'requirementstxt', u'mortgages', u'contact', u'command', u'socialmediascraper', u'time', u'democratic', u'latest']

In [23]:
stop = stop + further_stops