In [2]:
# imports

import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import os
import preprocessor as p  #pip install tweet-preprocessor

In [3]:
Home = os.getcwd() # Run this only once

Below are rules to find out concepts for Thanksgiving week

In [5]:
# fetching the list of .txt files
path = os.path.join(Home,"data/ThanksGiving")
list_doc = os.listdir(path)
os.chdir(path)


In [6]:
print("Loading\n")
doc_complete = []
for file in list_doc:
    print(file)
    content = open(file,'r').read().lower()          # lowering all the content
    #Using the tweet-preprocessor model to remove:
    #URLs, Hashtags, Mentions, Reserved, words(RT, FAV), Emojis, Smileys
    doc_complete.append(p.clean(str(content)))

Loading

Nov 21 2017.txt
Nov 25 2017.txt
Nov 23 2017.txt
Nov 24 2017.txt
Nov 26 2017.txt
Nov 22 2017.txt


In [7]:
# Removing stop words, Implementing the tf-idf Vectorizing, n-gram model
stopset = set(stopwords.words('english'))
stopset.update(["https","http","https://","thanksgiving"])
vectorizer = TfidfVectorizer(stop_words=stopset, use_idf=True, ngram_range=(1,3))

In [8]:
X =vectorizer.fit_transform(doc_complete)

LSA:

Input: X, a matrix where m is number of documents and n is the number of terms.

We are going to decompose X into three matrices U, S and T. After the decomposition we determing k, that is the number of concepts that we are going to find. 

                                           X = U S V^t

U will be m x k matrix. The rows will be documents and the columns will be the concepts.
S will be k x k diagoal matrix.The elements will be the amount of variations captured from the concepts.
V will be m x k (Transpose) matrix. The rows will be terms and the columns will be the concepts.

In [9]:
# Latent Semantic Analysis

lsa = TruncatedSVD(n_components=5,n_iter=100)
lsa.fit(X)

terms = vectorizer.get_feature_names()

In [10]:
for i,comp in enumerate(lsa.components_):
    termsInComp = zip(terms,comp)
    sortedterms = sorted(termsInComp, key=lambda x: x[1],reverse=True)[:10]
    print("Concept %d:" % i)
    for term in sortedterms:
        print(term[0])
    print(" ")

Concept 0:
happy
family
day
hope
great
weekend
dinner
break
holiday
time
 
Concept 1:
break
weekend
leftovers
hope
back
great
christmas
mlbb giveaway
via
mlbb
 
Concept 2:
break
week
america vote
america vote yes
bless america vote
concern god
concern god bless
discover rescue
discover rescue plan
ebook discover
 
Concept 3:
harvey weinstein accusers
accusers spend
accusers spend holiday
holiday together
spend holiday together
weinstein accusers spend
climb
climb record
climb record high
online sales climb
 
Concept 4:
day
leftovers
yesterday
mlbb giveaway
mlbb
hope
giveaway
man charged
friday
class cookbook
 


Below we are applying the same rules to find out concepts for Las Vegas Shooting

In [11]:
# fetching the list of .txt files
path = os.path.join(Home,"data/LasVegasShooting")
list_doc = os.listdir(path)
os.chdir(path)

print("Loading\n")
doc_complete = []
for file in list_doc:
    print(file)
    content = open(file,'r').read().lower()          # lowering all the content
    #Using the tweet-preprocessor model to remove:
    #URLs, Hashtags, Mentions, Reserved, words(RT, FAV), Emojis, Smileys
    doc_complete.append(p.clean(str(content)))

print("\n")

# Removing stop words, Implementing the tf-idf Vectorizing, n-gram model
stopset = set(stopwords.words('english'))
stopset.update(["https","http","https://","las","las vegas","vegas"])

vectorizer = TfidfVectorizer(stop_words=stopset, use_idf=True, ngram_range=(1,3))

X =vectorizer.fit_transform(doc_complete)

# Latent Semantic Analysis

lsa = TruncatedSVD(n_components=4,n_iter=100)
lsa.fit(X)

terms = vectorizer.get_feature_names()

for i,comp in enumerate(lsa.components_):
    termsInComp = zip(terms,comp)
    sortedterms = sorted(termsInComp, key=lambda x: x[1],reverse=True)[:10]
    print("Concept %d:" % i)
    for term in sortedterms:
        print(term[0])
    print(" ")

Loading

Sep 28 2017.txt
Oct 01 2017.txt
Sep 29 2017.txt
Sep 30 2017.txt
Oct 02 2017.txt
Oct 03 2017.txt


Concept 0:
shooting
new
video
like
go
see
via
get
going
one
 
Concept 1:
shooting
victims
massacre
prayers
shooter
mass shooting
tragedy
mass
gun
people
 
Concept 2:
robbery
nine years
prison
simpson
years prison
years
nine years prison
years robbery
botched robbery
simpson served
 
Concept 3:
edc
going
tickets
go
kuchar
er shooting police
jhonattan
time
like
scott
 


Below we are applying the same rules to find out concepts from 10 days of breaking news

In [13]:
# fetching the list of .txt files
path = os.path.join(Home,"data/RecentDays")
list_doc = os.listdir(path)
os.chdir(path)

print("Loading\n")
doc_complete = []
for file in list_doc:
    print(file)
    content = open(file,'r').read().lower()          # lowering all the content
    #Using the tweet-preprocessor model to remove:
    #URLs, Hashtags, Mentions, Reserved, words(RT, FAV), Emojis, Smileys
    doc_complete.append(p.clean(str(content)))

print("\n")
# Removing stop words, Implementing the tf-idf Vectorizing, n-gram model
stopset = set(stopwords.words('english'))
stopset.update(["https","http","https://","rt","really","decided"])
vectorizer = TfidfVectorizer(stop_words=stopset, use_idf=True, ngram_range=(1,3))

X =vectorizer.fit_transform(doc_complete)

# Latent Semantic Analysis

lsa = TruncatedSVD(n_components=4,n_iter=100)
lsa.fit(X)

terms = vectorizer.get_feature_names()

for i,comp in enumerate(lsa.components_):
    termsInComp = zip(terms,comp)
    sortedterms = sorted(termsInComp, key=lambda x: x[1],reverse=True)[:10]
    print("Concept %d:" % i)
    for term in sortedterms:
        print(term[0])
    print(" ")

Loading

@SkyNewBreak.txt
@BBCBreaking.txt
@CBSTopNews.txt
@ABCNewsLive.txt
@ReutersLive.txt
@AJELive.txt
@TWCBreaking.txt
@WSJbreakingnews.txt
@BreakingNews.txt
@cnnbrk.txt


Concept 0:
army
first
admit
admit army
admit army invaded
army invaded
army invaded kanu
army must
defence minister
defence minister admit
 
Concept 1:
news
8u
trump
reports
let
cnn
would
house
says
us
 
Concept 2:
wakey
wakey ignore
wakey wakey
wakey wakey ignore
aide might
aide might story
army attacks
army attacks villages
arrested
arrested exception
 
Concept 3:
reports
tillerson
says
replaced
secretary
gaddafi
cia
briefing
cia director
cia director pompeo
 
