# SMART EMAIL CLASSIFIER

## Build model classifier 
The goal of this notebook is to create the model that classifies the data

## 1) Import libraries

In [1]:
from argparse import ArgumentParser
from bs4 import BeautifulSoup
from gensim.models import KeyedVectors
from gensim.models import Word2Vec 
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from nltk import map_tag, pos_tag, sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit, RandomizedSearchCV,train_test_split,StratifiedKFold
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import LinearSVC, SVC
# from sklearn.cross_validation import train_test_split,StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
import nltk
import numpy as np
import os
import pandas as pd
import pickle
import re
import string
import sys
import warnings
import wikipedia
from gensim import corpora
from gensim.models.ldamulticore import LdaModel as Lda
import pyLDAvis.gensim
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import math
%matplotlib inline
from wordcloud import WordCloud
import time
from operator import itemgetter
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')
warnings.filterwarnings("ignore", category=DeprecationWarning) 
import scikitplot as skplt
from xgboost.sklearn import XGBClassifier
from xgboost import plot_importance






## 2) Prepare the training dataset

Our email classifier will sort emails into these categories: business, leisure, news, personal, research and spam.

We need datasets to use as training input. For that, we use two different sources: Wikipedia and a spam dataset (http://www.dt.fee.unicamp.br/~tiago/smsspamcollection/).

The inputs are two files:

- 'Pages_Names_Sample.txt', which contains these page names, one per line: Social Media, Business, Job, Sports, Email Spam, Entertainment, Science, Politics, Trading. These are the inputs for the Wikipedia search.
- 'spam_dataset.csv', a cleaned version of the source, downloaded from http://www.dt.fee.unicamp.br/~tiago/smsspamcollection/smsSpamCollection.arff

Both files should be in '../data/databases'

In [2]:
def get_wikiPages(filename):
    """Download the Wikipedia pages whose names are listed in a text file.

    Every non-blank line of ``filename`` is passed to ``wikipedia.page``.
    Retrieval is best-effort: a page that cannot be fetched is reported on
    stdout and skipped, so one bad name does not abort the whole run.

    Args:
        filename: Path of a text file holding one page name per line.

    Returns:
        A DataFrame with the columns 'title' and 'content', one row per page
        that was retrieved successfully.
    """
    titles = []
    contents = []
    # `with` closes the handle even if something unexpected happens mid-loop.
    with open(filename, "r") as pages_file:
        for raw_name in pages_file:
            # Drop the trailing newline; ignore blank lines entirely.
            page_name = raw_name.strip()
            if not page_name:
                continue
            try:
                page = wikipedia.page(page_name)
                # Read both attributes before touching the result lists so
                # that a failure on `content` cannot leave them out of sync.
                title = page.title
                body = page.content
            except Exception as exc:
                # Still best-effort, but no longer silent and no longer
                # swallowing KeyboardInterrupt / SystemExit.
                print('    Skipping Wikipedia Page: %s (%s)'
                      % (page_name, type(exc).__name__))
                continue
            titles.append(title)
            contents.append(body)
            print(('    Parsing Wikipedia Page: '+title))
    return pd.DataFrame({'title': titles, 'content': contents},
                        columns=['title', 'content'])

def get_wikiPages_df(topic):
    """Scrape the Wikipedia pages listed for ``topic`` and save them as CSV.

    Reads page names (one per line) from '../data/databases/<topic>.csv',
    fetches each page with ``wikipedia.page`` and writes the titles and
    contents to '../data/databases/<topic>_data.csv'. Retrieval is
    best-effort: pages that cannot be fetched are reported and skipped.

    Args:
        topic: Topic name used to build both the input and the output path.

    Returns:
        None. The result is written to disk.
    """
    print('  Parsing Wikipedia webpages about %s'%topic)
    filename = '../data/databases/%s.csv'%topic
    titles = []
    contents = []
    # `with` closes the handle even if something unexpected happens mid-loop.
    with open(filename, "r") as pages_file:
        for raw_name in pages_file:
            # Drop the trailing newline; ignore blank lines entirely.
            page_name = raw_name.strip()
            if not page_name:
                continue
            try:
                page = wikipedia.page(page_name)
                # Read both attributes before touching the result lists so
                # that a failure on `content` cannot leave them out of sync.
                title = page.title
                body = page.content
            except Exception as exc:
                # Still best-effort, but no longer silent and no longer
                # swallowing KeyboardInterrupt / SystemExit.
                print('   Skipping Wikipedia Page: %s (%s)'
                      % (page_name, type(exc).__name__))
                continue
            titles.append(title)
            contents.append(body)
            print(('   Parsing Wikipedia Page: '+title))
    df = pd.DataFrame({'title': titles, 'content': contents},
                      columns=['title', 'content'])
    # `sep` is passed by keyword: positional arguments after the path are
    # deprecated in pandas 2.x.
    df.to_csv('../data/databases/%s_data.csv'%topic, sep=',', header=True,
              columns=["title","content"], encoding='utf-8')


def clean_BOW(original):
    """Reduce a raw Wikipedia page to a bag of lemmatised nouns.

    Steps: strip HTML with BeautifulSoup, remove ``{{...}}`` templates,
    non-ASCII characters and ``[[File:...]]`` / ``[[Image:...]]`` links,
    lowercase, keep only tokens POS-tagged as nouns, discard tokens that look
    like e-mail addresses / URLs / paths, lemmatise, remove digits and
    punctuation, and finally drop stop words and words whose length is not
    between 3 and 14 characters.

    NOTE(review): relies on the module-level names ``lemma`` and ``stop``,
    which are not defined in the part of the file reviewed here -- confirm
    they exist (presumably a WordNetLemmatizer and a stop-word collection).

    Args:
        original: Raw page text (may contain HTML / wiki markup).

    Returns:
        A list of cleaned words, in order of appearance.
    """
    text = BeautifulSoup(original, "lxml").text
    # Applied in this exact order; each entry is (pattern, replacement).
    substitutions = (
        (r"{{.*}}", ""),
        (r'[^\x00-\x7F]', ' '),
        (r"\[\[File:.*\]\]", ""),
        (r"\[\[Image:.*\]\]", ""),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Lowercase and collapse every whitespace run to a single space.
    text = " ".join(text.lower().split())

    noun_tags = ('NN', 'NNP', 'NNS', 'NNPS')
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    nouns = [token for token, tag in tagged_tokens if tag in noun_tags]

    # Fragments that mark a token as an address, URL or path.
    banned_fragments = ("@", '.com', 'www', 'http', '/', '\\', ':')
    digit_remover = {ord(ch): None for ch in '0123456789'}
    lemmas = []
    for noun in nouns:
        if any(fragment in noun for fragment in banned_fragments):
            continue
        # Lemmatise as a noun first, then as a verb, then strip digits.
        lemmas.append(
            lemma.lemmatize(lemma.lemmatize(noun), 'v').translate(digit_remover))

    no_punctuation = re.sub('[%s]' % re.escape(string.punctuation), ' ',
                            " ".join(lemmas))
    return [word for word in no_punctuation.split()
            if word not in stop and 2 < len(word) < 15]

def clean_doc2vec(original):
    """Clean a Wikipedia page while keeping it as ordinary running text.

    Unlike a bag of words, the result keeps sentences, case and punctuation;
    it is meant to be the input of doc2vec. The function strips HTML with
    BeautifulSoup, removes ``{{...}}`` templates, replaces non-ASCII
    characters with spaces, removes ``[[File:...]]`` / ``[[Image:...]]``
    links and collapses whitespace.

    Args:
        original: Raw page text (may contain HTML / wiki markup).

    Returns:
        The cleaned text as a single string with single-space separators.
    """
    text = BeautifulSoup(original, "lxml").text
    # Applied in this exact order; each entry is (pattern, replacement).
    substitutions = (
        (r"{{.*}}", ""),
        (r'[^\x00-\x7F]', ' '),
        (r"\[\[File:.*\]\]", ""),
        (r"\[\[Image:.*\]\]", ""),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return " ".join(text.split())

def scrapping_wikipedia(topic):
    """Report whether the scraped data file for ``topic`` already exists.

    Whitespace in ``topic`` is replaced by underscores and the function then
    checks for '../data/databases/<topic>_data.csv', printing the outcome.
    The actual scraping call is currently disabled (see the comment below),
    so a missing file is only reported, not created.

    Args:
        topic: Topic name; may contain spaces.

    Returns:
        None.
    """
    # Imported here because the notebook's import cell does not provide
    # `Path`; a local import keeps the function usable on its own, including
    # when it is mapped over a multiprocessing pool.
    from pathlib import Path

    topic = '_'.join(topic.split())
    if Path('../data/databases/%s_data.csv'%topic).exists():
        print('Topic %s exists'%topic)
    else:
        # Scraping is switched off; call get_wikiPages_df(topic) here to
        # download the missing topic instead of only reporting it.
        print('Topic %s does not exist'%topic)




# Topic name (underscore-joined Wikipedia page title) -> label column that is
# set to 1 for the rows scraped for that topic. Topics not listed here keep
# every label at 0.
_TOPIC_TO_LABEL = {
    'Family': 'Personal', 'Party': 'Personal', 'Event': 'Personal',
    'Marriage': 'Personal',
    'Business': 'Business', 'Economy': 'Business', 'Finance': 'Business',
    'Money': 'Business', 'Trade': 'Business',
    'Sport': 'Sport', 'Football': 'Sport', 'Basketball': 'Sport',
    'FIFA': 'Sport',
    'Entertainment': 'Entertainment', 'Cinema': 'Entertainment',
    'Television': 'Entertainment', 'Theatre': 'Entertainment',
    'Art': 'Entertainment',
    'Science': 'Research', 'Physics': 'Research', 'Astronomy': 'Research',
    'Chemistry': 'Research',
    'Politics': 'Politics', 'Government': 'Politics',
}

# (label column, integer class code), applied in this order so that a later
# entry wins if a row ever carries more than one label.
_LABEL_CODES = (
    ('Business', 1),
    ('Entertainment', 2),
    ('Politics', 3),
    ('Personal', 4),
    ('Research', 5),
    ('Sport', 6),
    ('Spam', 7),
)


def prepare_training(outfile):
    """Build the labelled training set and write it to ``outfile``.

    Pipeline:
      1. Fetch the seed Wikipedia pages listed in 'Pages_Names_Sample.txt'
         and save them to 'sample_wiki_pages_data.csv'.
      2. Turn every seed page into a set of cleaned words and write those
         words, one per line, to '<Title>.csv' (one file per seed page).
      3. Run ``scrapping_wikipedia`` for every topic in a process pool.
      4. Load each '<topic>_data.csv', attach one-hot label columns, and add
         the spam rows (class 1) of 'spam_dataset.csv'.
      5. Derive the integer 'class' column, save the uncleaned table to
         'all_topics_noclean.csv', add the 'BOW' and 'sentences' columns and
         write the final table (without 'content' and 'title') to ``outfile``.

    All intermediate files live in '../data/databases'. A missing
    '<topic>_data.csv' makes ``pd.read_csv`` raise.

    Args:
        outfile: Path of the CSV file that receives the cleaned training set.

    Returns:
        None.
    """
    # Local import: the notebook's import cell does not provide `mp`.
    import multiprocessing as mp

    label_columns = ['Personal', 'Business', 'Sport', 'Entertainment',
                     'Research', 'Politics', 'Spam']

    # We first parse the wikipedia webpages of the general topics (business, sports...)
    df = get_wikiPages('../data/databases/Pages_Names_Sample.txt')
    # `sep` by keyword: positional arguments after the path are deprecated.
    df.to_csv('../data/databases/sample_wiki_pages_data.csv', sep=',',
              header=True, columns=["title","content"], encoding='utf-8')
    # Each distinct word of a seed page becomes a further page name to parse.
    df['clean'] = [list(set(clean_BOW(webpage)))
                   for webpage in df['content'].tolist()]

    # Write the word lists in the same one-name-per-line layout as
    # 'Pages_Names_Sample.txt'.
    topics = []
    for title, words in zip(df['title'], df['clean']):
        topic = '_'.join(title.split())
        with open('../data/databases/%s.csv'%topic, 'w', newline='') as g:
            g.write("\n".join(words))
        topics.append(topic)

    # Parsing many Wikipedia pages is slow, so the topics are spread over a
    # process pool; close/join always run, even if a worker raises.
    pool = mp.Pool(processes=20)
    try:
        pool.map(scrapping_wikipedia, topics)
    finally:
        pool.close()
        pool.join()

    # Collect the per-topic frames and concatenate once at the end
    # (DataFrame.append was removed in pandas 2.0).
    frames = []
    for topic in topics:
        df3 = pd.read_csv('../data/databases/%s_data.csv'%topic).drop(
            columns=['Unnamed: 0']).dropna()
        df3['title'] = '%s'%topic
        for column in label_columns:
            df3[column] = 0
        label = _TOPIC_TO_LABEL.get(topic)
        if label is not None:
            df3[label] = 1
        frames.append(df3)

    spam_source = pd.read_csv('../data/databases/spam_dataset.csv',
                              header=None, names=['content','class'])
    # drop() returns a new frame, so the assignments below never write into
    # a view of `spam_source`.
    df_spam = spam_source[spam_source['class']==1].drop(columns=['class'])
    df_spam['title'] = 'Spam'
    for column in label_columns:
        df_spam[column] = 1 if column == 'Spam' else 0
    frames.append(df_spam)

    # sort=True orders the columns alphabetically, as the appends used to.
    df_all = pd.concat(frames, sort=True)

    # Integer class per row. The mask is handed over as a plain array because
    # the concatenated index contains duplicates.
    df_all['class'] = 0
    for column, code in _LABEL_CODES:
        df_all.loc[(df_all[column] == 1).values, 'class'] = code

    # We save the intermediate steps
    df_all.to_csv('../data/databases/all_topics_noclean.csv', index_label=False)
    df_all['BOW'] = df_all['content'].apply(clean_BOW)
    df_all['sentences'] = df_all['content'].apply(clean_doc2vec)
    df_all.drop(columns=['content','title']).dropna().to_csv(
        outfile, index_label=False)



## 3) Running the models
To compute the word embeddings (using word2vec), the following file must be present: '../data/GoogleNews-vectors-negative300.bin.gz'

In [3]:
# Load NLTK's English stop-words list once, as a set; it is passed as the
# `stop_words` argument of the TfidfVectorizer pipelines built further down.
stop_words = set(stopwords.words('english'))
#
# Part-of-speech tagging helpers (tag_pos, post_tag_documents)
#

def tag_pos(x):
    """Part-of-speech tag every sentence of a text with universal tags.

    The text is split into sentences, each sentence is tokenised and tagged
    by ``pos_tag``, and every tag is then mapped from the 'en-ptb' tag set
    to the 'universal' one.

    Args:
        x: The text to tag.

    Returns:
        A list with one entry per sentence; each entry is a list of
        ``(word, universal_tag)`` tuples.
    """
    tagged_sentences = []
    for sentence in sent_tokenize(x):
        penn_tagged = pos_tag(word_tokenize(sentence))
        tagged_sentences.append([
            (token, map_tag('en-ptb', 'universal', penn_tag))
            for token, penn_tag in penn_tagged
        ])
    return tagged_sentences


def post_tag_documents(data_df):
    """POS-tag the 'sentences' column and pair each document with its labels.

    Args:
        data_df: DataFrame with a 'sentences' column plus 'BOW' and 'class'
            columns; every remaining column is treated as a label column.

    Returns:
        A tuple ``(x_data, y_data)``: ``x_data`` holds the output of
        ``tag_pos`` for each document, ``y_data`` the matching row of label
        values (a NumPy array per document).
    """
    # `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    plots = data_df['sentences'].values.tolist()
    genres = data_df.drop(['BOW', 'sentences', 'class'], axis=1).values
    total = len(plots)

    x_data = []
    y_data = []
    # Count from 1 so the progress line shows the number of documents done.
    for done, (plot, genre_row) in enumerate(zip(plots, genres), start=1):
        x_data.append(tag_pos(plot))
        y_data.append(genre_row)
        if done % 5000 == 0:
            print(done, "/", total)

    return x_data, y_data


#
# train classifiers and argument handling
#

def train_test_svm(x_data, y_data, genres):
    """Tune and evaluate a one-vs-rest SVM on precomputed feature vectors.

    Holds out 20% of the data (fixed seed 40) and hands the rest to
    ``grid_search``. Although the banner says "LinearSVC", the estimator is
    a kernel ``SVC`` searched over an RBF grid and a polynomial grid.

    Args:
        x_data: Feature vectors, one per document.
        y_data: Labels aligned with ``x_data``.
        genres: Class names used in the classification report.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x_data, y_data, test_size=0.2, random_state=40)

    print("LinearSVC")

    rbf_grid = {
        'clf__estimator__kernel': ['rbf'],
        'clf__estimator__gamma': [1e-3, 1e-4],
        'clf__estimator__C': [1, 10],
    }
    poly_grid = {
        'clf__estimator__kernel': ['poly'],
        'clf__estimator__C': [1, 10],
    }
    svm_pipeline = Pipeline([
        ('clf', OneVsRestClassifier(SVC(), n_jobs=1)),
    ])

    grid_search(x_train, y_train, x_test, y_test, genres,
                [rbf_grid, poly_grid], svm_pipeline)

def train_test_logit(x_data, y_data, genres):
    """Tune and evaluate one-vs-rest logistic regression on feature vectors.

    Holds out 20% of the data (fixed seed 40) and grid-searches the
    regularisation strength ``C`` and the class weighting on the rest.

    Args:
        x_data: Feature vectors, one per document.
        y_data: Labels aligned with ``x_data``.
        genres: Class names used in the classification report.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x_data, y_data, test_size=0.2, random_state=40)

    print("LogisticRegression")

    logit_grid = {
        "clf__estimator__C": [0.01, 0.1, 1],
        "clf__estimator__class_weight": ['balanced', None],
    }
    logit_pipeline = Pipeline([
        ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1)),
    ])

    grid_search(x_train, y_train, x_test, y_test, genres,
                logit_grid, logit_pipeline)


def train_test_nb(x_data, y_data, genres):
    """Tune and evaluate a TF-IDF + one-vs-rest multinomial naive Bayes model.

    Holds out 20% of the data (fixed seed 40) and grid-searches the TF-IDF
    and smoothing parameters on the rest. Like the other ``train_test_*``
    helpers, it returns normally when done instead of terminating the
    interpreter with a failure status.

    NOTE(review): ``run_models`` calls this with averaged word2vec vectors,
    but the pipeline starts with a TfidfVectorizer, which works on text
    documents -- confirm which input is intended.

    Args:
        x_data: Documents, one per sample.
        y_data: Labels aligned with ``x_data``.
        genres: Class names used in the classification report.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x_data, y_data, test_size=0.2, random_state=40)

    pipeline = Pipeline([
        # Recent scikit-learn accepts only 'english', a list or None for
        # `stop_words`, so the module-level set is converted to a list.
        ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
        ('clf', OneVsRestClassifier(MultinomialNB(
            fit_prior=True, class_prior=None))),
    ])
    parameters = {
        'tfidf__max_df': (0.25, 0.5, 0.75),
        'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
        'clf__estimator__alpha': (1e-2, 1e-3),
    }
    grid_search(x_train, y_train, x_test, y_test, genres, parameters, pipeline)



def grid_search(train_x, train_y, test_x, test_y, genres, parameters, pipeline,
                cv=2, n_jobs=30):
    """Grid-search ``pipeline`` on the training data and report test results.

    Fits a ``GridSearchCV`` over ``parameters``, prints the steps of the best
    estimator, predicts the test set with it and prints a classification
    report.

    Args:
        train_x: Training samples.
        train_y: Training labels.
        test_x: Held-out samples.
        test_y: Held-out labels.
        genres: Class names, passed as ``target_names`` to the report.
        parameters: Parameter grid (a dict or a list of dicts).
        pipeline: The estimator / pipeline to tune.
        cv: Number of cross-validation folds (default 2, as before).
        n_jobs: Number of parallel jobs (default 30, as before); lower it on
            machines with fewer cores.

    Returns:
        The best estimator found, refitted on the full training data.
    """
    grid_search_tune = GridSearchCV(
        pipeline, parameters, cv=cv, n_jobs=n_jobs, verbose=10)
    grid_search_tune.fit(train_x, train_y)
    best_clf = grid_search_tune.best_estimator_

    print()
    print("Best parameters set:")
    print(best_clf.steps)
    print()

    print("Applying best classifier on test data:")
    predictions = best_clf.predict(test_x)

    print(classification_report(test_y, predictions, target_names=genres))
    return best_clf


def parse_arguments():
    """Build the command-line parser and parse ``sys.argv``.

    Options:
        --clf      Classifier to train: 'nb', 'linearSVC', 'logit' or
                   'xgboost' (the last one is handled by ``run_models`` for
                   TF-IDF vectors and was missing from the accepted choices).
        --vectors  Text representation: 'tfidf', 'word2vec' or 'doc2vec'.

    Returns:
        A tuple ``(arg_parser, parsed_args)``; the parsed values are available
        as ``parsed_args.classifier`` and ``parsed_args.vectors``.
    """
    arg_parser = ArgumentParser()

    arg_parser.add_argument(
        '--clf', dest='classifier',
        choices=['nb', 'linearSVC', 'logit', 'xgboost'],
        help='classifier to train')

    arg_parser.add_argument(
        '--vectors', dest='vectors', type=str,
        choices=['tfidf', 'word2vec', 'doc2vec'],
        help='text representation to use')

    return arg_parser, arg_parser.parse_args()


In [4]:
def _build_tfidf_search(classifier):
    """Return ``(pipeline, parameters)`` for a TF-IDF model, or None if unknown."""
    # Recent scikit-learn accepts only 'english', a list or None for
    # `stop_words`, so the module-level set is converted to a list.
    tfidf_step = ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words)))
    tfidf_grid = {
        'tfidf__max_df': (0.7,0.8,0.9),
        'tfidf__min_df': (1,3,5),
    }
    ngram_grid = {'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)]}
    linear_grid = {
        "clf__estimator__C": [0.01, 0.1, 1],
        "clf__estimator__class_weight": ['balanced', None],
    }

    if classifier == 'nb':
        # MultinomialNB: Multi-Class OneVsRestClassifier
        final_step = ('clf', OneVsRestClassifier(MultinomialNB(
            fit_prior=True, class_prior=None)))
        grids = (tfidf_grid, ngram_grid,
                 {'clf__estimator__alpha': (1e-2, 1e-3)})
    elif classifier == 'linearSVC':
        final_step = ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1))
        grids = (tfidf_grid, ngram_grid, linear_grid)
    elif classifier == 'logit':
        final_step = ('clf', OneVsRestClassifier(
            LogisticRegression(solver='sag'), n_jobs=1))
        grids = (tfidf_grid, ngram_grid, linear_grid)
    elif classifier == 'xgboost':
        # No n-gram search for xgboost.
        final_step = ('classifier', XGBClassifier())
        grids = (tfidf_grid, {
            'classifier__learning_rate': [0.01, 0.03],
            'classifier__n_estimators': [700, 1000, 1200],
            'classifier__max_depth': [3, 4, 5],
        })
    else:
        return None

    parameters = {}
    for grid in grids:
        parameters.update(grid)
    return Pipeline([tfidf_step, final_step]), parameters


def run_models(vectors,classifier):
    """Train and evaluate one classifier on one text representation.

    Loads '../data/databases/all_topics_clean.csv', keeps only the classes
    Spam, Business, Entertainment and Research (relabelled 0-3) and then:

    * ``vectors == 'tfidf'``: makes a stratified 67/33 split of the 'BOW'
      column and grid-searches a TF-IDF pipeline for ``classifier``
      ('nb', 'linearSVC', 'logit' or 'xgboost').
    * ``vectors == 'word2vec'``: averages pretrained GoogleNews vectors per
      document and calls the matching ``train_test_*`` helper
      ('linearSVC', 'nb' or 'logit').

    Any other combination does nothing. The function returns normally when
    training is finished; it no longer ends the interpreter with status -1.

    NOTE(review): the word2vec file is loaded from the working directory,
    while the notebook text asks for it under '../data/' -- confirm the path.
    NOTE(review): class 0 is reported as 'Spam', but 0 is also the default
    class that prepare_training gives to rows of unmapped topics -- confirm
    that mixing them is intended.

    Args:
        vectors: 'tfidf' or 'word2vec'.
        classifier: Name of the classifier to train (see above).

    Returns:
        None.
    """
    print("Loading already processed training data")
    data_df = pd.read_csv('../data/databases/all_topics_clean.csv')
    # We have seen that the higher accuracy is when the topics personal, sport and politics are removed
    data_df = data_df.drop(columns=['Personal', 'Sport','Politics'])
    # Drop classes 3 (Politics), 4 (Personal) and 6 (Sport); the copy makes
    # the relabelling below operate on an independent frame.
    data_df = data_df[~data_df['class'].isin([3, 4, 6])].copy()
    # Spam 7 -> 0 and Research 5 -> 3, leaving the contiguous codes 0..3.
    data_df['class'] = data_df['class'].replace({7: 0, 5: 3})
    # all the list of genres to be used by the classification report
    genres = ['Spam','Business','Entertainment','Research']
    list_labels = data_df["class"].tolist()

    if vectors == 'tfidf':
        search = _build_tfidf_search(classifier)
        if search is None:
            return
        pipeline, parameters = search

        # split the data, leave 1/3 out for testing
        # (`.values` replaces DataFrame.as_matrix(), removed in pandas 1.0)
        data_x = data_df['BOW'].values
        data_y = data_df['class'].values
        stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=0.33)
        train_index, test_index = next(stratified_split.split(data_x, data_y))
        y_train, y_test = data_y[train_index], data_y[test_index]

        # plain lists of documents, as expected by the TfidfVectorizer
        train_x = [doc.strip() for doc in data_x[train_index]]
        test_x = [doc.strip() for doc in data_x[test_index]]

        grid_search(train_x, y_train, test_x, y_test, genres, parameters, pipeline)
        return

    if vectors == 'word2vec':

        print('Running word2vec')
        word2vec_path = "GoogleNews-vectors-negative300.bin.gz"
        word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
        embeddings = get_word2vec_embeddings(word2vec, data_df)
        trainers = {
            'linearSVC': train_test_svm,
            'nb': train_test_nb,
            'logit': train_test_logit,
        }
        trainer = trainers.get(classifier)
        if trainer is not None:
            trainer(embeddings, list_labels, genres)
 
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    """Average the word vectors of a token sequence.

    Args:
        tokens_list: The tokens of one document.
        vector: Mapping from token to vector; it must support ``in`` and
            item lookup.
        generate_missing: If True, a token without a vector contributes a
            fresh random vector (``np.random.rand(k)``); otherwise it
            contributes a zero vector.
        k: Dimensionality of the vectors.

    Returns:
        The element-wise mean over all tokens, or ``np.zeros(k)`` when
        ``tokens_list`` is empty.
    """
    if len(tokens_list) < 1:
        return np.zeros(k)

    # Called once per unknown token, so random fillers differ between tokens.
    make_filler = np.random.rand if generate_missing else np.zeros

    word_vectors = []
    for token in tokens_list:
        if token in vector:
            word_vectors.append(vector[token])
        else:
            word_vectors.append(make_filler(k))

    total = np.sum(word_vectors, axis=0)
    return np.divide(total, len(word_vectors))

def get_word2vec_embeddings(vectors, data_df, generate_missing=False):
    """Compute one averaged word2vec vector per row of ``data_df['BOW']``.

    NOTE(review): each 'BOW' cell is iterated as the token sequence; if the
    column was loaded from CSV the cells are presumably strings rather than
    lists -- confirm they are parsed before this call.

    Args:
        vectors: Word-vector mapping handed to ``get_average_word2vec``.
        data_df: DataFrame with a 'BOW' column.
        generate_missing: Forwarded to ``get_average_word2vec``.

    Returns:
        A list with one averaged vector per row, in row order.
    """
    return [
        get_average_word2vec(tokens, vectors, generate_missing=generate_missing)
        for tokens in data_df['BOW']
    ]



## 4) Validate with articles and emails

In [5]:
def get_theme(ldamodel,doc):
    """Describe the most probable LDA topics of a document.

    The document is cleaned with ``clean_BOW``, converted to a bag of words
    with the model's dictionary and scored by the model; topics below a
    probability of 0.05 are ignored. The remaining topics are ranked by
    probability. If at least three remain, the top three are reported,
    otherwise only the single most probable one.

    Args:
        ldamodel: A trained LDA model exposing ``id2word`` and
            ``get_document_topics`` (50 topics, matching the labels below).
        doc: Raw text of the e-mail, article or document.

    Returns:
        A string such as 'Theme Music 61.30% Theme Movies 20.10% Theme IT
        7.00%', with probabilities shown as percentages, or an empty string
        when no topic reaches the 0.05 threshold.
    """
    # This is the list of labels for the 50 topics of the model. Change with your own classification
    topics = "Education Society France US Society Literature International Chemistry US Sports_and_gaming\
    numbers_and_dates Engineering Politics_and_Government International Was_and_military biology_and_medicine medicine languages Sports America_and_months\
    Canada__and_biology Weather Nature Energy Music IT political_geography UK_and_Ireland Canada_and_UK_Empire Geography\
    Calendar History History_and_literature Music Wikipedia_and_IT Animals Music_and_technology Economy Movies California_and_movies\
    Politics Politics Russia_and_US_affairs EU_and_US_affairs Entertainment Australia Sports Movies_and_TV Australia Geography".split()

    cleandoc = clean_BOW(doc)
    doc_bow = ldamodel.id2word.doc2bow(cleandoc)
    doc_topics = ldamodel.get_document_topics(doc_bow, minimum_probability=0.05)
    if not doc_topics:
        # Nothing reached the threshold; indexing the list would fail.
        return ""

    # Always rank by probability so "the first" really is the most probable.
    ranked = sorted(doc_topics, key=itemgetter(1), reverse=True)
    reported = ranked[:3] if len(ranked) > 2 else ranked[:1]
    # Probabilities are fractions in [0, 1]; scale them to match the '%' sign.
    return ' '.join(
        'Theme %s %.2f%%' % (topics[topic_id], probability * 100)
        for topic_id, probability in reported)

def run_examples(model_path):
        ldamodel = pickle.load(open(model_path, 'rb'))
        # Here one should save in a string the content of an email, article or document, either pasting or with read
        # For simplicity here we use an example from a wikipedia webpage
        article = " Britney Jean Spears (born December 2, 1981) is an American singer, dancer, and actress. Born in McComb, Mississippi, and raised in Kentwood, Louisiana, she appeared in stage productions and television series, before signing with Jive Records in 1997. Spears's first two studio albums, ...Baby One More Time (1999) and Oops!... I Did It Again (2000), were global successes and made her the best-selling teenage artist of all-time. Referred to as the Princess of Pop, Spears was credited with influencing the revival of teen pop during the late 1990s and early 2000s. Spears adopted more mature and provocative themes for her next two studio albums, Britney (2001) and In the Zone (2003), and made her feature film debut in a starring role in Crossroads (2002). Following a series of heavily publicized personal struggles and erratic public behavior, Spears' career underwent a brief hiatus before the release of her fifth studio album Blackout (2007), which is often critically referred to as her best work. Her erratic behavior and hospitalizations led Spears to be placed on a still ongoing conservatorship. She returned to the top of record charts with her sixth and seventh studio albums, Circus (2008) and Femme Fatale (2011). In 2012, Forbes reported that Spears was the highest paid female musician of the year, with earnings of $58 million, having last topped the list in 2002.[1] During the promotion of her eighth and ninth studio albums, Britney Jean (2013) and Glory (2016), Spears embarked on a four-year concert residency, Britney: Piece of Me, at Planet Hollywood Resort & Casino in Las Vegas. Five of Spears' singles have reached number one in the United States: ...Baby One More Time, Womanizer, 3, Hold It Against Me  and  S&M . Other singles,  Oops!... I Did It Again  and  Toxic , topped Australian and Canadian charts. 
Spears has earned numerous awards and accolades, including a Grammy Award, six MTV Video Music Awards, including the Video Vanguard Award, seven Billboard Music Awards, including the Millennium Award[2] and a star on the Hollywood Walk of Fame. Billboard ranked her as the eighth biggest artist of the 2000s decade.[3] One of the world's best-selling music artists, Spears has sold over 100 million records worldwide.[4] In the United States, Spears remains the fourth best-selling female album artist of the Nielsen SoundScan era,[5] as well as the best-selling female albums artist of the 2000s.[6] In 2004, she launched a perfume brand with Elizabeth Arden, Inc., from which sales exceeded US$1.5 billion, as of 2012.[7] "
        print("Theme Britney Spears -> ",get_theme(ldamodel,article))
        article="Cristiano Ronaldo dos Santos Aveiro GOIH ComM (European Portuguese: [kɾiʃˈtjɐnu ʁoˈnaɫdu]; born 5 February 1985) is a Portuguese professional footballer who plays as a forward for Italian club Juventus and the Portugal national team. Often considered the best player in the world and regarded by many as one of the greatest players of all time,[4][5][6] Ronaldo has a record-tying five Ballon d'Or awards,[note 2] the most for a European player, and is the first player to win four European Golden Shoes. He has won 26 trophies in his career, including five league titles, five UEFA Champions League titles and one UEFA European Championship. A prolific goalscorer, Ronaldo holds the records for most official goals scored in Europe's top-five leagues (399), the UEFA Champions League (120), the UEFA European Championship (9), as well as those for most assists in the UEFA Champions League (34) and the UEFA European Championship (6). He has scored 680 senior career goals for club and country.  Born and raised on the Portuguese island of Madeira, Ronaldo was diagnosed with a racing heart at age 15. He underwent an operation to treat his condition, and began his senior club career playing for Sporting CP, before signing with Manchester United at age 18 in 2003. After winning his first trophy, the FA Cup, during his first season in England, he helped United win three successive Premier League titles, a UEFA Champions League title, and a FIFA Club World Cup. By age 22, he had received Ballon d'Or and FIFA World Player of the Year nominations and at age 23, he won his first Ballon d'Or and FIFA World Player of the Year awards. In 2009, Ronaldo was the subject of the most expensive association football transfer[note 3] when he moved from Manchester United to Real Madrid in a transfer worth €94 million (£80 million).  
In Madrid, Ronaldo won 15 trophies, including two La Liga titles, two Copas del Rey, four UEFA Champions League titles, two UEFA Super Cups, and three FIFA Club World Cups. Real Madrid's all-time top goalscorer, Ronaldo scored a record 34 La Liga hat-tricks, including a record-tying eight hat-tricks in the 2014–15 season[note 4] and is the only player to reach 30 goals in six consecutive La Liga seasons. After joining Madrid, Ronaldo finished runner-up for the Ballon d'Or three times, behind Lionel Messi, his perceived career rival, before winning back-to-back Ballons d'Or in 2013 and 2014. After winning the 2016 and 2017 Champions Leagues, Ronaldo secured back-to-back Ballons d'Or again in 2016 and 2017. A historic third consecutive Champions League followed, making Ronaldo the first player to win the trophy five times.[7] In 2018, he signed for Juventus in a transfer worth €100 million, the highest fee ever paid for a player over 30 years old, and the highest ever paid by an Italian club.  A Portuguese international, Ronaldo was named the best Portuguese player of all time by the Portuguese Football Federation in 2015. He made his senior debut for Portugal in 2003 at age 18, and has since had over 150 caps, including appearing and scoring in eight major tournaments, becoming Portugal's most capped player and his country's all-time top goalscorer. He scored his first international goal at Euro 2004 and helped Portugal reach the final. He took over full captaincy in July 2008, leading Portugal to their first-ever triumph in a major tournament by winning Euro 2016, and received the Silver Boot as the second-highest goalscorer of the tournament, before becoming the highest European international goalscorer of all-time.[8] One of the most marketable athletes in the world, he was ranked the world's highest-paid athlete by Forbes in 2016 and 2017, as well as the world's most famous athlete by ESPN in 2016, 2017 and 2018."
        print("Theme Cristiano Ronaldo -> ",get_theme(ldamodel,article))     
        article="Barack Obama Obama standing with his arms folded and smiling 44th President of the United States In office January 20, 2009 – January 20, 2017 Vice President	Joe Biden Preceded by	George W. Bush Succeeded by	Donald Trump United States Senator from Illinois In office January 3, 2005 – November 16, 2008 Preceded by	Peter Fitzgerald Succeeded by	Roland Burris Member of the Illinois Senate from the 13th district In office January 8, 1997 – November 4, 2004 Preceded by	Alice Palmer Succeeded by	Kwame Raoul Personal details Born	Barack Hussein Obama II August 4, 1961 (age 57) Honolulu, Hawaii, U.S. Political party	Democratic Spouse(s)	Michelle Robinson (m. 1992) Children	 MaliaSasha Parents	 Barack Obama Sr. Ann Dunham Relatives	Obama family Education	Occidental College Columbia University (BA) Harvard Law School (JD) Awards	Nobel Peace Prize (2009) Profile in Courage Award (2017) Signature	 Website	 Office of Barack and Michelle Obama Obama Foundation White House Archives President Barack Obama, 2012 portrait crop.jpg	This article is part of  a series about Barack Obama Political positions Electoral history Early life and career Family Public image Pre-presidency Illinois State Senator 2004 DNC keynote address U.S. 
Senator from Illinois 44th President of the United States Presidency Timeline Policies Economy Energy Foreign policy Obama Doctrine Foreign trips Pardons Social Space Appointments Cabinet Judges First term Campaign for the Presidency 2008 general election Primaries Transition 1st inauguration First 100 days Affordable Care Act Iraq Withdrawal Death of Osama bin Laden Timeline: '09 '10 '11 '12 Second term Reelection campaign 2012 general election Reactions 2nd inauguration Immigration executive action Iran deal Cuban thaw Timeline: '13 '14 '15 '16 '17 Post-presidency Planned Library Obama Foundation One America Appeal Dreams from My Father The Audacity of Hope Nobel Peace Prize  vte Part of a series on New Democrats Barack Obama and Bill Clinton Ideology[show] People[hide] Bruce Babbitt Evan Bayh Joe Biden John Carney Tom Carper Lawton Chiles Bill Clinton Hillary Clinton Gerry Connolly Jim Davis Susan Davis Cal Dooley John Edwards Harold Ford Jr. Al From Dick Gephardt Al Gore Bob Graham Jim Himes John Kerry Ron Kind Mary Landrieu Rick Larsen Joe Lieberman Blanche Lincoln Will Marshall Jim Moran Sam Nunn Barack Obama Jared Polis Chuck Robb Timothy J. Roemer Paul Tsongas Allyson Schwartz Adam Smith Organizations[show] A coloured voting box.svg Politics portal vte Barack Hussein Obama II (/bəˈrɑːk huːˈseɪn oʊˈbɑːmə/ (About this sound listen);[1] born August 4, 1961) is an American politician who served as the 44th President of the United States from January 20, 2009, to January 20, 2017. A member of the Democratic Party, he was the first African American to be elected to the presidency and previously served as a United States Senator from Illinois (2005–2008).  Obama was born in 1961 in Honolulu, Hawaii, two years after the territory was admitted to the Union as the 50th state. Raised largely in Hawaii, he also lived for a year of his childhood in the State of Washington and four years in Indonesia. 
After graduating from Columbia University in 1983, he worked as a community organizer in Chicago. In 1988, he enrolled in Harvard Law School, where he was the first black president of the Harvard Law Review. After graduating, he became a civil rights attorney and a professor, teaching constitutional law at the University of Chicago Law School from 1992 to 2004. He represented the 13th district for three terms in the Illinois Senate from 1997 to 2004, when he ran for the U.S. Senate. He received national attention in 2004 with his March primary win, his well-received July Democratic National Convention keynote address, and his landslide November election to the Senate. In 2008, he was nominated for president a year after his campaign began and after a close primary campaign against Hillary Clinton. He was elected over Republican John McCain and was inaugurated on January 20, 2009. Nine months later, he was named the 2009 Nobel Peace Prize laureate, accepting the award with the caveat that he felt there were others  far more deserving of this honor than I .  During his first two years in office, Obama signed many landmark bills into law. The main reforms were the Patient Protection and Affordable Care Act (often referred to as  Obamacare , shortened as the  Affordable Care Act ), the Dodd–Frank Wall Street Reform and Consumer Protection Act, and the Don't Ask, Don't Tell Repeal Act of 2010. The American Recovery and Reinvestment Act of 2009 and Tax Relief, Unemployment Insurance Reauthorization, and Job Creation Act of 2010 served as economic stimulus amidst the Great Recession. After a lengthy debate over the national debt limit, he signed the Budget Control and the American Taxpayer Relief Acts. In foreign policy, he increased U.S. troop levels in Afghanistan, reduced nuclear weapons with the United States–Russia New START treaty, and ended military involvement in the Iraq War. 
He ordered military involvement in Libya in opposition to Muammar Gaddafi; Gaddafi was killed by NATO-assisted forces, and he also ordered the military operation that resulted in the deaths of Osama bin Laden and suspected Yemeni Al-Qaeda operative Anwar al-Awlaki.  After winning re-election by defeating Republican opponent Mitt Romney, Obama was sworn in for a second term in 2013. During this term, he promoted inclusiveness for LGBT Americans. His administration filed briefs that urged the Supreme Court to strike down same-sex marriage bans as unconstitutional (United States v. Windsor and Obergefell v. Hodges); same-sex marriage was fully legalized in 2015 after the Court ruled that a same-sex marriage ban was unconstitutional in Obergefell. He advocated for gun control in response to the Sandy Hook Elementary School shooting, indicating support for a ban on assault weapons, and issued wide-ranging executive actions concerning climate change and immigration. In foreign policy, he ordered military intervention in Iraq in response to gains made by ISIL after the 2011 withdrawal from Iraq, continued the process of ending U.S. combat operations in Afghanistan in 2016, promoted discussions that led to the 2015 Paris Agreement on global climate change, initiated sanctions against Russia following the invasion in Ukraine and again after Russian interference in the 2016 United States elections, brokered a nuclear deal with Iran, and normalized U.S. relations with Cuba. Obama left office in January 2017 with a 60 percent approval rating and currently resides in Washington, D.C. Since then, his presidency has been favorably ranked by historians and the general public.[2][3] He also had a high global approval rating, and the United States' reputation saw a dramatic upward shift during his presidency."
        print("Theme Obama -> ",get_theme(ldamodel,article))        
        article="Jupiter is the fifth planet from the Sun and the largest in the Solar System. It is a giant planet with a mass one-thousandth that of the Sun, but two-and-a-half times that of all the other planets in the Solar System combined. Jupiter and Saturn are gas giants; the other two giant planets, Uranus and Neptune, are ice giants. Jupiter has been known to astronomers since antiquity.[17] The Romans named it after their god Jupiter.[18] When viewed from Earth, Jupiter can reach an apparent magnitude of −2.94, bright enough for its reflected light to cast shadows,[19] and making it on average the third-brightest natural object in the night sky after the Moon and Venus.  Jupiter is primarily composed of hydrogen with a quarter of its mass being helium, though helium comprises only about a tenth of the number of molecules. It may also have a rocky core of heavier elements,[20] but like the other giant planets, Jupiter lacks a well-defined solid surface. Because of its rapid rotation, the planet's shape is that of an oblate spheroid (it has a slight but noticeable bulge around the equator). The outer atmosphere is visibly segregated into several bands at different latitudes, resulting in turbulence and storms along their interacting boundaries. A prominent result is the Great Red Spot, a giant storm that is known to have existed since at least the 17th century when it was first seen by telescope. Surrounding Jupiter is a faint planetary ring system and a powerful magnetosphere. Jupiter has 79 known moons,[21] including the four large Galilean moons discovered by Galileo Galilei in 1610. Ganymede, the largest of these, has a diameter greater than that of the planet Mercury.  Jupiter has been explored on several occasions by robotic spacecraft, most notably during the early Pioneer and Voyager flyby missions and later by the Galileo orbiter. 
In late February 2007, Jupiter was visited by the New Horizons probe, which used Jupiter's gravity to increase its speed and bend its trajectory en route to Pluto. The latest probe to visit the planet is Juno, which entered into orbit around Jupiter on July 4, 2016.[22][23] Future targets for exploration in the Jupiter system include the probable ice-covered liquid ocean of its moon Europa."
        print("Theme Jupiter -> ",get_theme(ldamodel,article))        
        article="Judas Iscariot[a] (died c. 30 – c. 33 AD) was a disciple and one of the original Twelve Disciples of Jesus Christ. According to all four canonical gospels, Judas betrayed Jesus to the Sanhedrin in the Garden of Gethsemane by kissing him and addressing him as  Rabbi  to reveal his identity to the crowd who had come to arrest him.[1] His name is often used synonymously with betrayal or treason. Judas's epithet Iscariot most likely means he came from the village of Kerioth, but this explanation is not universally accepted and many other possibilities have been suggested.  The Gospel of Mark, the earliest gospel, gives no motive for Judas's betrayal, but does present Jesus predicting it in advance at the Last Supper, an event also described in all the later gospels. The Gospel of Matthew 26:15 states that Judas committed the betrayal in exchange for thirty pieces of silver. The Gospel of Luke 22:3 and the Gospel of John 13:27 suggest that he was possessed by Satan. According to Matthew 27:1–10, after learning that Jesus was to be crucified, Judas attempted to return the money he had been paid for his betrayal to the chief priests and committed suicide by hanging. The priests used the money to buy a field to bury strangers in, which was called the  Field of Blood  because it had been bought with blood money. The Book of Acts 1:18 quotes Peter as saying that Judas used the money to buy the field himself and, he  [fell] headlong... burst asunder in the midst, and all his bowels gushed out.  His place among the Twelve Apostles was later filled by Matthias.  Despite his notorious role in the gospel narratives, Judas remains a controversial figure in Christian history. For instance, Judas's betrayal is seen as setting in motion the events that led to Jesus's crucifixion and resurrection, which, according to traditional Christian theology, brought salvation to humanity. 
The Gnostic Gospel of Judas – rejected by the mainstream Church as heretical – praises Judas for his role in triggering humanity's salvation and exalts Judas as the best of the apostles. Since the Middle Ages, Judas has been portrayed as a personification of the Jewish people and his betrayal has been used to justify Christian antisemitism."
        print("Theme Judas -> ",get_theme(ldamodel,article))        
        article="Zürich or Zurich (/ˈzjʊərɪk/ ZEWR-ik; see below for other names) is the largest city in Switzerland and the capital of the canton of Zürich. It is located in north-central Switzerland[3] at the northwestern tip of Lake Zürich. The municipality has approximately 400,028[4] inhabitants, the urban agglomeration 1.315 million[5] and the Zürich metropolitan area 1.83 million.[6] Zürich is a hub for railways, roads, and air traffic. Both Zürich Airport and railway station are the largest and busiest in the country.  Permanently settled for over 2,000 years, Zürich was founded by the Romans, who, in 15 BC, called it Turicum. However, early settlements have been found dating back more than 6,400 years ago.[7] During the Middle Ages, Zürich gained the independent and privileged status of imperial immediacy and, in 1519, became a primary centre of the Protestant Reformation in Europe under the leadership of Huldrych Zwingli.[8]  The official language of Zürich is German,[a] but the main spoken language is the local variant of the Alemannic Swiss German dialect, Zürich German.  Many museums and art galleries can be found in the city, including the Swiss National Museum and the Kunsthaus.[9] Schauspielhaus Zürich is one of the most important theatres in the German-speaking world.[10]  Zürich is a leading global city and among the world's largest financial centres despite having a relatively small population.[11] The city is home to a large number of financial institutions and banking companies. Most of Switzerland's research and development centres are concentrated in Zürich and the low tax rates attract overseas companies to set up their headquarters there.  
Monocle's 2012  Quality of Life Survey  ranked Zürich first on a list of the top 25 cities in the world  to make a base within .[12] According to several surveys from 2006 to 2008, Zürich was named the city with the best quality of life in the world as well as the wealthiest city in Europe in terms of GDP per capita.[13][14][15] The Economist Intelligence Unit's Global Liveability Ranking[16] sees Zürich rank among the top ten most liveable cities in the world."
        print("Theme Zurich -> ",get_theme(ldamodel,article))        
        article="Broadway theatre,[nb 1] commonly known as Broadway, refers to the theatrical performances presented in the 41 professional theatres each with 500 or more seats located in the Theater District and Lincoln Center along Broadway, in Midtown Manhattan, New York City.[1][2] Along with London's West End theatre, Broadway theatre is widely considered to represent the highest level of commercial theatre in the English-speaking world.  The Theater District is a popular tourist attraction in New York City. According to The Broadway League, for the 2017–2018 season (which ended May 27, 2018), total attendance was 13,792,614 and Broadway shows had US$1,697,458,795 in grosses, with attendance up 3.9%, grosses up 17.1%, and playing weeks up 2.8%.[3]  The majority of Broadway shows are musicals. Historian Martin Shefter argues  'Broadway musicals,' culminating in the productions of Richard Rodgers and Oscar Hammerstein, became enormously influential forms of American popular culture  and helped make New York City the cultural capital of the nation.[4]   Contents 1	History 1.1	Early theatre in New York 1.2	Birth of the musical and post-Civil War 1.3	1890s and early 1900s 1.4	1900–1925 1.5	Competing with motion pictures 1.6	Between the wars 1.7	1950–1970 1.8	1980s 2	Description 2.1	Schedule 2.2	Producers and theatre owners 2.3	Personnel 2.4	Runs 2.5	Audience 2.6	Off-Broadway and US tours 2.7	Awards 3	Broadway theatres and current productions 3.1	Upcoming productions 4	References 5	External links History[edit] Early theatre in New York[edit]  Interior of the Park Theatre, built in 1798 New York did not have a significant theatre presence until about 1750, when actor-managers Walter Murray and Thomas Kean established a resident theatre company at the Theatre on Nassau Street, which held about 280 people. 
They presented Shakespeare plays and ballad operas such as The Beggar's Opera.[5] In 1752, William Hallam sent a company of twelve actors from Britain to the colonies with his brother Lewis as their manager. They established a theatre in Williamsburg, Virginia and opened with The Merchant of Venice and The Anatomist. The company moved to New York in the summer of 1753, performing ballad operas and ballad-farces like Damon and Phillida. The Revolutionary War suspended theatre in New York, but thereafter theatre resumed in 1798, the year the 2,000-seat Park Theatre was built on Chatham Street (now called Park Row).[5] The Bowery Theatre opened in 1826,[6] followed by others.  By the 1840s, P.T. Barnum was operating an entertainment complex in lower Manhattan. In 1829, at Broadway and Prince Street, Niblo's Garden opened and soon became one of New York's premiere nightspots. The 3,000-seat theatre presented all sorts of musical and non-musical entertainments. In 1844, Palmo's Opera House opened and presented opera for only four seasons before bankruptcy led to its rebranding as a venue for plays under the name Burton's Theatre. The Astor Opera House opened in 1847. A riot broke out in 1849 when the lower-class patrons of the Bowery objected to what they perceived as snobbery by the upper class audiences at Astor Place:  After the Astor Place Riot of 1849, entertainment in New York City was divided along class lines: opera was chiefly for the upper middle and upper classes, minstrel shows and melodramas for the middle class, variety shows in concert saloons for men of the working class and the slumming middle class. [7]  The plays of William Shakespeare were frequently performed on the Broadway stage during the period, most notably by American actor Edwin Booth who was internationally known for his performance as Hamlet. 
Booth played the role for a famous 100 consecutive performances at the Winter Garden Theatre in 1865 (with the run ending just a few months before Booth's brother John Wilkes Booth assassinated Abraham Lincoln), and would later revive the role at his own Booth's Theatre (which was managed for a time by his brother Junius Brutus Booth, Jr.). Other renowned Shakespeareans who appeared in New York in this era were Henry Irving, Tommaso Salvini, Fanny Davenport, and Charles Fechter."
        print("Theme Broadway -> ",get_theme(ldamodel,article))        
        article="Superman Superman with his cape billowing Art by Alex Ross Publication information Publisher	DC Comics First appearance	Action Comics #1 (cover date June 1938 / published April 18, 1938)[1] Created by	Jerry Siegel (writer) Joe Shuster (artist) In-story information Alter ego	Kal-El (birth name) Clark Kent (adopted name) Species	Kryptonian Place of origin	Krypton Team affiliations	Justice League Legion of Super-Heroes Partnerships	 Supergirl Superboy Superdog (Krypto) Power Girl Abilities	 See list[show] Superman is a fictional superhero created by writer Jerry Siegel and artist Joe Shuster. He first appeared in Action Comics #1, a comic book published on April 18, 1938.[1] He appears regularly in American comic books published by DC Comics, and has been adapted to radio shows, newspaper strips, television shows, movies, and video games.  Superman was born on the planet Krypton, and as a baby named Kal-El, was sent to Earth in a small spaceship by his scientist father Jor-El, moments before Krypton was destroyed in a natural cataclysm. His ship landed in the American countryside, where he was discovered and adopted by Jonathan and Martha Kent, a farming couple. They named him Clark. Clark displayed various superhuman abilities such as incredible strength and impervious skin, and his foster parents advised him to use his gifts for the benefit of humanity. Clark Kent resides in the fictional American city of Metropolis, where he works as a journalist for the Daily Planet, a newspaper. To protect his privacy, he changes into a colorful costume and uses the alias  Superman  when fighting crime. Superman's love interest is his fellow journalist Lois Lane, and his classic archenemy is the genius inventor Lex Luthor. He is a friend of many other superheroes in the DC Universe, such as Batman and Wonder Woman.  Superman is widely considered a cultural icon of the United States.[2][3][4][5] Superman popularized the superhero genre and defined its conventions. 
He is to this day one of the most lucrative superhero franchises."
        print("Theme Superman -> ",get_theme(ldamodel,article))                
        article="Ms. Buchanan:  Attached for your consideration is a draft of a Master Firm Purchase/Sale  Agreement between Enron North America Corp. and TrizecHahn.  Upon your  review, please give me a call to discuss any issues or questions you may have  regarding this matter.     Regards, Debra Perlingiere Enron North America Corp. Legal Department 1400 Smith Street, EB 3885 Houston, Texas 77002 dperlin@enron.com Phone 713-853-7658 Fax  713-646-3490 "
        print("ENRON email about Agreement between companies -> ",get_theme(ldamodel,article))        
        article="Hi Santi,  There is a letter for you which arrived today. I can leave it on your desk or send it to your home address. It might be from a bank, insurance etc.  If you rather have it sent home, could you please give me your address?  Cheers, Stanija"
        print("My email, personal -> ",get_theme(ldamodel,article))        
        article="Dear all    I’m pleased to share with you a newer version of my paper: The faint end of the z ~ 3-7 LAE LF behind lensing clusters with MUSE. We intend to submit this version of the paper to A&A by next Monday if there are no major comments. If you have any comments, please send them before the week-end (so before October 13-th ).   The mains modifications with respect to the previous version are :         We  added one figure with only our LF points (Fig. 11) to have a better view of the evolution of the faint end shape of our LFs.       We estimated more realistic error bars for the parameters recovered from the Schechter fits. This estimation is based on the tests made with different luminosity binnings. Using these new error bars, we re-computed the SFRD points. This did not change any of our previous conclusions.     We added one figure in the appendix to provide a global overview of the volume computation with all the steps shortly described.     Correction of some typos + clarifications on some of the technical parts.    This new version of the paper is available on the Wiki here and on a dropbox here.   Best regards, Geoffroy and Roser        On 06-08-2018 16:46, Geoffroy de la Vieuville wrote: > > Dear all,  > > I am pleased to share with you the second draft of my paper on the LAE luminosity function in lensing clusters. Thanks to all of you who have provided helpful comment on the previous version, most of them were accounted for in this new version. To those of you who still have not received an individual answer to their comments, i will do it as soon as i get back from holidays. > >   > > In agreement with the comments we received,  the most significant changes are as follows :  > >     We did some test on the fit of the LFs, with different luminosity bins >     We made Sect. 
6.1.1  more clear >     We softened our conclusion on the relative contribution of LAE to reionization > >   > > The paper is available on the wiki, and also on a dropbox here.  > > Comments and coautorship requests are expected by  August 31th .  Thanks to all the coauthors for checking their name and affiliation ! I hope i did not forget anyone in the list, and if so, do not hesitate to tell me. > > Best, > > Geoffroy and Roser >          _______________________________________________ Musescience mailing list Musescience@lists.musemail.de http://lists.musemail.de/mailman/listinfo/musescience  "
        print("My email, Research in astronomy -> ",get_theme(ldamodel,article))        
        article="  ISG News has posted a new item, 'Groupware migration'  On Tuesday, October 2, starting at 07:00, we will migrate our groupware instance to another server. For about 1 hour you won't have access to your calendar. If you're one of the few people who also sync their email via groupware, mail will be offline too (you can always use webmail). After the migration your clients should just reconnect and resume syncing. If you notice any issues after we're done, please get in touch.  You may view the latest post at https://nic.phys.ethz.ch/news/2018/09/27/groupware-migration/  You received this e-mail because you asked to be notified when new updates are posted. Best regards, Christian Herzog daduke@phys.ethz.ch  "
        print("My email, IT -> ",get_theme(ldamodel,article))        
        article="Your license for the use of Microsoft Office 2011 Mac EN [OM12530] will expire soon!  Dear Erroz Ferrer Santiago,  Your license for the use of Microsoft Office 2011 Mac EN is going to expire on 26.09.2018 02:00!  Please renew (free software) or reorder (software that requires payment) the license in IT Shop if you plan to continue using the software otherwise you will have to uninstall the software once the license expires. For older software versions that are not avilable in IT Shop anymore just order the latest version and continue using the installed version.  Information regarding the actual status of the license status can be found in IT Shop → My Software → Order Info.  Best regards, IT Shop Team "
        print("My email, IT -> ",get_theme(ldamodel,article))        
        article="We find cheating girls in your area that you want to fuck. Warning! not a dating site. Continue if you want to fuck"
        print("My email, spam -> ",get_theme(ldamodel,article))        
        article="Want me? wanna me? Ohhhh.... ok, come to me )) Here my foto and address, find me :)  "
        print("My email, spam -> ",get_theme(ldamodel,article))        
        article="Dear Mr Erroz Ferrer  Attached please find your Mobility invoice as a PDF file.  Please find below the important details:  Amount:CHF 70.00 Payment method:Registered credit card  Please visit our customer portal to find and download a detailed breakdown of your invoice in the form of a CSV file.  Please call our 24h Service Center 0848 824 812 if you have any questions.  We wish you a nice journey.  Yours sincerely"
        print("My email, economy -> ",get_theme(ldamodel,article))       
        article="Message-ID: <12691783.1075863608825.JavaMail.evans@thyme>Date: Mon, 7 Jan 2002 02:23:16 -0800 (PST)From: no-reply@mail.southwest.comTo: diana.scholtes@enron.comSubject: Ticketless Travel Passenger ItineraryMime-Version: 1.0Content-Type: text/plain; charset=us-asciiContent-Transfer-Encoding: 7bitX-From: Southwest Airlines <no-reply@mail.southwest.com>X-To: Scholtes, Diana </O=ENRON/OU=NA/CN=RECIPIENTS/CN=DSCHOLT>X-cc: X-bcc: X-Folder: \ExMerge - Scholtes, Diana\TravelX-Origin: SCHOLTES-DX-FileName: ************ !!! IMPORTANT NOTICE !!! ************** BRING A COPY OF THIS ITINERARY WITH YOU TO   **** THE AIRPORT FOR FLIGHT CHECK-IN.             **** For important information needed prior* to traveling, including information on transporting* wrapped packages, please visit:* http://www.southwest.com/travel_center/travelAdvisory.html***************************************************This e-mail contains Southwest Airlines Ticketless Travel information and is being sent to you at the request of the Purchaser, Passenger, or individual responsible for arranging this air travel. ************ CONFIRMATION NUMBER ************* M2ABTEImportant: Please provide confirmation number and positive I.D. at the gate 1 hour prior to scheduled departure to receive a boarding pass for your flight. Snack Service:  If your total flight itinerary includes a seriesof flights that each are less than two hours in duration, youwill be served peanuts/raisins on each flight segment.  If your itinerary includes any nonstop flight longer than twohours, you will be served a packaged snack on that flightsegment.  
Southwest Airlines does not serve sandwiches or meals, however, you may bring something to eat on board.***************** PASSENGER(S) ********************* WILLIAM SCHOLTES***************** ITINERARY ************************** Sunday, January 13 - Portland(PDX) to Sacramento(SMF)Flight 1709Depart Portland(PDX) at 08:55 AM andarrive in Sacramento(SMF) at 10:20 AMMonday, January 14 - Sacramento(SMF) to Portland(PDX)Flight 822Depart Sacramento(SMF) at 02:15 PM andarrive in Portland(PDX) at 03:40 PMThank you for purchasing Southwest Airlines Ticketless Travel. For questions or changes concerning your reservation, call 1-800-I-FLY-SWA (1-800-435-9792). This is an itinerary only and is not considered a receipt. *See Southwest Airlines Checkin Requirements, Refund Information, and Conditions of Contract below. *************** CLICK 'N SAVE ************************ Subscribe at www.southwest.com/email to receive emailnotice of the lowest one-way fares available only on southwest.com  ********** CHECKIN REQUIREMENTS ***************Southwest Airlines Ticketless Travel is nontransferable. Positive identification is required at time of check-in. Customer Check-in Requirement - Customers who do not claim their reservations at the departure gate desk at least ten (10) minutes before scheduled departure time will have their reserved space canceled and will not be eligible for denied boarding compensation.  ********* REFUND INFORMATION ****************** Any change to this itinerary may result in a fare increase. If you do not travel on this itinerary, you may qualify for a refund or an exchange. To make application for a refund of any unused air fare, please write: Southwest Airlines Refunds Department 6RF, P.O. Box 36611, Dallas, TX 75235-1611. Refund requests must include your confirmation number, date of travel and flight number, and all credit card billing information including the amount and purchase reference numbers. 
*********** CONDITIONS OF CONTRACT *********** Southwest Airlines Co. - Notice of Incorporated Terms - This notice is part of the Conditions of Contract. Air transportation by Southwest Airlines is subject to Southwest Airlines' Passenger Contract of Carriage, the terms of which are herein incorporated by reference. Incorporated terms include, but are not restricted to: (1) Limits on liability for baggage, including fragile or perishable goods, and availability of excess valuation coverage. Baggage liability is limited to $2,500 per Customer unless you purchase excess valuation liability coverage. Exception: Carrier will not be responsible for money, jewelry, cameras, video and electronic equipment, including computers, silverware, negotiable papers, securities, business documents, samples, items intended for sale, paintings, antiques, artifacts, manuscripts, furs, irreplacable books or publications, and similar valuables contained in checked or unchecked baggage. (2) Claims restrictions, including time periods in which Customers must file a claim or sue Southwest. (3) Our rights to change terms of the Contract. (4) Rules on reservations, check-in times, refusal to carry and smoking. (5) Our rights and limits of liability for delay or failure to perform service, including schedule changes, substitution of alternate air carriers or aircraft and rerouting. (6) Airline flights may be overbooked. If we deny you boarding due to an oversale and you have checked in at the gate at least 10 minutes before scheduled departure, with few exceptions, we compensate you. (7) Southwest reserves the right to refuse carriage to any person who is not able to produce positive identification. You may inspect the Contract of Carriage, or obtain a copy by sending a request to: Southwest Airlines Co. VP of Customer Relations, P.O. Box 36647, Love Field, Dallas, TX 75235-1647. 
**************** PRIVACY POLICY ****************** Read about Southwest Airlines' privacy policy atwww.southwest.com/traveler_info/privacy_policy.html . Should you wish to forward or distribute this message to others, please do so only with the express permission of the passenger(s) traveling. If you are not an intended recipient or if you have received this message in error, please promptly delete this message. Thank you for your cooperation and consideration. This is a post only mailing from Southwest Airlinesregarding your requested itinerary.Please do not attempt to respond to this message."
        print("ENRON email, Travel -> ",get_theme(ldamodel,article))        
        article="Jim, I spoke with Seab briefly on the current budget.  As you are aware, the initial budget scope for the CA proceeding was rather limited, and a significant portion of that budget was a catch all- actually relating to the PAC NW proceeding and various trips to Houston and DC.  Watkiss sees a more expanded role developing for Seab in the CA proceeding, given the issues and options that are arising.  I have asked Seab to give us a breakdown of where we are right now on the current budget.  I plan to meet with Seab and Watkiss next Wednesday to discuss scope of services further, and will get back to you.  Ray"
        print("ENRON email, Trade/Business -> ",get_theme(ldamodel,article))        
        article="Happy Halloween!!!"
        print("ENRON email, personal -> ",get_theme(ldamodel,article))        
        article="i would categorize things in the following manner: day ahead markets we want bid information (e.g., ancillary service markets, transmission markets, generation adjustment bids, load adjustment bids, import adjustment bids, export adjustment bids), initial preferred schedules, final schedules.hour ahead markets we want the same thing.real time market we want bid information, beep dispatch instructions.after the fact we want actual meter information to get actual unit by unit production and load zone consumption.per bob badeer, we want to see transmission information broken down between new firm use and existing transmission contracts.  we need adjustment bids, day ahead schedules, hour ahead schedules, real time adjustments, actual flows, and transmission availability by category of ownership (etcs, nfu).  we want this for all transmission lines.unit outage information.  planned outages and forced outages.rmr calls."
        print("ENRON email, marketing -> ",get_theme(ldamodel,article))        
        article="The Enron Center Garage has opened.Employees who work for business units that are scheduled to move to the new building and currently park in the Allen Center and Metropolitan garage are being offered a parking space in the new Enron Center garage.This is the only offer you will receive during the initial migration to the new garage. Spaces will be filled on a first come first served basis. The cost for the new garage will be the same as Allen Center garage which is currently $165.00 per month, less the company subsidy, leaving a monthly employee cost of $94.00.If you choose not to accept this offer at this time, you may add your name to the Enron Center garage waiting list at a later day and offers will be made as spaces become available. The Saturn Sky Ring  that connects the garage and both buildings will not be opened until summer 2001. All initial parkers will have to use the street level entrance to Enron Center North until Saturn Sky Ring access is available. Garage stairways next to the elevator lobbies at each floor may be used as an exit in the event of elevator trouble. If you are interested in accepting this offer, please reply via email to Parking and Transportation as soon as you reach a decision. Following your email, arrangements will be made for you to turn in your old parking card and receive a parking transponder along with a new information packet for the new garage.The Parking and Transportation desk may be reached via email at Parking and Transportation/Corp/Enron or 713-853-7060  with any questions. (You may enter & exit on Clay St. and Bell St., also pedestrians, will have to use the garage stairwell located on the corner of Bell & Smith.)Please respond via e-mail if you are interested in acquiring one of these spaces."
        print("ENRON email, parking -> ",get_theme(ldamodel,article))



## 5) Main function

In [6]:
if __name__ == '__main__':
    # Driver for the notebook. Three independent switches select which stages
    # run: rebuilding the training CSV from the split database, training the
    # classifier, and checking the result on the example emails.
    prepare_database = False
    train_model = True
    check_result = True

    # Text-cleaning resources, bound at module level.
    # NOTE(review): presumably read as globals by the cleaning helpers defined
    # earlier in the notebook -- confirm before renaming or moving them.
    # `stoplist` holds extra words that are merged with NLTK's English stop
    # words to form `stop`.
    stoplist = 'also use make people know many call include part find become like mean often different usually take with come give well get since type list say change see refer actually iii kinds ask would way something need things want every str =09 0909 image'.split(' ')
    stop = set(stopwords.words('english')).union(stoplist)
    exclude = set(string.punctuation)
    lemma = WordNetLemmatizer()

    if prepare_database:
        print('Preparing the training dataset')
        outfile = '../data/databases/all_topics_clean.csv'
        prepare_training(outfile)

    if train_model:
        t_start = time.time()

        vectors = 'tfidf'         # choose from: tfidf,word2vec
        classifier = 'linearSVC'  # choose from: nb,linearSVC,logit

        run_models(vectors, classifier)
        t_end = time.time()

        # Elapsed wall-clock time is reported in minutes.
        print("Finished! Time elapsed: %.2f [minutes]"
              % ((t_end - t_start) / 60.))

    if check_result:
        # Run the example emails through the model stored at `model_path`.
        model_path = '../data/analysis/wikipedia/all_wiki/lda_model.pkl'
        run_examples(model_path)



        