In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import string
import numpy as np
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.data.path.append("../local_packages/nltk_data")

In [None]:
# load
tenders = pd.read_csv("../data/uk_tenders_cleaned.csv")


In [None]:
tenders.head()

### Clean up

In [None]:
# can we extract anything helpful from tender title?

tenders["title_lower"] = tenders['tender_title'].str.lower()

def remove_punct(ptext):
    # replace any punctuation with nothing "", effectively removing it
    ptext = re.sub(string=ptext,
                   pattern="[{}]".format(string.punctuation), 
                   repl="")
    return ptext

# TODO this doesn't parse out all our "[]" chars


In [None]:
# remove empty titles
tenders["title_lower"].replace('', np.nan, inplace=True)
tenders.dropna(subset=['title_lower'], inplace=True)


In [None]:
# apply removing punctuation function to all elements in the column "abstract"
tenders['title_processed'] = tenders['title_lower'].apply(remove_punct)

### Tokenize

In [None]:
# tokenize
tenders['title_tokens'] = tenders['title_processed'].apply(nltk.word_tokenize)

In [None]:
### Stops

In [None]:
### Stop words
stop_words = set(stopwords.words('english'))

# Define a function to remove stopwords from list of tokens
def clean_stopwords(tokens):
    # define stopwords
    stop_words = set(stopwords.words('english'))
    # add bespoke for this
    stop_words.add("amp")
    stop_words.add("test")
    # loop through each token and if the word isn't in the set 
    # of stopwords keep it
    return [item for item in tokens if item not in stop_words]
tenders['tokens_no_stops'] = tenders['title_tokens'].apply(clean_stopwords)
tenders.tokens_no_stops.head(35)

In [None]:
### Stem

In [None]:
def stemming(ptoken):
    # create stemming object
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in ptoken]  

In [None]:
tenders["title_stemmed"] = tenders["tokens_no_stops"].apply(stemming)

### Lemmatize

In [None]:
# Define the lemmatise() function

def lemmatise(ptokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in ptokens]

In [None]:
tenders["title_lemmatised"] = tenders["tokens_no_stops"].apply(lemmatise)

In [None]:
tenders.title_lemmatised.head(10)

### Results

In [None]:
from collections import Counter

In [None]:
tenders.tokens_no_stops = tenders.tokens_no_stops.astype("string")

In [None]:
tenders.title_lemmatised = tenders.title_lemmatised.astype("string")
tenders.title_stemmed = tenders.title_stemmed.astype("string")

In [None]:

results = Counter()
tenders["title_lemmatised"].str.split("'").apply(results.update)
#tenders.title_stemmed.str.split("'").apply(results.update)


In [None]:
print("Tokens:", sum(dict(results).values()))


In [None]:
results.most_common()

In [None]:
# separate out the tokens and counts into lists
tokens, counts = zip(*results.most_common())

In [None]:
def plotall(px, py):
    
    plt.xticks(fontsize=12, rotation=90)
    plt.ylabel('Frequency')
    plt.xlabel("Tokens")
    plt.bar(px, py)
    plt.show()

In [None]:
plotall(tokens[3:30], counts[3:30])

## wordcloud

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [None]:
# Join all the token data as string
#tenders['tokens_lem'] = tenders['tokens_no_stops'].apply(return_tostring)
text = " ".join(tenders['title_lemmatised'])
text = text.replace("'", "")
 

# The text string is then passed to the wordcloud function:
wordcloud = WordCloud(max_font_size=50, 
                      max_words=100, 
                      background_color="white").generate(text)

# Display the generated image:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()


In [185]:
plt.savefig("uk_tender_titles_world.png")

<Figure size 640x480 with 0 Axes>