# Imports Always First

In [125]:
import nltk
import pandas as pd
import os
import numpy as np
from encodings.aliases import aliases
import chardet
import re
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.model_selection import train_test_split

# Download Necessary Files

In [15]:
# Fetch the NLTK resources this notebook relies on: 'punkt' backs
# word_tokenize and 'wordnet' backs WordNetLemmatizer.
nltk_resources = ['punkt', 'wordnet']
nltk.download(nltk_resources)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sammy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sammy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Figure Eight Data

This data was provided by Figure Eight under an Attribution 4.0 International (CC BY 4.0) license:
    https://creativecommons.org/licenses/by/4.0/
    
The data can be retrieved here: https://www.figure-eight.com/data-for-everyone/

In [14]:
# Locate the dataset in the working directory.
# - Match on the file extension so names such as 'data.csv.bak' are not
#   picked up by accident.
# - Sort the candidates because os.listdir() returns entries in arbitrary
#   order; sorting makes the chosen file reproducible.
csv_candidates = sorted(
    name for name in os.listdir() if name.lower().endswith('.csv')
)
if not csv_candidates:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(
        f"No .csv file found in {os.getcwd()!r}; download the dataset first."
    )
csv_filename = csv_candidates[0]
print(csv_filename)

Corporate-messaging-DFE.csv


# Extract Data

## Using Chardet

In [37]:
# Ask chardet to guess the encoding from the file's raw bytes.
# The file is opened in binary mode so no decoding happens before detection.
with open('Corporate-messaging-DFE.csv', 'rb') as raw_file:
    raw_bytes = raw_file.read()

encoding_dict = chardet.detect(raw_bytes)
print(encoding_dict)

{'encoding': 'Windows-1254', 'confidence': 0.456508447473103, 'language': 'Turkish'}


## Looping Encoding Aliases

In [50]:
def extract_enc_data(document_name):
    """Read a CSV of unknown encoding by trying Python's codecs in turn.

    Every canonical codec name listed in ``encodings.aliases`` is tried, in
    alphabetical order, until ``pd.read_csv`` succeeds. The name of the
    winning codec is printed.

    Parameters
    ----------
    document_name : str or path-like
        Path of the CSV file to load.

    Returns
    -------
    pandas.DataFrame
        The frame produced by the first encoding that reads without error.
        NOTE: "reads without error" does not guarantee the text was decoded
        correctly -- inspect the result for garbled characters.

    Raises
    ------
    OSError
        If the file cannot be opened (for example ``FileNotFoundError``);
        no choice of encoding can fix that, so it is not swallowed.
    ValueError
        If none of the candidate encodings can read the file.
    """
    last_error = None
    # Sorted so the winning encoding is the same on every run; iterating the
    # bare set would depend on string-hash ordering, which varies per process.
    for encoder in sorted(set(aliases.values())):
        try:
            data = pd.read_csv(document_name, encoding=encoder)
        except (LookupError, ValueError) as error:
            # LookupError: codec missing on this platform or not a text codec.
            # ValueError: covers UnicodeError plus pandas' ParserError and
            # EmptyDataError, i.e. "this encoding did not yield a valid CSV".
            last_error = error
            continue
        print(f"The first succesful encoding was {encoder}.")
        return data
    raise ValueError(
        f"No known encoding could read {document_name!r}."
    ) from last_error

In [53]:
# Load the corporate-messaging CSV via extract_enc_data, which returns the
# result of pd.read_csv; `df` is the raw frame used by the cells below.
df = extract_enc_data('Corporate-messaging-DFE.csv')

# Preparing Data

## Extracting Labels and Messages

In [105]:
# Keep only rows whose label confidence is exactly 1 and whose category is
# not the 'Exclude' bucket.
is_confident = df['category:confidence'] == 1
is_included = df['category'] != 'Exclude'
df = df[is_confident & is_included]

# Features: lower-cased message text. Targets: the category label.
X = df['text'].str.lower().values
y = df['category'].values

X, y

(array(['barclays ceo stresses the importance of regulatory and cultural reform in financial services at brussels conference  http://t.co/ge9lp7hpyg',
        'barclays announces result of rights issue http://t.co/lbiqqh3wwg',
        'barclays publishes its prospectus for its õú5.8bn rights issue: http://t.co/yzk24ie8g6',
        ...,
        'we╠óëô┬ëòóre working hard to do all we can to promote healthier lifestyles and diets for kids http://t.co/hw8oihymai',
        'yesterday, these #healthykids lit up broadway with #nestle, @iaaforg and some sporting stars: http://t.co/ydtbj60ofz',
        'z bhutta: problems with food&amp;land systems include land acquistion, commodity speculation affecting food prices&amp;lack of discussion #nins2013'],
       dtype=object),
 array(['Information', 'Information', 'Information', ..., 'Information',
        'Information', 'Information'], dtype=object))

## Extracting URLs from Text

In [106]:
# Pattern for http/https URLs: the scheme followed by one or more letters,
# digits, listed punctuation characters, or %XX percent-escapes.
# Written as a raw string so the regex escapes `\(` and `\)` are not treated
# as (invalid) Python string escapes, which triggers a warning on import.
# NOTE(review): inside the class `[$-_@.&+]`, `$-_` is a character RANGE
# ('$' through '_'), so it admits more punctuation than the listed symbols.
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

In [123]:
def extract_url(message):
    """Return every substring of ``message`` that matches ``url_regex``.

    The result is the list built by ``re.findall``: matches in order of
    appearance, or an empty list when the text contains none.
    """
    return re.findall(url_regex, message)

## Tokenizing and Lemmatizing

In [127]:
# Preview the cleaning pipeline on the first few messages only.
preview_count = 5

# Build the lemmatizer once; it does not depend on the message, so
# re-creating it on every iteration was wasted work.
lemmatizer = WordNetLemmatizer()

for message_original in X[:preview_count]:
    message = message_original

    # Swap each detected URL for a fixed token so links do not end up as
    # distinct vocabulary entries.
    for url in extract_url(message):
        message = message.replace(url, 'placeholder')

    # Tokenizing
    tokens = word_tokenize(message)

    # Lemmatizing (strip() drops any whitespace around a lemma)
    clean_tokens = [lemmatizer.lemmatize(token).strip() for token in tokens]

    print(message_original)
    print(clean_tokens, '\n')

barclays ceo stresses the importance of regulatory and cultural reform in financial services at brussels conference  http://t.co/ge9lp7hpyg
['barclays', 'ceo', 'stress', 'the', 'importance', 'of', 'regulatory', 'and', 'cultural', 'reform', 'in', 'financial', 'service', 'at', 'brussels', 'conference', 'placeholder'] 

barclays announces result of rights issue http://t.co/lbiqqh3wwg
['barclays', 'announces', 'result', 'of', 'right', 'issue', 'placeholder'] 

barclays publishes its prospectus for its õú5.8bn rights issue: http://t.co/yzk24ie8g6
['barclays', 'publishes', 'it', 'prospectus', 'for', 'it', 'õú5.8bn', 'right', 'issue', ':', 'placeholder'] 

barclays group finance director chris lucas is to step down at the end of the week due to ill health http://t.co/nkuhoafnsd
['barclays', 'group', 'finance', 'director', 'chris', 'lucas', 'is', 'to', 'step', 'down', 'at', 'the', 'end', 'of', 'the', 'week', 'due', 'to', 'ill', 'health', 'placeholder'] 

barclays announces that irene mcdermott

# Machine Learning Portion