# IST 736 Text Mining
### Web Scrape NFL Active Players List


Ryan Timbrook (RTIMBROO)  
DATE:10/18/2019<br>
Topic: <br>


## 1. Learning Objective
_____________________________________________________________________________________________
In this assignment, you have the freedom to find an existing text corpus, or create a new text corpus of your interest. If you are creating a new corpus, make sure it is not too large and thus too time-consuming to create.  
 
Then you will vectorize the text corpus using any tool that you are comfortable with: Weka, R, Python, etc. Explain the decisions you made during the vectorization process, e.g., did you merge lower- and uppercase? Then you will explore the text vectors and see if you can find anything interesting. The lectures showed some examples of comparative analysis and trend analysis. But you have the freedom to define what would be interesting patterns as long as you can explain it in a sensible way.


In [1]:
# toggle for working with colab
isColab = False

In [None]:
#*ONLY RUN WHEN WORKING ON COLAB*
#===================================================
# mount google drive for working in colab

#from google.colab import drive
#drive.mount('/content/gdrive', force_remount=True)

# working within colab, set base working directory
#base_dir = "./gdrive/My Drive/IST707_PRJ_Realestate/buy_rent_sell/"

# validate directory mapping
#ls f'{base_dir}'

# upload custome python files
#from google.colab import files
#uploaded_files = files.upload()

# print files uploaded
#for f in uploaded_files.keys():
#  print(f'file name: {f}')

#isColab = True

______________________________________________________________________________________________
### Coding Environment Setup
Import packages

In [2]:
# import packages for analysis and modeling
import pandas as pd #data frame operations
import numpy as np #arrays and math functions
import matplotlib.pyplot as plt #2D plotting
%matplotlib inline
import seaborn as sns #
import requests
import os
import io
import string
import urllib
from bs4 import BeautifulSoup
import pprint
import lxml.html
from datetime import date
from datetime import time
from datetime import datetime

In [3]:
# packages for twitter
import tweepy as tw
import codecs
from tweepy import OAuthHandler
import json
from tweepy import Stream
from tweepy.streaming import StreamListener

# packages for NLTK
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import re

import sys
from os import path

In [None]:
# nltk downloads
nltk.download('punkt')

In [None]:
# custome python packages
import rtimbroo_utils as br             # custome python helper functions

In [None]:
# set global properties used to name outputs and configure logging
notebook_file_name = 'text_mine_nfl_players_list'
report_file_name = 'Text Mine Web Crawl'
app_name = 'Web Crawl NFL Players List'
log_level = 10 # 10-DEBUG, 20-INFO, 30-WARNING, 40-ERROR, 50-CRITICAL

# setup working directory structure
# set global properties
# local runs use paths relative to the current working directory
if not isColab:
    dataDir = './data'
    outputDir = './output'
    configDir = './config'
    logOutDir = './logs'
    imageDir = './images'
    modelDir = './models'
    corpusDir = './corpus'
else:
    # working within colab: every directory is prefixed with base_dir
    # NOTE(review): base_dir is not assigned in this cell; it only appears in the
    # commented-out colab setup cell, which must be enabled and run first
    dataDir = f'{base_dir}data'
    outputDir = f'{base_dir}output'
    configDir = f'{base_dir}config'
    logOutDir = f'{base_dir}logs'
    imageDir = f'{base_dir}images'
    modelDir = f'{base_dir}models'
    corpusDir = f'{base_dir}corpus'

In [None]:
# create base output directories if they don't exist
# os.makedirs(..., exist_ok=True) also creates missing parent directories and is a
# no-op when the directory is already there, so no separate existence check is needed
for _dir in (outputDir, logOutDir, imageDir, modelDir, dataDir, configDir, corpusDir):
    os.makedirs(_dir, exist_ok=True)

In [None]:
# get a logger for troubleshooting / data exploration
logger = br.getFileLogger(logOutDir+'/',app_name,level=log_level)
np.random.seed(42) # NumPy

### Compile Classes and Local Functions

In [None]:
class Team(object):
    """Container for one NFL team's scraped roster and text data.

    Attributes:
        team_name: name passed to the constructor.
        roster_year: roster year, '' until ``set_roster_year`` is called.
        roster_players: list of players added via ``set_roster_players``.
        team_roster_player_stats: ``{date: {date: stats}}`` where ``date`` is
            the UTC date (YYYY-MM-DD) on which the stats were recorded.
        team_text: ``{source: {topic: {timestamp: text}}}``.
        player_text: ``{topic: {timestamp: text}}``.

    Timestamps are naive UTC ISO-8601 strings from ``datetime.utcnow()``.
    """
    
    def __init__(self,team_name):
        self.team_name = team_name
        self.roster_year = ''
        self.roster_players = []
        self.team_roster_player_stats = {}
        self.team_text = {}
        self.player_text = {}
            
    def set_roster_year(self,year):
        """Store the roster year."""
        self.roster_year = year
        
    def set_roster_players(self,player):
        """Append a single player to the roster list."""
        self.roster_players.append(player)
        
    def set_team_text(self,source,text_topic,text):
        """Record ``text`` under ``source`` / ``text_topic`` keyed by the current UTC timestamp."""
        now = datetime.utcnow().isoformat()
        logger.info(f'set_team_text: source:[{source}] | text_topic: [{text_topic}] | text: {text}')
        
        if source in self.team_text.keys():
            logger.info(f'source: {source} is in team_tex: {self.team_text}')
            
            if text_topic in self.team_text[source].keys():
                logger.info(f'text_topic: {text_topic} is in team_text: {self.team_text}')
                self.team_text[source][text_topic].update({now:text})
            else:
                logger.info(f'text_topic: {text_topic} not in team_text: {self.team_text}')
                self.team_text[source][text_topic] = {now:text}
        else:
            logger.info(f'source: {source} not in team_text: {self.team_text}')
            self.team_text[source] = {text_topic:{now:text}}
        
    def set_player_text(self,text_topic,text):
        """Record ``text`` under ``text_topic`` keyed by the current UTC timestamp."""
        now = datetime.utcnow().isoformat()
        if text_topic in self.player_text:
            self.player_text[text_topic].update({now:text})
        else:
            self.player_text[text_topic] = {now:text}
            
    def set_team_roster_players_stats(self,stats):
        """Record ``stats`` under the current UTC date (YYYY-MM-DD).

        The date string is used as both the outer and the inner key, so a
        second call on the same day replaces that day's stats.
        """
        # a plain date string is hashable; the list returned by re.findall is not
        key = datetime.utcnow().date().isoformat()
        self.team_roster_player_stats.setdefault(key,{})[key] = stats
    
    def get_team_name(self):
        """Return the team name."""
        return self.team_name
    
    def get_roster_year(self):
        """Return the roster year."""
        return self.roster_year
    
    def get_roster_players(self):
        """Return the list of roster players."""
        return self.roster_players
    
    def get_team_text_by_topic(self,topic):
        """Return a list with the topic dictionary of every source that contains ``topic``.

        Each list element is the whole ``{topic: {timestamp: text}}`` mapping
        of a matching source, not just the entry for ``topic``.
        """
        team_text = []
        logger.info(f'get_team_text_by_topic: topic:[{topic}]')
        for s,t in self.team_text.items():
            logger.info(f'get_team_text_by_topic: key:[{s}] | value:[{t}]')
            if topic in self.team_text[s].keys():
                logger.info(f'get_team_text_by_topic: topic:[{s}] is in {self.team_text[s].keys()}' )
                team_text.append(self.team_text[s])
        
        return team_text
    
    def get_team_text_by_source(self,source):
        """Return ``{topic: {timestamp: text}}`` for ``source``; raises KeyError if unknown."""
        return self.team_text[source]
    
    def get_player_text_by_topic(self,topic):
        """Return ``{timestamp: text}`` for ``topic``; raises KeyError if unknown."""
        return self.player_text[topic]
    
    def get_team_roster_players_stats(self):
        """Return the recorded roster player stats dictionary."""
        return self.team_roster_player_stats
            

In [None]:
# team 
def create_team(parser):
    """Build a ``Team`` from the <title> of a roster page.

    The title is stripped, the word 'Roster' is removed, the leading run of
    digits is taken as the roster year, and the remainder is lower-cased with
    spaces and hyphens turned into underscores. For example the title
    '2019 Atlanta Falcons Roster' gives team 'atlanta_falcons', year '2019'.

    Args:
        parser: object exposing ``title.string`` (the page title text).

    Returns:
        A ``Team`` with its name and roster year set.

    Raises:
        IndexError: if the title does not start with digits.
    """
    # get team name and year
    page_title = parser.title.string.strip()
    team = page_title.replace('Roster','')
    team = team.strip()
    # raw string: '\d' inside a plain literal is an invalid escape sequence
    year = re.findall(r'^\d+',team)
    year = year[0]
    team = team.replace(year,'').lstrip().lower()
    team = team.replace(' ','_')
    team = team.replace('-','_')
    
    logger.info(f'[{team}]')
    
    # instantiate the nfl team object
    new_team = Team(team)
    new_team.set_roster_year(year)
    
    return new_team

In [None]:
# get team text - anything on the roster page
# scrape team text
def get_team_text(parser):
    """Return the text content of every <p> element found by ``parser``.

    The result of ``parser.find_all('p')`` is converted to its string form,
    re-parsed with ``lxml.html`` and reduced to plain text.
    """
    paragraphs = parser.find_all('p')
    markup = str(paragraphs)
    return lxml.html.fromstring(markup).text_content()

In [None]:
# player search
def player_search(parser):
    """Return the distinct player names found in the page's <span> elements.

    A span counts as a player name when its markup matches the
    'player-name-col-lg' pattern; the name is the span's text content.
    Names are de-duplicated through a set, so the order of the returned
    list is not defined.
    """
    name_marker = re.compile('^.+(player-name-col-lg).+')
    unique_names = {
        lxml.html.fromstring(str(candidate)).text_content()
        for candidate in parser.find_all('span')
        if name_marker.match(str(candidate))
    }
    return list(unique_names)

In [None]:
def replace_str_index(text,index=0,replacement=''):
    """Return ``text`` with the single item at ``index`` swapped for ``replacement``.

    Everything before ``index`` and everything after ``index`` is kept; the
    three pieces are converted with ``str`` and concatenated.
    """
    head = text[:index]
    tail = text[index+1:]
    return ''.join(str(piece) for piece in (head, replacement, tail))

## 2. OBTAIN the data   
________________________________________________________________________________________________
* Step 1: [Get Active NFL Players List](https://www.lineups.com/nfl/rosters)
    * 


In [None]:
print(datetime.utcnow().isoformat())

In [None]:
# Web Page's to scrape
nfl_roster_url = 'https://www.lineups.com/nfl/rosters'
nfl_roster_root_url = 'https://www.lineups.com'
team_source = 'lineups.com'
people_source = 'lineups.com'

In [None]:
html = urllib.request.urlopen(nfl_roster_url).read()
soup = BeautifulSoup(html,'html.parser')

In [None]:
print(soup.title.string)

In [None]:
# Retrieve all of the anchor tags
roster_tags = soup('a')
tags = []
tag_urls = []
tag_content = []
tag_attr = []
for tag in roster_tags:
    # Look at the parts of a tag
    tags.append(tag)
    tag_urls.append(tag.get('href', None))
    # an anchor without children has an empty contents list; record None so the
    # four lists stay the same length instead of raising IndexError
    tag_content.append(tag.contents[0] if tag.contents else None)
    tag_attr.append(tag.attrs)

In [None]:
# create a roster dataframe
roster_df = pd.DataFrame()
roster_df['Tag'] = tags
roster_df['URL'] = tag_urls
roster_df['Content'] = tag_content
roster_df['Attrs'] = tag_attr
roster_df.head()
#logger.debug(roster_df)

In [None]:
# filter urls to just roster '/nfl/roster'
#nfl_roster_root_url
team_roster_uris = set()
pattern = re.compile(r'^(/nfl/roster/).+')

for url in roster_df['URL']:
    #logger.info(url)
    # anchors without an href were recorded as None; skip anything that is not a string
    if isinstance(url, str) and pattern.match(url):
        logger.debug(url)
        # hrefs are site-relative, so prefix the site root to get an absolute URL
        team_roster_uris.add(nfl_roster_root_url+url)

logger.info(f'NFL Team Roster URL Count: {len(team_roster_uris)}')


### HTML Page Search Detail Examples


In [None]:
# loop over the Team Roster URLs - searching each of the subpages for the team roster
# Capture Title - Page Text about the Team - Player Names
#Player Names: <span class="player-name-col-lg">Matt Ryan</span>

player_name_search = re.compile('^.+(player-name-col-lg).+')
nfl_teams = []
team_roster_urls = list(team_roster_uris)

# dump team text - having issues with URLs being found - site is in the process of updating their pages
# the dump directory does not depend on the team, so create it once up front
dumpDir = f'{corpusDir}/dump'
os.makedirs(dumpDir, exist_ok=True)

for u in team_roster_urls:
    nfl_team = None
    # get NFL Team Roster HTML Page
    logger.info(f'NFL Roster HTML Page to scrape: {u}')
    try:
        # close the HTTP response as soon as the body has been read
        with urllib.request.urlopen(u) as response:
            html = response.read()
        soup = BeautifulSoup(html,'html.parser')

        # create a new team object
        nfl_team = create_team(soup)
        team_text = get_team_text(soup)

        with io.open(f'{dumpDir}/{nfl_team.get_team_name()}.txt','w+',encoding='utf8') as f:
            f.write(team_text)

        nfl_team.set_team_text(team_source,'team_roster_news',team_text)

        # get list of team players
        players = player_search(soup)
        for player in players:
            nfl_team.set_roster_players(player)

        nfl_teams.append(nfl_team)

    # best effort: a page that fails to download or parse is logged and skipped.
    # Exception (not BaseException) so Ctrl-C / kernel interrupt still stops the loop.
    except Exception as be:
        logger.warning(f'**WARNING** Caught Exception: {be} | URL: {u}')


In [None]:
len(nfl_teams)
for team in nfl_teams:
    print(team)
    print(team.get_team_name())
    logger.debug(team.get_team_text_by_topic('team_roster_news'))
    
    

In [None]:
# create the teams corpus
# Builds four parallel lists (one entry per scraped team), writes every stored
# team text to its own file under corpusDir, and saves a summary CSV.
teams = []
team_years = []
team_texts = []
team_rosters = []
    
# load lists for dataframe
for t in nfl_teams:
    teams.append(t.get_team_name())
    team_years.append(t.get_roster_year())
    # each entry is the nested {topic: {timestamp: text}} dict for this source
    team_texts.append(t.get_team_text_by_source(team_source)) # not working 
    # each entry is the team's list of player names
    team_rosters.append(t.get_roster_players()) # not working
    
    # save each team text to it's own file - creating a corpus
    # create a directory for each team under the corpusDir
    teamDir = f'{corpusDir}/teams/{team_source}/{t.get_team_name()}'
    if not os.path.exists(teamDir): os.makedirs(teamDir)
    
    for topic,texts in t.get_team_text_by_source(team_source).items():
        topicDir = f'{teamDir}/{topic}'
        if not os.path.exists(topicDir): os.makedirs(topicDir)
        for key,text in texts.items():
            # key is an ISO timestamp; keep only its leading YYYY-MM-DD date
            k = re.findall('^[0-9]{4}-[0-9]{2}-[0-9]{2}',key)
            #print(k[0])
            # NOTE(review): the file name carries only the date, so two texts stored
            # for the same team/topic on one day are written to the same file
            with io.open(f'{topicDir}/_{k[0]}_{t.get_team_name()}_team_text.txt','w+',encoding='utf8') as f:
                f.write(text)

# create dataframe
nfl_team_df = pd.DataFrame()   
nfl_team_df['team'] = teams
nfl_team_df['year'] = team_years
nfl_team_df['roster'] = team_rosters
nfl_team_df['text'] = team_texts

# save to csv as new datasource
# the 'roster' and 'text' columns hold Python lists / dicts, not plain strings
save_as = f'{dataDir}/nfl_team_data_scrapped.csv'
nfl_team_df.to_csv(save_as,index=False)
    
    #break
nfl_team_df.head()

### 2.1 SCRUB / CLEAN
Perform vectorization tasks

Goal: Vectorize NFL team text data scraped from the web<br>
Each Team Text is considered an individual document<br>

Determine **what to count** and **how to count it**<br>

Basic text preparation pipeline:

* Load the raw text.
* Split into tokens.
* Convert to lowercase. -> not for sentiment analysis
* Remove punctuation from each token.
* Filter out remaining tokens that are not alphabetic.
* Filter out tokens that are stop words.
* Perform stemming -> [nltk reference](https://pythonprogramming.net/stemming-nltk-tutorial/)

In [None]:
# global variables
initial_words_count = 0
cleaned_words_count = 0
feature_thres = 2
rare_thres = 5

In [None]:
def wordcloud_draw(data, color='black', width=1000, height=750, max_font_size=50, max_words=100):
    """Render a word cloud for the words in ``data`` and show it with matplotlib.

    ``data`` is an iterable of strings; they are joined with single spaces
    and handed to ``WordCloud.generate``. ``color`` is the background color;
    the remaining arguments are passed to ``WordCloud`` unchanged.
    """
    text = ' '.join(data)
    cloud_options = dict(
        stopwords=STOPWORDS,
        background_color=color,
        width=width,
        height=height,
        max_font_size=max_font_size,
        max_words=max_words,
    )
    cloud = WordCloud(**cloud_options).generate(text)

    # draw on figure 1 with the axes hidden
    plt.figure(1, figsize=(10.5, 7))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

In [None]:
def count_features(dic,save_as):
    """Count word frequencies across all documents and save them as CSV.

    Args:
        dic: mapping of document id to an iterable of words.
        save_as: path of the CSV file to write (columns 'feature',
            'feature_count', no index).

    Returns:
        A tuple ``(counts_df, bow)``: ``counts_df`` is the frequency table
        sorted by count and then feature, both descending; ``bow`` is the
        flat list of every word in document order.
    """
    # flatten every document into one bag of words
    bow = [word for words in dic.values() for word in words]

    # tally occurrences, keeping first-seen order of the features
    tally = {}
    for word in bow:
        tally[word] = tally.get(word, 0) + 1

    # two parallel columns make the table simple to sort and inspect
    table = pd.DataFrame({
        'feature': list(tally.keys()),
        'feature_count': list(tally.values()),
    })
    ranked = table.sort_values(by=['feature_count','feature'],ascending=False)

    # save df as new data source
    ranked.to_csv(save_as,index=False)

    return ranked,bow

In [None]:
# clean text
def clean_text(text_dic,
                     custom_stop_words=None,
                     remove_pun=True,
                     remove_non_alphabetic=True,
                     remove_stop_words=True,
                     lower_case=False,
                     stemming=False,
                    ):
    """Clean the token lists in ``text_dic`` in place and return the dictionary.

    Steps, applied per document in this order:
      1. drop hashtag tokens ('#...') and tokens that begin with 'http'
      2. strip punctuation characters from every token (``remove_pun``)
      3. drop tokens that are not purely alphabetic (``remove_non_alphabetic``)
      4. lower-case tokens (``lower_case``)
      5. drop NLTK English stop words plus ``custom_stop_words``
         (``remove_stop_words``)
      6. Porter-stem tokens (``stemming``)

    Stop-word matching is exact, so with ``lower_case=False`` capitalised
    stop words are kept. Steps 3 and 5 leave a document unchanged when they
    would remove every token.

    Args:
        text_dic: mapping of document id to a list of string tokens; its
            values are replaced with the cleaned lists.
        custom_stop_words: optional extra stop words to remove in step 5.
        remove_pun, remove_non_alphabetic, remove_stop_words, lower_case,
            stemming: switches for the steps above.

    Returns:
        The same ``text_dic`` object. A document whose cleaning raises is
        logged as a warning and keeps whatever steps completed before the
        error.
    """
    total_tokens_prior = 0
    total_tokens_after = 0

    regex_hash=re.compile('^#.+')
    # URL fragments are recognised by their 'http' prefix
    regex_url=re.compile('^http')

    # loop-invariant helpers are built once rather than once per document
    punctuation_table = str.maketrans('','',string.punctuation)
    stemmer = PorterStemmer() if stemming else None
    # loaded lazily inside the per-document try block so a missing NLTK
    # resource is reported the same way as any other cleaning failure
    stop_words = None

    for _id, tokens in text_dic.items():
        logger.info(f'text: {_id} | feature length prior to text cleaning steps: {len(tokens)}')

        total_tokens_prior = total_tokens_prior+len(tokens)
        logger.info(f'Total Tokens Prior To Cleaning: {total_tokens_prior}')

        try:
            hashes = {t for t in tokens if regex_hash.match(t)}
            urls = {t for t in tokens if regex_url.match(t)}

            # remove hash tags
            if hashes:
                tokens = [t for t in tokens if t not in hashes]
                text_dic[_id] = tokens
                logger.info(f'text: {_id} | After hash tag removal: {len(tokens)}')

            # remove urls
            if urls:
                tokens = [t for t in tokens if t not in urls]
                text_dic[_id] = tokens
                logger.info(f'text: {_id} | After URL removal: {len(tokens)}')

            # remove punctuation
            if remove_pun:
                stripped = [w.translate(punctuation_table) for w in tokens]
                if stripped:
                    tokens = stripped
                    text_dic[_id] = tokens
                    logger.info(f'text: {_id} | After punctuation removal: {len(tokens)}')

            # remove tokens that are not in alphabetic
            if remove_non_alphabetic:
                alpha_words = [word for word in tokens if word.isalpha()]
                if alpha_words:
                    tokens = alpha_words
                    text_dic[_id] = tokens
                    logger.info(f'text: {_id} | After non alphabetic removal: {len(tokens)}')

            # lower case
            if lower_case:
                tokens = [word.lower() for word in tokens]
                text_dic[_id] = tokens
                logger.info(f'text: {_id} | After lower case: {len(tokens)}')

            # filter out stop words (NLTK English list plus any custom words)
            if remove_stop_words:
                if stop_words is None:
                    stop_words = set(stopwords.words('english')) | set(custom_stop_words or [])
                not_stop_words = [w for w in tokens if w not in stop_words]
                if not_stop_words:
                    tokens = not_stop_words
                    text_dic[_id] = tokens
                    logger.info(f'text: {_id} | After stop word removal: {len(tokens)}')

            # stemming
            if stemmer is not None:
                tokens = [stemmer.stem(word) for word in tokens]
                text_dic[_id] = tokens
                logger.info(f'text: {_id} | After stemming: {len(tokens)}')

            # count tokens
            total_tokens_after = total_tokens_after+len(text_dic[_id])

        # best effort per document; Exception (not BaseException) so an
        # interrupt is not swallowed
        except Exception as be:
            logger.warning(f'**WARNING** Caught Exception: {be}')

    logger.info(f'Total Tokens Prior To Cleaning: {total_tokens_prior}')
    logger.info(f'Total Tokens After Cleaning: {total_tokens_after}')

    return text_dic

**Split team text into its tokens (feature words) using [NLTK word_tokenizer](https://www.nltk.org/api/nltk.tokenize.html)**<br>

**Parameters**


In [None]:
# get raw team corpus text from file system
# _files collects the full path of every file found under `path`;
# _filenames maps a running integer index to the bare file name
_files = []
_filenames = {}
filenames_index = 0

#path=f'{corpusDir}'
path=f'{corpusDir}/dump_archive'

# walk the directory tree rooted at `path`, including sub-directories
for dirpath, dirs, files in os.walk(path):
    logger.debug(f'Raw Text Corpus Directory Search: {dirpath}')
    logger.debug(f'Raw Text Corpus Files Search: {files}')
    logger.debug(f'Raw Text Corpus Files Search - file count: {len(files)}')
    if len(files) > 0:
        for f in files:
            _files.append(f'{dirpath}/{f}')
            # index matches the position of this file in _files
            _filenames[filenames_index] = f
            filenames_index+=1
logger.info(f'Raw Text Corpus Files Search - Files List Found:\n{_files}')
logger.info(f'Raw Text Corpus Files Search - Filenames Dictionary Found:\n{_filenames}')

In [None]:
# tokenize each teams text data
# team_tokens maps file name -> all tokens of that file; teams, t_tokens and
# teams_token_totals are parallel lists used to build a summary dataframe
team_tokens = {}
teams = []
t_tokens = []
teams_token_totals = []
raw_corpus_token_count = 0
for f in _files:
    logger.info(f'team text file: {f}')
    team_name = f.split('/')[-1]
    #team_name = team_name.split('.')[0]
    logger.info(f'team name: {team_name}')
    tokens = []
    file_token_count = 0

    # the dump files this notebook writes are utf8, so read them the same way;
    # a separate handle name keeps the loop variable f (the path) intact
    with open(f,'r',encoding='utf8') as team_file:
        team_text = team_file.readlines()

    for i,line in enumerate(team_text):
        logger.debug(f'team text line: {i}')
        logger.debug(f'team text line: {line}')

        line_tokens = word_tokenize(line)
        # accumulate across lines: rebinding tokens per line would keep only
        # the last line of each file
        tokens.extend(line_tokens)
        file_token_count = file_token_count+len(line_tokens)
        raw_corpus_token_count = raw_corpus_token_count+len(line_tokens)

        logger.debug(line_tokens)

        #break
    team_tokens[team_name] = tokens
    teams.append(team_name)
    t_tokens.append(tokens)
    teams_token_totals.append(file_token_count)
    logger.info(f'file token count: {file_token_count}')

    #break
logger.info(f'Raw Corpus Token Count: {raw_corpus_token_count}')
#wordcloud_draw(str(team_text))

In [None]:
tt = pd.DataFrame()
tt['team'] = teams
tt['tokens'] = t_tokens
tt['total_tokens'] = teams_token_totals
tt.sort_values(by="total_tokens", ascending=False).head(10)

In [None]:
tt.total_tokens.describe()

In [None]:
# get sample of word clouds for teams
for i,team in enumerate(team_tokens):
    #print(team_tokens[team])
    wordcloud_draw(team_tokens[team],color='white',max_words=250)
    if i > 5: break

#### INIT FEATURE BoW Count
Perform an initial Bag of Words count and save it for reference and for insight into vocabulary size reduction

In [None]:
save_as = f'{dataDir}/init_feature_counts.csv'
iF = count_features(team_tokens,save_as=save_as)
bag_of_words = iF[1]
logger.info(f'Initial Bag Of Word Feature Count: {len(bag_of_words)}')
iF[0].head(20)

In [None]:
# look at initial word cloud of bag of words
wordcloud_draw(bag_of_words, color='white', max_words=300)

*Cleaning Vocabulary Size Reduction* <br>
* Total Tokens Prior To Cleaning: 18647
* Total Tokens After Cleaning: 9879

In [None]:
cleaned_text = clean_text(team_tokens,
                     custom_stop_words=[],
                     remove_pun=True,
                     remove_non_alphabetic=True,
                     remove_stop_words=True,
                     lower_case=True,
                     stemming=False,
                    )

kept_feats = cleaned_text

#### KEPT FEATURES BoW Count
Perform word frequency count for kept feature list

In [None]:
# save text as csv
# put the feature word counts into named dictionary and data frame for simpler sorting and observation
kept_features = {'_id':[],'features':[]}
for _id, features in kept_feats.items():
    kept_features['_id'].append(_id)
    kept_features['features'].append(features)

# convert dictionary to dataframe for easier sorting
kept_features_df = pd.DataFrame(kept_features)
kept_features_df_sorted = kept_features_df.sort_values(by=['_id'],ascending=True)

# save df as new data source
save_as = f'{dataDir}/kept_features.csv'
kept_features_df_sorted.to_csv(save_as,index=False)

In [None]:
kept_features_df_sorted.head()

In [None]:
# save kept features as new corpus to be consumed by vectorization objects
for _id, features in kept_features_df_sorted.iterrows():
    line = ' '.join([feat for feat in kept_features_df_sorted.iloc[_id].features])
    
    cleanedDir = f'{corpusDir}/teams/v2/cleaned'
    if not os.path.exists(cleanedDir): os.makedirs(cleanedDir)
    
    with io.open(f'{cleanedDir}/{kept_features_df_sorted.iloc[_id]._id}_nfl_team_text.txt','w+',encoding='utf8') as f:
        f.write(line)

In [None]:
save_as = f'{dataDir}/kept_lower_feature_counts.csv'
kf = count_features(kept_feats,save_as=save_as)
clean_bag_of_words = kf[1]
kf[0].head(20)

In [None]:
#clean_bag_of_words

In [None]:
# look at cleaned word cloud of bag of words
wordcloud_draw(clean_bag_of_words, color='white', max_words=500)

In [None]:
# get team text feature files
# reloads the CSVs written by earlier cells so the following steps can run from disk
#kept_feats_file = 'kept_feature_counts.csv'
# NOTE(review): kept_lower_feats_file is assigned but the first read below
# spells the same file name out again instead of using it
kept_lower_feats_file = 'kept_lower_feature_counts.csv'
#kept_feats_file = 'kept_lower_stem_feature_counts.csv'
kept_feats_file = 'kept_features.csv'

# NOTE(review): error_bad_lines is believed to be deprecated in newer pandas
# releases in favour of on_bad_lines - confirm against the installed version
kept_feats_counts = pd.read_csv(f'{dataDir}/kept_lower_feature_counts.csv',error_bad_lines=False, encoding = "ISO-8859-1")
# kept_feats is rebound here to the dataframe read from disk
kept_feats = pd.read_csv(f'{dataDir}/{kept_feats_file}',error_bad_lines=False, encoding = "ISO-8859-1")
#kept_feats_counts.head()
kept_feats_counts.head()

In [None]:
# save off each team text as it's own file
# reads the '_id' and 'features' columns of kept_feats and writes one text file per row
import io
for row in kept_feats.iterrows():
    # iterrows yields (index, Series); row[1] is the row data
    _id = row[1]['_id']
    features = row[1]['features']
    # remove every '[', ']', quote and comma character so only the
    # space-separated remainder is written
    # presumably 'features' is the string form of a Python list after the CSV round trip - verify
    features = features.replace('[','')
    features = features.replace(']','')
    features = features.replace('\'','')
    features = features.replace(',','')
    
    cleanedDir = f'{corpusDir}/teams/v1/cleaned'
    if not os.path.exists(cleanedDir): os.makedirs(cleanedDir)
    
    with io.open(f'{cleanedDir}/{_id}_nfl_team_text.txt','w+',encoding='utf8') as f:
        f.write(features)

### 3. Vectorization Models


In [None]:
#create integer feature vector mappings
feature_id_map = {}
id_feature_map = {}

feats = kept_feats_counts.feature

for i,f in enumerate(feats):
    id_feature_map[i] = f
    feature_id_map[f] = i

In [None]:
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from time import time
import shorttext
#from keras.preprocessing.text import Tokenizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

[sklearn CountVectorizer](https://scikit-learn.org/0.15/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer)<br>
Convert a collection of text documents to a matrix of token counts<br>

This implementation produces a sparse representation of the counts using scipy.sparse.coo_matrix.<br>

If you do not provide an a-priori dictionary and you do not use an analyzer that does some kind of feature selection then the number of features will be equal to the vocabulary size found by analyzing the data<br>

In text mining, it is important to create the document-term matrix (DTM) of the corpus we are interested in. A DTM is basically a matrix, with documents designated by rows and words by columns, that the elements are the counts or the weights (usually by tf-idf). Subsequent analysis is usually based creatively on DTM.<br>

CountVectorizer supports counts of N-grams of words or consecutive characters. Once fitted, the vectorizer has built a dictionary of feature indices:<br>
The index value of a word in the vocabulary is linked to its frequency in the whole training corpus.<br>

In [None]:
# get cleaned team text files from path
# _files holds the path of every cleaned file; _filenames maps the position of
# a file in _files to its bare name (later cells use it to label matrix rows)
_files = []
_filenames = {}
#cleanedDir = f'{corpusDir}/teams/v2/cleaned'
path=f'{corpusDir}/teams/v2/cleaned/'
for dirpath, dirs, files in os.walk(path):
    print(dirpath)
    print(dirs)
    print(files)
    for f in files:
        # key by the running position in _files: a per-directory counter would
        # restart at 0 in every directory and overwrite earlier entries
        _filenames[len(_files)] = f
        # os.path.join inserts the separator that sub-directory paths lack
        _files.append(os.path.join(dirpath, f))
#_files
#_files

In [None]:
_files[:5]

## Vectorization Objects to Explore
### CountVectorizer & TfidfVectorizer
* unigrams
* bigrams
* trigrams

In [None]:
def inst_vectorizer(ngram_type,vectorizer_type,input='filename',max_df=1.0,min_df=1,stop_words='english',analyzer='word',max_features=None):
    """Create a configured ``CountVectorizer`` or ``TfidfVectorizer``.

    Args:
        ngram_type: 'unigram' -> ngram_range (1,1), 'bigram' -> (1,2),
            'trigram' -> (1,3); any other value falls back to (1,1).
        vectorizer_type: 'tfidf' selects ``TfidfVectorizer``; 'count' and any
            other value select ``CountVectorizer``.
        input, max_df, min_df, stop_words, analyzer, max_features: passed to
            the vectorizer constructor unchanged.

    Returns:
        The new, unfitted vectorizer instance.
    """
    ngram_ranges = {
        'unigram': (1,1),
        'bigram': (1,2),
        'trigram': (1,3),
    }
    vectorizer_classes = {
        'count': CountVectorizer,
        'tfidf': TfidfVectorizer,
    }

    # unknown names fall back to unigrams / CountVectorizer
    ngram = ngram_ranges.get(ngram_type, (1,1))
    vectorizer_cls = vectorizer_classes.get(vectorizer_type, CountVectorizer)

    return vectorizer_cls(
        input=input,
        ngram_range=ngram,
        max_df=max_df,
        min_df=min_df,
        stop_words=stop_words,
        analyzer=analyzer,
        max_features=max_features,
    )

In [None]:
# initialize a CountVectorizer object:
# Initialize a unigram vector object
count_vec_unigram = inst_vectorizer('unigram','count')
tfidf_vec_unigram = inst_vectorizer('unigram','tfidf')
#-------------------------------------------------------#
# Initialize a bigram vector object
count_vec_bigram = inst_vectorizer('bigram','count')
tfidf_vec_bigram = inst_vectorizer('bigram','tfidf')
#-------------------------------------------------------#
# Initialize a trigram vector object
count_vec_trigram = inst_vectorizer('trigram','count')
tfidf_vec_trigram = inst_vectorizer('trigram','tfidf')
#-------------------------------------------------------#

### CountVectorize Unigrams

In [None]:
# Transform the data into a bag of words
count_unigram = count_vec_unigram.fit(_files)
count_bow_unigram = count_vec_unigram.transform(_files)

count_unigram_features = count_unigram.get_feature_names()

# print a few of the features
logger.info(f'CountVectorizer unigram transformed shape: {count_bow_unigram.shape}')
logger.info(f'CountVectorizer unigram transformed size: {count_bow_unigram.size}')
logger.info(f'CountVectorizer unigram transformed type: {type(count_bow_unigram)}')
#logger.info(f'List of all ngram features:\n{count_unigram_features}')

In [None]:
len(count_unigram_features)
count_unigram_features[:5]

In [None]:
logger.info(f'unigram vocabulary size: {len(count_vec_unigram.vocabulary_)}')
#logger.info(f'ngram vocabulary content:\n {count_vec_unigram.vocabulary_}')

In [None]:
cnt_uni_voc_dict = dict(count_unigram.vocabulary_)
cnt_uni_voc_df = pd.DataFrame.from_dict(cnt_uni_voc_dict, orient='index').reset_index()
cnt_uni_voc_df.columns=('feature','feature_index')
cnt_uni_voc_df.sort_values(by='feature_index', ascending=False)[::10].head(20)

#### Output CountVectorize unigram feature vectors

In [None]:
# lookup tables between feature strings and their column index
# NOTE: despite the names, word_id_map is keyed by index (index -> word) and
# id_word_map is keyed by the feature string (word -> index)
word_id_map = {}
id_word_map = {}

words = count_unigram.get_feature_names()
logger.debug(words)

for i,f in enumerate(words):
    word_id_map[i] = f
    id_word_map[f] = i

In [None]:
# dense document-term dataframe: one row per document, one column per feature
cols = count_unigram.get_feature_names()
tdm_vec_df = pd.DataFrame(count_bow_unigram.toarray(),columns=cols)
non_zero_field_count = 0
# output feature vector term frequence vector as 'doc feature frequency'
# each output line is: <file name> <feature> <value> <feature> <value> ...
with open(f'{outputDir}/count_unigram_feature_vector_tf.txt','w+') as f:

    for i in range(0,tdm_vec_df.shape[0]):
        # column positions of the non-zero entries in row i
        a = [index for index,value in enumerate(tdm_vec_df.iloc[i]) if value > 0]
        # assumes _filenames[i] names the document behind row i - TODO confirm ordering
        sent = _filenames[i]
        non_zero_field_count = non_zero_field_count+len(a)
        
        for col in a:
            v = tdm_vec_df.iloc[i,col]
            # word_id_map is keyed by column index and returns the feature string
            sent = sent+' '+word_id_map[col]+' '+str(v)
        
        f.write(sent+'\n')

In [None]:
tdm_vec_df[::5].head()

### TfidfVectorize Unigrams

In [None]:
# Transform the data into a bag of words.
# fit_transform() learns the vocabulary and builds the document-term matrix in
# one pass; it returns the same matrix as fit() followed by transform() but
# does not process the corpus twice.
tfidf_bow_unigram = tfidf_vec_unigram.fit_transform(_files)
# fit() returns the vectorizer itself, so this alias keeps the old name valid
tfidf_unigram = tfidf_vec_unigram

tfidf_unigram_features = tfidf_unigram.get_feature_names()

# log the matrix dimensions, number of stored entries and container type
logger.info(f'TfidfVectorizer unigram transformed shape: {tfidf_bow_unigram.shape}')
logger.info(f'TfidfVectorizer unigram transformed size: {tfidf_bow_unigram.size}')
logger.info(f'TfidfVectorizer unigram transformed type: {type(tfidf_bow_unigram)}')

In [None]:
# Preview the tf-idf unigram feature names. Only the last expression of a
# notebook cell is displayed, so the length on the first line is not shown.
len(tfidf_unigram_features)
tfidf_unigram_features[:5]

In [None]:
# Log how many entries the fitted tf-idf unigram vocabulary holds.
logger.info(f'tfidf_unigram vocabulary size: {len(tfidf_unigram.vocabulary_)}')
#logger.info(f'ngram vocabulary content:\n {tfidf_unigram.vocabulary_}')

In [None]:
# get and eval the IDF: The inverse document frequency
# Pair every unigram feature name with its learned idf value.
idf_unigram = tfidf_unigram.idf_
idf_weights = dict(zip(tfidf_unigram.get_feature_names(), idf_unigram))

# One row per feature, ordered from the largest idf weight to the smallest.
idf_weights_df = pd.DataFrame.from_dict(idf_weights,orient='index').reset_index()
idf_weights_df.columns=('feature','weight')
idf_weights_df = idf_weights_df.sort_values(by='weight',ascending=False)

# head() holds the largest weights; the tail is re-sorted so the smallest comes first.
logger.info(f'IDF Top 10 List:\n{idf_weights_df.head(10)}')
logger.info(f'IDF Lowest 10 List:\n{idf_weights_df.tail(10).sort_values(by="weight",ascending=True)}')

In [None]:
# plot
# Draws one bar per row of idf_weights_df, i.e. one bar for every unigram feature.
sns.barplot(x='feature', y='weight', data=idf_weights_df)            
plt.title("Unigram Inverse Document Frequency(idf) per token")
# resize the current figure before rendering it
fig=plt.gcf()
fig.set_size_inches(10,5)
plt.show()

In [None]:
# Lookup tables between matrix column positions and tf-idf unigram terms.
# NOTE: the names read backwards -- word_id_map is keyed by column index
# (index -> term) while id_word_map is keyed by term (term -> index).
words = tfidf_unigram.get_feature_names()
logger.debug(words)

word_id_map = dict(enumerate(words))
id_word_map = {term: idx for idx, term in enumerate(words)}

In [None]:
# Dump the unigram tf-idf matrix as text, one line per document:
#   "<filename> <term> <weight> <term> <weight> ..."
# Only cells with a value above zero are listed.
cols = tfidf_unigram.get_feature_names()
tdm_vec_df = pd.DataFrame(tfidf_bow_unigram.toarray(), columns=cols)
non_zero_field_count = 0  # running total of non-zero cells over all documents

with open(f'{outputDir}/tfidf_unigram_feature_vector_tf.txt','w+') as f:
    for i in range(tdm_vec_df.shape[0]):
        doc_row = tdm_vec_df.iloc[i]
        # column positions of the terms present in this document
        a = [index for index, value in enumerate(doc_row) if value > 0]
        non_zero_field_count += len(a)

        fields = [_filenames[i]]
        for col in a:
            v = tdm_vec_df.iloc[i, col]
            # word_id_map is keyed by column index and yields the term
            fields.extend((word_id_map[col], str(v)))

        sent = ' '.join(fields)
        f.write(sent + '\n')

In [None]:
# Highest tf-idf value each unigram reaches in any document, as a flat array
# with one entry per feature column.
max_tfidf = tfidf_bow_unigram.max(axis=0).toarray().ravel()
# Column indices ordered from the smallest maximum to the largest.
sort_by_tfidf = max_tfidf.argsort()
# These are column indices, not terms; later cells map them back to features.
logger.info(f'Feature indices with lowest tfidf:\n{sort_by_tfidf[:5]}')
logger.info(f'Feature indices with highest tfidf:\n{sort_by_tfidf[-5:]}')


In [None]:
# Turn the tf-idf unigram vocabulary (term -> column index) into a two-column frame.
tfidf_uni_voc_dict = dict(tfidf_unigram.vocabulary_)
tfidf_uni_voc_df = pd.DataFrame.from_dict(tfidf_uni_voc_dict, orient='index').reset_index()
tfidf_uni_voc_df.columns=('feature','feature_index')
# Sample the vocabulary: highest column index first, every 10th row, 20 rows shown.
tfidf_uni_voc_df.sort_values(by='feature_index', ascending=False)[::10].head(20)

In [None]:
# features with the lowest 5 tfidf
# Select the vocabulary rows whose column index is among the first five entries
# of sort_by_tfidf; isin() keeps the frame's own row order, not the ranking order.
low_5_tfidf = sort_by_tfidf[:5]
low_tfidf = tfidf_uni_voc_df[tfidf_uni_voc_df.feature_index.isin(low_5_tfidf)]
low_tfidf

In [None]:
# features with the highest 5 tfidf
# Select the vocabulary rows whose column index is among the last five entries
# of sort_by_tfidf; isin() keeps the frame's own row order, not the ranking order.
high_5_tfidf = sort_by_tfidf[-5:]
high_tfidf = tfidf_uni_voc_df[tfidf_uni_voc_df.feature_index.isin(high_5_tfidf)]
high_tfidf

### CountVectorize - Bigrams

In [None]:
# Eval Bigrams
# fit_transform() learns the vocabulary and builds the document-term matrix in
# one pass; it returns the same matrix as fit() followed by transform() but
# does not process the corpus twice.
cnt_bow_bigram = count_vec_bigram.fit_transform(_files)
# fit() returns the vectorizer itself, so this alias keeps the old name valid
count_bigram = count_vec_bigram

cnt_bigrams = count_vec_bigram.get_feature_names()

# log the matrix dimensions, number of stored entries and container type
logger.info(f'CountVectorizer ngram transformed shape: {cnt_bow_bigram.shape}')
logger.info(f'CountVectorizer ngram transformed size: {cnt_bow_bigram.size}')
logger.info(f'CountVectorizer ngram transformed type: {type(cnt_bow_bigram)}')

In [None]:
# Log how many entries the fitted bigram vocabulary holds.
logger.info(f'bigram vocabulary size: {len(count_bigram.vocabulary_)}')
#logger.info(f'bigram vocabulary content:\n {count_vec_bigram.vocabulary_}')

In [None]:
#logger.info(f'CountVectorizer Fit: \n{fit_vec.vocabulary_}')
# Turn the bigram vocabulary (term -> column index) into a two-column frame.
voc_dict = dict(count_bigram.vocabulary_)
voc_df = pd.DataFrame.from_dict(voc_dict, orient='index').reset_index()
voc_df.columns=('feature','feature_index')
# Sample the vocabulary: highest column index first, every 10th row, 20 rows shown.
voc_df.sort_values(by='feature_index', ascending=False)[::10].head(20)

#### Output CountVectorize bigram feature vector to file

In [None]:
# Lookup tables between matrix column positions and bigram terms.
# NOTE: the names read backwards -- word_id_map is keyed by column index
# (index -> term) while id_word_map is keyed by term (term -> index).
words = count_bigram.get_feature_names()
logger.debug(words)

word_id_map = dict(enumerate(words))
id_word_map = {term: idx for idx, term in enumerate(words)}

In [None]:
# Dump the bigram count matrix as text, one line per document:
#   "<filename> <term> <count> <term> <count> ..."
# Only cells with a value above zero are listed. Bigram terms contain a space,
# so the fields of a line are not uniquely recoverable by splitting on spaces.
cols = count_bigram.get_feature_names()
tdm_vec_df = pd.DataFrame(cnt_bow_bigram.toarray(), columns=cols)
non_zero_field_count = 0  # running total of non-zero cells over all documents

with open(f'{outputDir}/cnt_bigrams_feature_vector_tf.txt','w+') as f:
    for i in range(tdm_vec_df.shape[0]):
        doc_row = tdm_vec_df.iloc[i]
        # column positions of the terms present in this document
        a = [index for index, value in enumerate(doc_row) if value > 0]
        non_zero_field_count += len(a)

        fields = [_filenames[i]]
        for col in a:
            v = tdm_vec_df.iloc[i, col]
            # word_id_map is keyed by column index and yields the term
            fields.extend((word_id_map[col], str(v)))

        sent = ' '.join(fields)
        f.write(sent + '\n')

In [None]:
# Preview the document-term frame: every 5th document row, first five of those.
tdm_vec_df[::5].head()

### TfidfVectorize Bigrams

In [None]:
# Transform the data into a bag of words.
# fit_transform() learns the vocabulary and builds the document-term matrix in
# one pass; it returns the same matrix as fit() followed by transform() but
# does not process the corpus twice.
tfidf_bow_bigram = tfidf_vec_bigram.fit_transform(_files)
# fit() returns the vectorizer itself, so this alias keeps the old name valid
tfidf_bigram = tfidf_vec_bigram

tfidf_bigram_features = tfidf_bigram.get_feature_names()

# log the matrix dimensions, number of stored entries and container type
logger.info(f'TfidfVectorizer bigram transformed shape: {tfidf_bow_bigram.shape}')
logger.info(f'TfidfVectorizer bigram transformed size: {tfidf_bow_bigram.size}')
logger.info(f'TfidfVectorizer bigram transformed type: {type(tfidf_bow_bigram)}')

In [None]:
# Preview the tf-idf bigram feature names. Only the last expression of a
# notebook cell is displayed, so the length on the first line is not shown.
len(tfidf_bigram_features)
tfidf_bigram_features[:5]

In [None]:
# get and eval the IDF: The inverse document frequency
# Pair every bigram feature name with its learned idf value.
idf_bigram = tfidf_bigram.idf_
idf_weights = dict(zip(tfidf_bigram.get_feature_names(), idf_bigram))

# One row per feature, ordered from the largest idf weight to the smallest.
idf_weights_df = pd.DataFrame.from_dict(idf_weights,orient='index').reset_index()
idf_weights_df.columns=('feature','weight')
idf_weights_df = idf_weights_df.sort_values(by='weight',ascending=False)

# head() holds the largest weights; the tail is re-sorted so the smallest comes first.
logger.info(f'IDF bigram Top 10 List:\n{idf_weights_df.head(10)}')
logger.info(f'IDF bigram Lowest 10 List:\n{idf_weights_df.tail(10).sort_values(by="weight",ascending=True)}')

In [None]:
# plot
#sns.barplot(x='feature', y='weight', data=idf_weights_df)            
#plt.title("Bigram Inverse Document Frequency(idf) per token")
#fig=plt.gcf()
#fig.set_size_inches(10,5)
#plt.show()

In [None]:
# Lookup tables between matrix column positions and tf-idf bigram terms.
# NOTE: the names read backwards -- word_id_map is keyed by column index
# (index -> term) while id_word_map is keyed by term (term -> index).
words = tfidf_bigram.get_feature_names()
logger.debug(words)

word_id_map = dict(enumerate(words))
id_word_map = {term: idx for idx, term in enumerate(words)}

In [None]:
# Dump the bigram tf-idf matrix as text, one line per document:
#   "<filename> <term> <weight> <term> <weight> ..."
# Only cells with a value above zero are listed. Bigram terms contain a space,
# so the fields of a line are not uniquely recoverable by splitting on spaces.
cols = tfidf_bigram.get_feature_names()
tdm_vec_df = pd.DataFrame(tfidf_bow_bigram.toarray(), columns=cols)
non_zero_field_count = 0  # running total of non-zero cells over all documents

with open(f'{outputDir}/tfidf_bigram_feature_vector_tf.txt','w+') as f:
    for i in range(tdm_vec_df.shape[0]):
        doc_row = tdm_vec_df.iloc[i]
        # column positions of the terms present in this document
        a = [index for index, value in enumerate(doc_row) if value > 0]
        non_zero_field_count += len(a)

        fields = [_filenames[i]]
        for col in a:
            v = tdm_vec_df.iloc[i, col]
            # word_id_map is keyed by column index and yields the term
            fields.extend((word_id_map[col], str(v)))

        sent = ' '.join(fields)
        f.write(sent + '\n')

In [None]:
# Turn the tf-idf bigram vocabulary (term -> column index) into a two-column frame.
tfidf_bi_voc_dict = dict(tfidf_bigram.vocabulary_)
tfidf_bi_voc_df = pd.DataFrame.from_dict(tfidf_bi_voc_dict, orient='index').reset_index()
tfidf_bi_voc_df.columns=('feature','feature_index')
# Sample the vocabulary: highest column index first, every 10th row, 20 rows shown.
tfidf_bi_voc_df.sort_values(by='feature_index', ascending=False)[::10].head(20)

In [None]:
# Highest tf-idf value each bigram reaches in any document, as a flat array
# with one entry per feature column.
max_tfidf = tfidf_bow_bigram.max(axis=0).toarray().ravel()
# Column indices ordered from the smallest maximum to the largest.
sort_by_tfidf = max_tfidf.argsort()
# These are column indices, not terms; they are mapped back to features below.
logger.info(f'Feature indices with lowest tfidf:\n{sort_by_tfidf[:5]}')
logger.info(f'Feature indices with highest tfidf:\n{sort_by_tfidf[-5:]}')
# features with the lowest 5 tfidf
# isin() keeps the vocabulary frame's own row order, not the ranking order.
low_5_tfidf = sort_by_tfidf[:5]
low_tfidf = tfidf_bi_voc_df[tfidf_bi_voc_df.feature_index.isin(low_5_tfidf)]
low_tfidf

In [None]:
# features with the highest 5 tfidf
# Select the vocabulary rows whose column index is among the last five entries
# of sort_by_tfidf; isin() keeps the frame's own row order, not the ranking order.
high_5_tfidf = sort_by_tfidf[-5:]
high_tfidf = tfidf_bi_voc_df[tfidf_bi_voc_df.feature_index.isin(high_5_tfidf)]
high_tfidf

### Vectorizer - Trigram Eval

#### CountVectorize Trigrams

In [None]:
# Eval Trigram
# fit_transform() learns the vocabulary and builds the document-term matrix in
# one pass; it returns the same matrix as fit() followed by transform() but
# does not process the corpus twice.
cnt_bow_trigram = count_vec_trigram.fit_transform(_files)
# fit() returns the vectorizer itself, so this alias keeps the old name valid
count_trigram = count_vec_trigram

cnt_trigrams = count_vec_trigram.get_feature_names()

# log the matrix dimensions, number of stored entries and container type
logger.info(f'CountVectorizer trigram transformed shape: {cnt_bow_trigram.shape}')
logger.info(f'CountVectorizer trigram transformed size: {cnt_bow_trigram.size}')
# report the type of the transformed matrix (not of the feature-name list),
# matching the unigram and bigram cells
logger.info(f'CountVectorizer trigram transformed type: {type(cnt_bow_trigram)}')

In [None]:
# Log how many entries the fitted trigram vocabulary holds.
logger.info(f'trigram vocabulary size: {len(count_vec_trigram.vocabulary_)}')
#logger.info(f'trigram vocabulary content:\n {count_vec_trigram.vocabulary_}')

In [None]:
#logger.info(f'CountVectorizer Fit: \n{fit_vec.vocabulary_}')
# Turn the trigram vocabulary (term -> column index) into a two-column frame.
voc_dict = dict(count_trigram.vocabulary_)
voc_df = pd.DataFrame.from_dict(voc_dict, orient='index').reset_index()
voc_df.columns=('feature','feature_index')
# Sample the vocabulary: highest column index first, every 10th row, 20 rows shown.
voc_df.sort_values(by='feature_index', ascending=False)[::10].head(20)

#### Output CountVectorize trigram feature vectors

In [None]:
# Lookup tables between matrix column positions and trigram terms.
# NOTE: the names read backwards -- word_id_map is keyed by column index
# (index -> term) while id_word_map is keyed by term (term -> index).
words = count_trigram.get_feature_names()
logger.debug(words)

word_id_map = dict(enumerate(words))
id_word_map = {term: idx for idx, term in enumerate(words)}

In [None]:
# Dump the trigram count matrix as text, one line per document:
#   "<filename> <term> <count> <term> <count> ..."
# Only cells with a value above zero are listed. Trigram terms contain spaces,
# so the fields of a line are not uniquely recoverable by splitting on spaces.
cols = count_trigram.get_feature_names()
tdm_vec_df = pd.DataFrame(cnt_bow_trigram.toarray(), columns=cols)
non_zero_field_count = 0  # running total of non-zero cells over all documents

with open(f'{outputDir}/cnt_trigrams_feature_vector_tf.txt','w+') as f:
    for i in range(tdm_vec_df.shape[0]):
        doc_row = tdm_vec_df.iloc[i]
        # column positions of the terms present in this document
        a = [index for index, value in enumerate(doc_row) if value > 0]
        non_zero_field_count += len(a)

        fields = [_filenames[i]]
        for col in a:
            v = tdm_vec_df.iloc[i, col]
            # word_id_map is keyed by column index and yields the term
            fields.extend((word_id_map[col], str(v)))

        sent = ' '.join(fields)
        f.write(sent + '\n')

In [None]:
# Preview the document-term frame: every 5th document row, first five of those.
tdm_vec_df[::5].head()

### TfidfVectorize Trigrams

In [None]:
def join_words(words):
    """Return ``words`` with every single space replaced by an underscore.

    Used to turn a multi-word n-gram such as ``"a b c"`` into the single
    token ``"a_b_c"`` so it can be written as one space-delimited field.
    A string without spaces is returned unchanged; consecutive spaces each
    become an underscore.
    """
    # str.join over the split pieces replaces the hand-written accumulation loop
    return '_'.join(words.split(' '))

In [None]:
# classify trigram for comparative analysis
def sentiment_classify(sentence):
    """Return the sentiment label for ``sentence``.

    Scores the text with the notebook-level ``analyzer`` and maps the
    resulting ``'compound'`` score to a label through
    ``classify_vader_score_threshold``.

    NOTE: both ``analyzer`` and ``classify_vader_score_threshold`` are looked
    up when this function is called and are defined in cells further down the
    notebook, so those cells must have been run first.
    """
    compound = analyzer.polarity_scores(sentence)['compound']
    return classify_vader_score_threshold(compound)

In [None]:
# Transform the data into a bag of words.
# fit_transform() learns the vocabulary and builds the document-term matrix in
# one pass; it returns the same matrix as fit() followed by transform() but
# does not process the corpus twice.
tfidf_bow_trigram = tfidf_vec_trigram.fit_transform(_files)
# fit() returns the vectorizer itself, so this alias keeps the old name valid
tfidf_trigram = tfidf_vec_trigram

tfidf_trigram_features = tfidf_trigram.get_feature_names()

# log the matrix dimensions, number of stored entries and container type
logger.info(f'TfidfVectorizer trigram transformed shape: {tfidf_bow_trigram.shape}')
logger.info(f'TfidfVectorizer trigram transformed size: {tfidf_bow_trigram.size}')
logger.info(f'TfidfVectorizer trigram transformed type: {type(tfidf_bow_trigram)}')

In [None]:
# Preview the tf-idf trigram feature names. Only the last expression of a
# notebook cell is displayed, so the length on the first line is not shown.
len(tfidf_trigram_features)
tfidf_trigram_features[:5]

In [None]:
# get and eval the IDF: The inverse document frequency
# Pair every trigram feature name with its learned idf value.
idf_trigram = tfidf_trigram.idf_
idf_weights = dict(zip(tfidf_trigram.get_feature_names(), idf_trigram))

# One row per feature, ordered from the largest idf weight to the smallest.
idf_weights_df = pd.DataFrame.from_dict(idf_weights,orient='index').reset_index()
idf_weights_df.columns=('feature','weight')
idf_weights_df = idf_weights_df.sort_values(by='weight',ascending=False)

# head() holds the largest weights; the tail is re-sorted so the smallest comes first.
logger.info(f'IDF trigram Top 10 List:\n{idf_weights_df.head(10)}')
logger.info(f'IDF trigram Lowest 10 List:\n{idf_weights_df.tail(10).sort_values(by="weight",ascending=True)}')

In [None]:
# plot
# Draws one bar per row of idf_weights_df, i.e. one bar for every trigram feature.
sns.barplot(x='feature', y='weight', data=idf_weights_df)            
plt.title("Trigram Inverse Document Frequency(idf) per token")
# resize the current figure before rendering it
fig=plt.gcf()
fig.set_size_inches(10,5)
plt.show()

In [None]:
# Lookup tables between matrix column positions and tf-idf trigram terms.
# NOTE: the names read backwards -- word_id_map is keyed by column index
# (index -> term) while id_word_map is keyed by term (term -> index).
words = tfidf_trigram.get_feature_names()
logger.debug(words)

word_id_map = dict(enumerate(words))
id_word_map = {term: idx for idx, term in enumerate(words)}

In [None]:
# Dump the trigram tf-idf matrix as text, one line per document:
#   "<filename> <term> <weight> <term> <weight> ..."
# Only cells with a value above zero are listed. Trigram terms contain spaces,
# so the fields of a line are not uniquely recoverable by splitting on spaces.
cols = tfidf_trigram.get_feature_names()
tdm_vec_df = pd.DataFrame(tfidf_bow_trigram.toarray(), columns=cols)
non_zero_field_count = 0  # running total of non-zero cells over all documents

with open(f'{outputDir}/tfidf_trigram_feature_vector_tf.txt','w+') as f:
    for i in range(tdm_vec_df.shape[0]):
        doc_row = tdm_vec_df.iloc[i]
        # column positions of the terms present in this document
        a = [index for index, value in enumerate(doc_row) if value > 0]
        non_zero_field_count += len(a)

        fields = [_filenames[i]]
        for col in a:
            v = tdm_vec_df.iloc[i, col]
            # word_id_map is keyed by column index and yields the term
            fields.extend((word_id_map[col], str(v)))

        sent = ' '.join(fields)
        f.write(sent + '\n')

In [None]:
# Labeled with sentiment: dump the trigram tf-idf matrix as text, one line per
# document, formatted as
#   "<filename> <w1_w2_w3>|<weight>|<label> <w1_w2_w3>|<weight>|<label> ..."
# Only cells with a value above zero are listed.
# NOTE: sentiment_classify() needs `analyzer` and
# `classify_vader_score_threshold`, which are defined in later cells; run
# those cells before this one.
cols = tfidf_trigram.get_feature_names()
tdm_vec_df = pd.DataFrame(tfidf_bow_trigram.toarray(),columns=cols)
non_zero_field_count = 0  # running total of non-zero cells over all documents

# column index -> (underscore-joined term, sentiment label). A trigram's label
# does not depend on the document, so each one is scored only once instead of
# once per document it occurs in.
term_info_cache = {}

with open(f'{outputDir}/tfidf_labeled_trigram_feature_vector_tf.txt','w+') as f:

    for i in range(0,tdm_vec_df.shape[0]):
        # column positions of the terms present in this document
        a = [index for index,value in enumerate(tdm_vec_df.iloc[i]) if value > 0]
        non_zero_field_count = non_zero_field_count+len(a)

        fields = [_filenames[i]]
        for col in a:
            if col not in term_info_cache:
                # word_id_map is keyed by column index and yields the term
                term = word_id_map[col]
                term_info_cache[col] = (join_words(term), sentiment_classify(term))
            joined_term, term_label = term_info_cache[col]
            v = tdm_vec_df.iloc[i,col]
            fields.append('|'.join((joined_term, str(v), term_label)))

        sent = ' '.join(fields)
        f.write(sent+'\n')

In [None]:
# Highest tf-idf value each trigram reaches in any document, as a flat array
# with one entry per feature column.
max_tfidf = tfidf_bow_trigram.max(axis=0).toarray().ravel()
# Column indices ordered from the smallest maximum to the largest.
sort_by_tfidf = max_tfidf.argsort()
# These are column indices, not terms; later cells map them back to features.
logger.info(f'Feature indices with lowest tfidf:\n{sort_by_tfidf[:5]}')
logger.info(f'Feature indices with highest tfidf:\n{sort_by_tfidf[-5:]}')

In [None]:
# Turn the tf-idf trigram vocabulary (term -> column index) into a two-column frame.
tfidf_tri_voc_dict = dict(tfidf_trigram.vocabulary_)
tfidf_tri_voc_df = pd.DataFrame.from_dict(tfidf_tri_voc_dict, orient='index').reset_index()
tfidf_tri_voc_df.columns=('feature','feature_index')
# Sample the vocabulary: highest column index first, every 10th row, 20 rows shown.
tfidf_tri_voc_df.sort_values(by='feature_index', ascending=False)[::10].head(20)

In [None]:
# features with the lowest 5 tfidf
# Select the vocabulary rows whose column index is among the first five entries
# of sort_by_tfidf; isin() keeps the frame's own row order, not the ranking order.
low_5_tfidf = sort_by_tfidf[:5]
low_tfidf = tfidf_tri_voc_df[tfidf_tri_voc_df.feature_index.isin(low_5_tfidf)]
low_tfidf

In [None]:
# features with the highest 5 tfidf
# Select the vocabulary rows whose column index is among the last five entries
# of sort_by_tfidf; isin() keeps the frame's own row order, not the ranking order.
high_5_tfidf = sort_by_tfidf[-5:]
high_tfidf = tfidf_tri_voc_df[tfidf_tri_voc_df.feature_index.isin(high_5_tfidf)]
high_tfidf

[sklearn TfidfVectorizer](https://scikit-learn.org/0.15/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer)<br>

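Raw occurrence counts favor longer documents: a long document will have higher average count values than a short one, even when both discuss the same topics. To avoid these potential discrepancies it suffices to divide the number of occurrences of each word in a document by the total number of words in the document: these new features are called tf for Term Frequencies.<br>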

Another refinement on top of tf is to downscale weights for words that occur in many documents in the corpus and are therefore less informative than those that occur only in a smaller portion of the corpus.<br>

This downscaling is called tf–idf for “Term Frequency times Inverse Document Frequency”.<br>

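Both tf and tf–idf can be computed with scikit-learn's TfidfTransformer (applied to a count matrix) or, as in this notebook, directly from the raw text with TfidfVectorizer.<br>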

**Weight of tokens per document**<br>
The more times a token appears in a document, the more weight it has. However, the more documents a token appears in, the more it is 'penalized' and the more its weight is diminished. For example, the weight for the token 'not' is 4 because it appears in every document; if it had appeared in only one document, its weight would have been 8.3.

**TF-IDF - Maximum token value throughout the dataset**

In [None]:
def classify_vader_score_threshold(compound_score):
    """Map a compound polarity score to a sentiment label.

    Returns ``'positive'`` for scores of 0.05 and above, ``'negative'`` for
    scores of -0.05 and below, and ``'neutral'`` for anything strictly in
    between. A score that satisfies none of these comparisons (for example
    NaN) is logged as a warning and yields an empty string.
    """
    if compound_score >= 0.05:
        return 'positive'
    if compound_score <= -0.05:
        return 'negative'
    if -0.05 < compound_score < 0.05:
        return 'neutral'

    # only reached when every comparison above is false
    logger.warning(f'classify_vader_score_threshold: compound score not in range: {compound_score}')
    return ''

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Score every trigram in the tf-idf vocabulary and record the result both in a
# text report (one line per trigram) and in the two parallel lists below.
analyzer = SentimentIntensityAnalyzer()
pol_scores = []          # polarity-score dict per trigram
trigram_sentences = []   # the trigram text, in the same order as pol_scores

with open(f'{outputDir}/vader_trigram_sentiment.txt','w+') as f:
    for sentence in tfidf_trigram.vocabulary_:
        vs = analyzer.polarity_scores(sentence)
        label = classify_vader_score_threshold(vs['compound'])
        trigram_sentences.append(sentence)
        pol_scores.append(vs)

        # trigram left-aligned and padded with '-' to 65 characters,
        # followed by the score dict and the label
        report_line = "{0:-<65} {1} {2}".format(sentence, str(vs), label)+'\n'
        f.write(report_line)
        print(report_line)

In [None]:
# Evaluate the polarity scoring: one row per scored trigram.
pol_scores_df = pd.DataFrame(pol_scores)
logger.info(pol_scores_df.describe())

# Label each row 'positive', 'negative' or 'neutral' from its compound score
# (see classify_vader_score_threshold) and attach the trigram text.
sentiment_classes = list(map(classify_vader_score_threshold, pol_scores_df['compound']))
pol_scores_df['sentiment_label'] = sentiment_classes
pol_scores_df['ngram'] = trigram_sentences
pol_scores_df.head()

In [None]:
# evaluate the polarity scoring
# The frame is rebuilt here so the cell can run on its own after the scoring cell.
pol_scores_df = pd.DataFrame(pol_scores)
logger.info(pol_scores_df.describe())

# classify each sentence as 'positive', 'negative' or 'neutral' - see function above
sentiment_classes = [classify_vader_score_threshold(c) for c in pol_scores_df['compound'] ]
pol_scores_df['sentiment_label'] = sentiment_classes
# keep the trigram text column so rebuilding the frame does not drop it
pol_scores_df['ngram'] = trigram_sentences

# negative vs. positive score of every trigram, colored by its label
sns.scatterplot(x='neg',y='pos', hue='sentiment_label', data=pol_scores_df);

In [None]:
# Count the trigrams per sentiment label once and reuse the counts below.
sentiment_labels = pol_scores_df["sentiment_label"]
neg_count = int((sentiment_labels == "negative").sum())
pos_count = int((sentiment_labels == "positive").sum())
neu_count = int((sentiment_labels == "neutral").sum())

logger.info(f'Negative Count: {neg_count}')
logger.info(f'Positive Count: {pos_count}')
logger.info(f'Neutral Count: {neu_count}')

# Negative-to-positive ratio and its inverse; print nan instead of raising
# ZeroDivisionError when the denominator class is empty.
print(neg_count/pos_count if pos_count else float('nan'))
print(pos_count/neg_count if neg_count else float('nan'))

In [None]:
# Bar chart of how many rows carry each sentiment label; the trailing semicolon
# suppresses the cell's text output.
sns.countplot(x='sentiment_label', data=pol_scores_df);

In [None]:
# Read the sentiment-labeled trigram file back in as a long-format frame with
# one row per (team, trigram) pair. Each input line looks like
#   "<filename> <w1_w2_w3>|<weight>|<label> <w1_w2_w3>|<weight>|<label> ..."
team_names = []
terms = []
tfidf_weights = []
term_labels = []

with open(f'{outputDir}/tfidf_labeled_trigram_feature_vector_tf.txt','r') as f:
    for team_line in f:
        team_line = team_line.rstrip('\n')
        if not team_line:
            # tolerate blank lines instead of failing on the team lookup
            continue

        tokens = team_line.split(' ')
        # the team key is the first two '_'-separated parts of the file name
        # (or the whole name when it has no underscore)
        teams = tokens[0].split('_')
        team = '_'.join(teams[:2])

        for token in tokens[1:]:
            if not token:
                # empty field produced by consecutive spaces
                continue
            tok = token.split('|')
            team_names.append(team)
            # undo join_words(): underscores back to spaces
            terms.append(tok[0].replace('_', ' '))
            # store the weight as a number so it can be sorted and aggregated
            tfidf_weights.append(float(tok[1]))
            term_labels.append(tok[2])

        logger.info(team)

tri_labeled_df = pd.DataFrame({
    'team': team_names,
    'term': terms,
    'tfidf_weight': tfidf_weights,
    'term_label': term_labels,
})

tri_labeled_df.head(20)

In [None]:
def score_label(l):
    """Convert a sentiment label to a numeric score.

    ``'positive'`` gives 1, ``'negative'`` gives -1 and ``'neutral'`` gives 0.
    Any other value gives ``None``.
    """
    label_values = (('positive', 1), ('negative', -1), ('neutral', 0))
    for name, value in label_values:
        if l == name:
            return value
    return None

In [None]:
# Add a numeric score column (1 / -1 / 0, see score_label) next to the text label.
labels = tri_labeled_df['term_label']
label_scores = list(map(score_label, labels))
tri_labeled_df['label_scores'] = label_scores

In [None]:
# Show the first rows, now including the label_scores column.
tri_labeled_df.head()

In [None]:
teams_sentiment_rank = {}
# Total sentiment score per team. The score column is selected before the
# aggregation so that only it is summed; summing the whole frame would also
# aggregate the text columns.
team_score = tri_labeled_df.groupby('team')['label_scores'].sum().reset_index()
team_score.head()

In [None]:
#team_score.plot.bar(x='team',y='label_scores',orient='h')

# Initialize the matplotlib figure (tall and narrow: one horizontal bar per team)
sns.set(style="whitegrid")
f, ax = plt.subplots(figsize=(6, 15))

# Plot each team's summed sentiment score, sorted from highest to lowest
sns.set_color_codes("pastel")
sns.barplot(x="label_scores", y="team", data=team_score.sort_values(by='label_scores', ascending=False),
            label="Total", color="b")

# Add a legend and informative axis label
ax.legend(ncol=2, loc="lower right", frameon=True)
ax.set(ylabel="NFL Teams",
       xlabel="Sentiment Score Total",
      title="NFL Team Media Sentiment Classification Score Ranking")
# remove the left and bottom axis spines
sns.despine(left=True, bottom=True)
