In [438]:
import argparse
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import ParameterGrid
import csv
import logging

In [439]:
runParams={'tfidf_maxdf':      [0.5],
           'input_file':       ['./data/articles_combined_formattedv2.csv'],
           'story_threshold':  [0.26],
           'process_date':     ['2016-09-01'],
           'parts_of_speech':  [['PROPER', 'VERB']],
           'lemma_conversion': [False],
           'ngram_max':        [3],
           'tfidf_binary':     [False],
           'tfidf_norm':       ['l2'],
           'nlp_library':      ['nltk'],
           'max_length':       [50],
           'stop_words_file':  ['./data/stopWords.txt'],
           'tfidf_mindf':      [2],
           'display_graph':    [True],
           'article_stats':    [False]}

# Use parameter grid even if there is only set of parameters
parameterGrid=ParameterGrid(runParams)

In [440]:
# Load and initialise required NLP libraries
pos_nlp_mapping={}
nl=None
wordnet_lemmatizer=None
nlp=None
if 'spaCy' in runParams['nlp_library']:
    import spacy
    nlp=spacy.load('en')
    pos_nlp_mapping['spaCy']={'VERB':['VERB'],'PROPER':['PROPN'],'COMMON':['NOUN']}
    
if 'nltk' in runParams['nlp_library']:
    import nltk as nl
    if True in runParams['lemma_conversion']:
        from nltk.stem import WordNetLemmatizer
    else:
        wordnet_lemmatizer=None
    pos_nlp_mapping['nltk']={'VERB': ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],'PROPER':['NNP','NNPS'],'COMMON':['NN','NNS']}

In [441]:
def getInputDataAndDisplayStats(filename,processDate,printSummary=False):
    df = pd.read_csv(filename)
    logging.warning("Length of df before reading: " + str(df.shape[0]))
    df = df.drop_duplicates('content')
    df = df[~df['content'].isnull()]
    df=df[df['content'].str.len()>=200]

    logging.warning("Length of df after >=200 strlen: " + str(df.shape[0]))
    
    
    targetString="(Want to get this briefing by email?"
    df['NYT summary']=df['content'].map(lambda d: d[:len(targetString)]==targetString)
    df=df[df['NYT summary']==False]

    # The following removes a warning that appears in many of the Atlantic articles.
    # Since it is commonly at the beginning, it brings a lot of noise to the search for similar articles
    # And subsequently to the assessment of sentiment
    targetString="For us to continue writing great stories, we need to display ads.             Please select the extension that is blocking ads.     Please follow the steps below"
    df['content']=df['content'].str.replace(targetString,'')

    # This is also for some Atlantic articles for the same reasons as above
    targetString="This article is part of a feature we also send out via email as The Atlantic Daily, a newsletter with stories, ideas, and images from The Atlantic, written specially for subscribers. To sign up, please enter your email address in the field provided here."
    df=df[df['content'].str.contains(targetString)==False]

    # This is also for some Atlantic articles for the same reasons as above
    targetString="This article is part of a feature we also send out via email as Politics  Policy Daily, a daily roundup of events and ideas in American politics written specially for newsletter subscribers. To sign up, please enter your email address in the field provided here."
    df=df[df['content'].str.contains(targetString)==False]

    # More Atlantic-specific removals (for daily summaries with multiple stories contained)
    df=df[df['content'].str.contains("To sign up, please enter your email address in the field")==False]

    # Remove daily CNN summary
    targetString="CNN Student News"
    df=df[df['content'].str.contains(targetString)==False]
    
    # Remove "Yahoo ist Teil von Verizon Media"
    targetString = "Yahoo ist Teil von  Verizon Media"
    df=df[df['content'].str.contains(targetString)==False]
    
    logging.warning("Length after other content filtering: " + str(df.shape[0]))
    
    print("\nArticle counts by publisher:")
    print(df['publication'].value_counts())

    print("\nArticle counts by date:")
    print(df['date'].value_counts())

#     Restrict to articles on the provided input date.
#     This date is considered mandatory for topic clustering but is not required for sentiment
#     since sentiment only processes a specified list of articles.
#     For topic clustering it is essential to have the date as it is
#     enormously significant in article matching.

    if processDate!=None:
        df=df[df['date']==processDate]
    df.reset_index(inplace=True, drop=True)

    # Remove non-ASCII characters
    df.reset_index(inplace=True, drop=True)
    df['content no nonascii']=df['content'].map(lambda x: removeNonASCIICharacters(x))

    print("\nFinal dataset:\n\nDate:",processDate,"\n")
    print(df['publication'].value_counts())
    
    print("\nFinal Dataset Article Count:")
    print(df['date'].value_counts())
    df.to_csv(r'C:\Users\goldm\Capstone\tracking files\getInputDataAndDisplayStats.csv')
    
    return df

##########################################################################################

def removeNonASCIICharacters(textString): 
    return "".join(i for i in textString if ord(i)<128)

In [442]:
articleDataFrame=getInputDataAndDisplayStats(runParams['input_file'][0],
                                             runParams['process_date'][0],
                                             runParams['article_stats'][0])




Article counts by publisher:
Breitbart             104
NY Post                61
Reuters                59
CNN                    58
NPR                    54
                     ... 
The Atlantic            1
The Times of India      1
Forbes                  1
PEOPLE.com              1
The Economist           1
Name: publication, Length: 63, dtype: int64

Article counts by date:
9/1/2016                                                                                                                                                                      402
12/2/2016                                                                                                                                                                     362
 a tale of child abuse as long and as involved as what Gypsy experienced might have inspired public sympathy. But something about the fraud element deeply offended people      1
Name: date, dtype: int64

Final dataset:

Date: 2016-09-01 

Series([], Name: pub

In [443]:
getInputDataAndDisplayStats(runParams['input_file'][0],
                            runParams['process_date'][0],
                            printSummary=True)




Article counts by publisher:
Breitbart             104
NY Post                61
Reuters                59
CNN                    58
NPR                    54
                     ... 
The Atlantic            1
The Times of India      1
Forbes                  1
PEOPLE.com              1
The Economist           1
Name: publication, Length: 63, dtype: int64

Article counts by date:
9/1/2016                                                                                                                                                                      402
12/2/2016                                                                                                                                                                     362
 a tale of child abuse as long and as involved as what Gypsy experienced might have inspired public sympathy. But something about the fraud element deeply offended people      1
Name: date, dtype: int64

Final dataset:

Date: 2016-09-01 

Series([], Name: pub

Unnamed: 0.1,Unnamed: 0,id,publication,date,content,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 354,Unnamed: 355,Unnamed: 356,Unnamed: 357,Unnamed: 358,Unnamed: 359,Unnamed: 360,Unnamed: 361,NYT summary,content no nonascii


In [444]:
## Analyzing if any of the articles were "kicked out"
def listExcludedArticles(articleDataFrame, storyMap):
    excluded_list = []
    for story, storyArticles in storyMap.items():
        for article in storyArticles:
            if article in articleDataFrame['id']:
                pass
            else:
                excluded_list.append(article)
    return excluded_list
#                 print(str(article) + " was excluded from the dataset")
# print(articleDataFrame.shape[0])


In [360]:
def loadStopWords(stopWordsFileName):
    stop_words=[]
    f=open(stopWordsFileName, 'r')
    for l in f.readlines():
        stop_words.append(l.replace('\n', ''))
    return stop_words

stop_words=loadStopWords(runParams['stop_words_file'][0])

In [361]:
def stringNLTKProcess(nl,stringToConvert,partsOfSpeech,stop_words,maxWords=None,lemmatizer=None):
    sentences=nl.sent_tokenize(stringToConvert)
    str=[]
    for sentence in sentences:
        wordString=[]
        for word,pos in nl.pos_tag(nl.word_tokenize(sentence)):
            # The following condition avoids any POS which corresponds to punctuation (and takes all others)
            if partsOfSpeech==None:
                if pos[0]>='A' and pos[0]<='Z':
                    wordString.append(word)
            elif pos in partsOfSpeech:
                wordString.append(word)
        for wrd in wordString:
            wrdlower=wrd.lower()
            if wrdlower not in stop_words and wrdlower!="'s":
                if maxWords==None or len(str)<maxWords:
                    if lemmatizer==None:
                        str.append(wrdlower)
                    else:
                        str.append(lemmatizer.lemmatize(wrd.lower(), pos='v'))
            if maxWords!=None and len(str)==maxWords:
                return ' '.join(str)
    return ' '.join(str)

def removeSpacesAndPunctuation(textString):
    return "".join(i for i in textString if (ord(i)>=48 and ord(i)<=57) or (ord(i)>=97 and ord(i)<=122))

In [362]:
def setupStoryMapAndReportList(args=None, reportArticleList=None,storyMapFileName=None):
    # Story Map is used in fitting if grid search is applied (As ground truth)
    # It is also used in graph if no threshold provided (to determine colours, not to determine location)
    # Report Article List is used at the end to create a report with, for each
    # article in the list, the set of articles within tolerance, and the key words for each
    if args==None:
        articleList=reportArticleList
        fileName=storyMapFileName
    else:
        articleList=args['article_id_list']
        fileName=args['story_map_validation']
    
    reportArticleList=articleList
    if fileName!=None:
        storyMap=readStoryMapFromFile(fileName)
        if reportArticleList==None:
            reportArticleList=[]
            for story, articleList in storyMap.items():
                reportArticleList.append(articleList[0])
    else:
        storyMap=None
    return storyMap,reportArticleList

def readStoryMapFromFile(filename):
    return readDictFromCsvFile(filename,'StoryMap')

def readGridParameterRangeFromFile(filename):
    return readDictFromCsvFile(filename, 'GridParameters')

def readDictFromCsvFile(filename,schema):
    gridParamDict={} 
    with open(filename,'r') as f:
        for row in f:
            row=row[:-1] # Exclude the carriage return
            row=row.split(',')
            key=row[0]
            vals=row[1:]
            
            if schema=='GridParameters':
                if key in ['story_threshold','tfidf_maxdf']:
                    finalVals=list(float(n) for n in vals)
                elif key in ['ngram_max','tfidf_mindf','max_length']:
                    finalVals=list(int(n) for n in vals)
                elif key in ['lemma_conversion','tfidf_binary']:
                    finalVals = list(str2bool(n) for n in vals)
                elif key in ['parts_of_speech']:
                    listlist=[]
                    for v in vals:
                        listlist.append(v_split('+'))
                    finalVals = listlist
                elif key in ['tfidf_norm','nlp_library']:
                    finalVals=vals
                else:
                    print(key)
                    print("KEY ERROR")
                    return
            elif schema == 'StoryMap':
                finalVals = list(int(n) for n in vals if n!='')
            else:
                print(schema)
                print('SCHEMA ERROR')
                return
            
            gridParamDict[key]=finalVals
    return gridParamDict

In [363]:
storyMap,reportArticleList=setupStoryMapAndReportList(storyMapFileName='storyMapForValidation_expanded.csv')

In [364]:
for story, articleList in storyMap.items():
    print(story,":",articleList)

Trump meeting : [151832, 110126, 172078, 48306, 57365, 190512, 26536, 71335, 21499, 23872, 142033, 110133, 23888, 71336, 57366, 71339]
Brazil impeachment : [120639, 80103, 25225, 21502, 57362, 120636, 110141]
Kaepernick : [40617, 40543, 39520, 80109, 80101, 47403]
Clinton Guccifer : [214888, 85803, 47979]
Farage : [37252, 37468, 46175]
Anthony Weiner : [49480, 110144, 142300, 214934]
SpaceX : [38658, 134545, 172095, 214894]
Safe space : [21448, 78169, 78171]
Lauer debate : [43447, 47078, 138709]
Venezuela : [172079, 57375, 190522]
Iran deal : [158005, 48823, 57373, 120634]
Penn State : [80094, 157527, 214892]
David Brown : [172085, 80096, 141886]
haiti AND president : [217418, 217419, 217420, 217421, 217422, 217423, 217424, 217425, 217426, 217427, 217428, 217429, 217430, 217431, 217432, 217433, 217434, 217435, 217436, 217437, 217438, 217439, 217440, 217441]
inflation AND economy AND fed : [217490, 217491, 217492, 217493, 217494, 217495, 217496, 217497, 217498, 217499, 217500, 217501, 2

In [365]:
## print out whether any of the labeled articles have been excluded
excluded_list = listExcludedArticles(articleDataFrame,storyMap)
for story, articleList in storyMap.items():
    print(story,":",[article for article in articleList if article in list(articleDataFrame['id'])])
print(172078 in list(articleDataFrame['id']))

Trump meeting : [151832, 110126, 172078, 48306, 57365, 190512, 26536, 71335, 21499, 23872, 142033, 110133, 23888, 71336, 57366, 71339]
Brazil impeachment : [120639, 80103, 25225, 21502, 57362, 120636, 110141]
Kaepernick : [40617, 40543, 39520, 80109, 80101, 47403]
Clinton Guccifer : [214888, 85803, 47979]
Farage : [37252, 37468, 46175]
Anthony Weiner : [49480, 110144, 142300, 214934]
SpaceX : [38658, 134545, 172095, 214894]
Safe space : [21448, 78169, 78171]
Lauer debate : [43447, 47078, 138709]
Venezuela : [172079, 57375, 190522]
Iran deal : [158005, 48823, 57373, 120634]
Penn State : [80094, 157527, 214892]
David Brown : [172085, 80096, 141886]
haiti AND president : [217418, 217419, 217420, 217421, 217422, 217424, 217425, 217426, 217427, 217428, 217429, 217430, 217431, 217433, 217434, 217435, 217436, 217437, 217438, 217439, 217440, 217441]
inflation AND economy AND fed : [217490, 217492, 217493, 217494, 217495, 217496, 217497, 217498, 217500, 217501, 217502, 217504, 217505, 217506, 2

In [366]:
def preprocessAndVectorize(articleDataFrame,args,pos_nlp_mapping,nlp,nl,wordnet_lemmatizer,stop_words):
    # Map the input parts of speech list to the coding required for the specific NLP library
    if args['parts_of_speech'][0]!='ALL':
        partsOfSpeech=[]
        for pos in args['parts_of_speech']:
            partsOfSpeech.append(pos_nlp_mapping[args['nlp_library']][pos])
        partsOfSpeech=[item for sublist in partsOfSpeech for item in sublist]
    else:
        partsOfSpeech=None
    
    # Processing of text depends on NLP library choice
    if args['nlp_library']=='spaCy':
        articleDataFrame['input to vectorizer']=articleDataFrame['content no nonascii'].map(lambda x: stringSpaCyProcess(nlp,
                                                                                                                         x,
                                                                                                                         partsOfSpeech=partsOfSpeech,
                                                                                                                         maxWords=args['max_length'],
                                                                                                                         stop_words=stop_words,
                                                                                                                         lemmatize=args['lemma_conversion']))
    elif args['nlp_library']=='nltk':
        articleDataFrame['input to vectorizer']=articleDataFrame['content no nonascii'].map(lambda x: stringNLTKProcess(nl,
                                                                                                                        x,
                                                                                                                        partsOfSpeech=partsOfSpeech,
                                                                                                                        stop_words=stop_words,
                                                                                                                        maxWords=args['max_length'],
                                                                                                                        lemmatizer=wordnet_lemmatizer))
    else:
        print("PROBLEM... NO VALID NLP LIBRARY... MUST BE nltk OR spaCy")

    # To get default values a couple of parameters need to be not passed if not specified on the command line
    # Passing as None behaves differently to passing no parameter (which would invoke the default value)
    optArgsForVectorizer={}
    if args['tfidf_maxdf'] != None:
        optArgsForVectorizer['max_df']=args['tfidf_maxdf']
    if args['tfidf_mindf'] != None:
        optArgsForVectorizer['min_df']=args['tfidf_mindf']
    # Create and run the vectorize
    vectorizer=TfidfVectorizer(analyzer='word',
                               ngram_range=(1,args['ngram_max']),
                              lowercase=True,
                              binary=args['tfidf_binary'],
                              norm=args['tfidf_norm'],
                              **optArgsForVectorizer)
    tfidfVectors=vectorizer.fit_transform(articleDataFrame['input to vectorizer'])
    terms=vectorizer.get_feature_names()
    logging.warning('tfidfVector shape: '+ str(tfidfVectors.shape)) 
    
    return tfidfVectors, terms

In [404]:

def scoreCurrentParamGuess(tfidfVectors,storyMap,articleDataFrame,threshold,printErrors=False):
    # Work with distances relative to first item in each cluster - even though this is clearly arbitrary since that
    # point could be an outlier in the cluster and hence might cause problems.
    # But I have to start somewhere - and can refine it later if needed.

    nonZeroCoords=initialiseAllNonZeroCoords(tfidfVectors)
    score=0
    outGood=0
    outBad=0
    inGood=0
    inBad=0
     
    #### ---- Richard Modification- adding in code to print out a df of articles in the story map and their respective category       
    final_mapping_list = []
    
    for story, storyArticles in storyMap.items():
        leadArticleIndex=articleDataFrame[articleDataFrame['id']==storyArticles[0]].index[0]
        comparisonArticle = articleDataFrame[articleDataFrame['id']==storyArticles[0]]['id']
        # Compute score of all articles in corpus relative to first article in story (.product)
        # Then count through list relative to threshold (add one for a good result, subtract one for a bad result)
        scores=productRelatednessScores(tfidfVectors,nonZeroCoords,leadArticleIndex)
        rankedIndices=np.argsort(scores)
        foundRelatedArticles=[]
        # THE SORTING HERE IS NOT STRICTLY REQUIRED, BUT I COULD USE IT SO THAT ONCE THE THRESHOLD IS PASSED
        # IN THE LOOP, THEN I INFER THE REMAINING RESULTS
        for article in reversed(rankedIndices):
            thisArticleIndex=articleDataFrame['id'][article]
            if thisArticleIndex in storyArticles:
                if scores[article]>=threshold: # article IS supposed to be in range
                    score+=1
                    inGood+=1
                    #appending the article and its mapping according to the predictions of our model
                    final_mapping_list.append([comparisonArticle, thisArticleIndex, story, story, 'TP', scores[article]])
                else:
                    score-=1
                    inBad+=1
                    final_mapping_list.append([comparisonArticle, thisArticleIndex, 'No Mapping', story, 'FN', scores[article]])
                    if printErrors:
                        print("ERROR:",thisArticleIndex,"should be in",story)
            else: # article not supposed to be in range
                if scores[article]<=threshold:
                    score+=1
                    outGood+=1
                    final_mapping_list.append([comparisonArticle, thisArticleIndex, 'No Mapping', 'No Mapping', 'TN', scores[article]])
                else:
                    score-=1
                    outBad+=1
                    final_mapping_list.append([comparisonArticle, thisArticleIndex, story, 'No Mapping', 'FP', scores[article]])
                    if printErrors:
                        print("ERROR:",thisArticleIndex,"should NOT be in",story)
    
    #### ---- Richard Modification- adding in code to print out a df of articles in the story map and their respective categor
    final_mapping_df = pd.DataFrame(final_mapping_list, columns = ['root_article', 'article_compared', 'predicted_mapping', 'true_mapping"','FP/FN/TP/TN', "score"])
    final_mapping_df = final_mapping_df[final_mapping_df.score != 0]
    final_mapping_df.to_csv("PredictedMappingsv4-storythreshold={}.csv".format(threshold))
    
    scoreDict={'score':score,'inGood':inGood,'inBad':inBad,'outGood':outGood,'outBad':outBad}
    return scoreDict

##########################################################################################

def initialiseAllNonZeroCoords(tfidfVectors):
# This function just exists since it seems to be expensive and I'd rather not call it multiple times
# Hence it is intended to be called outside of loops in order to simplify the row specific processing
    values=[]
    nzc=zip(*tfidfVectors.nonzero())

    # In Python 3 the zip can only be iterated through one time before it is automatically released
    # So need to copy the results otherwise the main loop below will no longer work
    pointList=[]
    for i,j in nzc:
        pointList.append([i,j])		

    for row in range(tfidfVectors.shape[0]):
        rowList=[]
        for i,j in pointList:
            if row==i:
                rowList.append(j)
        values.append(rowList)

    return values

In [368]:
def productRelatednessScores(tfidfVectors,nonZeroCoords,refRow):
    # instantiates a matrix of zeros with tfidVectors.shape[0] rows corresponding to the number of rows in the tfidVectors array
    scores = [0]*tfidfVectors.shape[0]
    for toRow in range(tfidfVectors.shape[0]):
        scores[toRow] = sum([(tfidfVectors[toRow,w]*tfidfVectors[refRow,w]) for w in nonZeroCoords[refRow] if w in nonZeroCoords[toRow]])
    return scores

In [369]:
import nltk; nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\goldm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [405]:
# Loop across all parameter combinations in grid to determine best set
# If not doing grid search, will just pass through the loop once
bestParamScoreDict={'score':-1000000}
bestParams=parameterGrid[0]
for i,currentParams in enumerate(parameterGrid):
    if len(parameterGrid)>1:
        print("Combination:",i+1,"of",len(parameterGrid))
        print(currentParams)
        
        # Determine tf-idf vectors
        # terms is just used later on if analysis of final results is requested
    tfidfVectors,terms=preprocessAndVectorize(articleDataFrame,
                                              currentParams,
                                              pos_nlp_mapping,
                                              nlp,
                                              nl,
                                              wordnet_lemmatizer,
                                              stop_words)

    # Compute scores if threshold provided (meaning as part of grid search)
    if 'story_threshold' in currentParams and currentParams['story_threshold']!=None:
        scoreDict = scoreCurrentParamGuess(tfidfVectors,storyMap,articleDataFrame,currentParams['story_threshold'])
        print(scoreDict)

        # Update best so far
        if scoreDict['score']>=bestParamScoreDict['score']:
            if len(parameterGrid)>1:
                print(i+1,"is the best so far!")
            bestParams=currentParams
            bestParamScoreDict=scoreDict
    # End grid/parameter loop



{'score': 7064, 'inGood': 80, 'inBad': 82, 'outGood': 7070, 'outBad': 4}


In [210]:
# Set threshold to input value from best (and possibly only) run for use in results analysis
# Unless not specified at all
if 'story_threshold' in bestParams and bestParams['story_threshold']!=None:
    threshold=bestParams['story_threshold']
else:
    threshold=None


# If there was a real parameter grid, then output/refresh results
if len(parameterGrid)>=1:
    print("BEST PARAMETERS:")
    print(bestParams)
    print(bestParamScoreDict)
    scoreCurrentParamGuess(tfidfVectors,storyMap,articleDataFrame,threshold,printErrors=True)
    # Recreate vector for best results in loop
    # terms is just used later on if analysis of final results is requested
    tfidfVectors,terms=preprocessAndVectorize(articleDataFrame,
                                            bestParams,
                                            pos_nlp_mapping,
                                            nlp,
                                            nl,
                                            wordnet_lemmatizer,
                                            stop_words)

BEST PARAMETERS:
{'article_stats': False, 'display_graph': True, 'input_file': './data/articles_combined_formattedv1.csv', 'lemma_conversion': False, 'max_length': 50, 'ngram_max': 3, 'nlp_library': 'nltk', 'parts_of_speech': ['PROPER', 'VERB'], 'process_date': '2016-09-01', 'stop_words_file': './data/stopWords.txt', 'story_threshold': 0.2, 'tfidf_binary': False, 'tfidf_maxdf': 0.5, 'tfidf_mindf': 2, 'tfidf_norm': 'l2'}
{'score': 7356, 'inGood': 86, 'inBad': 95, 'outGood': 7381, 'outBad': 16}
ERROR: 214876 should NOT be in Trump meeting
ERROR: 80098 should NOT be in Trump meeting
ERROR: 78170 should NOT be in Trump meeting
ERROR: 44928 should NOT be in Trump meeting
ERROR: 71339 should be in Trump meeting
ERROR: 85801 should NOT be in Clinton Guccifer
ERROR: 44642 should NOT be in Safe space
ERROR: 85799 should NOT be in Lauer debate
ERROR: 39232 should NOT be in David Brown
ERROR: 71350 should NOT be in David Brown
ERROR: 217425 should be in haiti AND president
ERROR: 217437 should be



### lets generate a confusion matrix to evaluate our results

In [399]:
true_labels= pd.read_csv(r"C:\Users\goldm\Capstone\full_article_classifications_v1.csv")

In [400]:
true_labels.columns = ['id','true-label']
true_labels.head()

Unnamed: 0,id,true-label
0,151832,Trump meeting
1,110126,Trump meeting
2,172078,Trump meeting
3,48306,Trump meeting
4,57365,Trump meeting


In [402]:
len(true_labels)

174

In [406]:
predicted_mappings_df = pd.read_csv(r"C:\Users\goldm\Capstone\PredictedMappingsv4-storythreshold=0.26.csv")

In [407]:
predicted_mappings_df.head()

Unnamed: 0.1,Unnamed: 0,root_article,article_compared,predicted_mapping,"true_mapping""",FP/FN/TP/TN,score
0,0,"227 151832\nName: id, dtype: int64",151832,Trump meeting,Trump meeting,TP,1.0
1,1,"227 151832\nName: id, dtype: int64",48306,Trump meeting,Trump meeting,TP,0.421409
2,2,"227 151832\nName: id, dtype: int64",172078,Trump meeting,Trump meeting,TP,0.400392
3,3,"227 151832\nName: id, dtype: int64",110126,Trump meeting,Trump meeting,TP,0.374014
4,4,"227 151832\nName: id, dtype: int64",26536,Trump meeting,Trump meeting,TP,0.346913


In [431]:
predicted_mappings_df['predicted_mapping'].unique()

array(['Trump meeting', 'No Mapping', 'Brazil impeachment', 'Kaepernick',
       'Clinton Guccifer', 'Farage', 'Anthony Weiner', 'SpaceX',
       'Safe space', 'Lauer debate', 'Venezuela', 'Iran deal',
       'Penn State', 'David Brown', 'haiti AND president',
       'inflation AND economy AND fed', 'afghanistan AND war AND end',
       'capitol AND riot AND trump', 'critical AND race AND theory'],
      dtype=object)

In [409]:
label_set = predicted_mappings_df[['predicted_mapping', 'true_mapping"']]

In [428]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

y_test = label_set['true_mapping"']
y_pred = label_set['predicted_mapping']

def get_metrics(y_test, y_predicted):  
    # true positives / (true positives+false positives)
    precision = precision_score(y_test, y_predicted, pos_label=None,
                                    average='micro')             
    # true positives / (true positives + false negatives)
    recall = recall_score(y_test, y_predicted, pos_label=None,
                              average='micro')
    
    # harmonic mean of precision and recall
    f1 = f1_score(y_test, y_predicted, pos_label=None, average='micro')
    
    # true positives + true negatives/ total
    accuracy = accuracy_score(y_test, y_predicted)
    return accuracy, precision, recall, f1

accuracy, precision, recall, f1 = get_metrics(y_test, y_pred)
print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))

accuracy = 0.979, precision = 0.979, recall = 0.979, f1 = 0.979


In [427]:
predicted_mappings_df['FP/FN/TP/TN'].value_counts()

TN    3760
TP      80
FN      78
FP       4
Name: FP/FN/TP/TN, dtype: int64