In [1]:
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 28 14:04:56 2021
@author: Saptarshi mukhopadhaya / Salim Hafid
"""
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
import senti_bignomics
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
import numpy as np
import matplotlib.pyplot as plt
from textblob import TextBlob
import pysentiment2 as ps
from nltk import sent_tokenize
#from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix, make_scorer, accuracy_score, f1_score, precision_score, recall_score
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
#warnings.suppress(label_encoder_deprecation_msg, UserWarning)
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import spacy
import re
import math
import operator
import constants
from sklearn.model_selection import cross_validate, RandomizedSearchCV


# Graph functions

In [2]:
def plot_graph(y_test,y_pred_df):
    plt.figure(figsize=(12,5));
    plt.title('Comparzison actual vs predicted')
    plt.plot(y_test)
    y_test.plot(legend=True) 
    plt.plot(y_pred_df)
    plt.show()
    
def correlation_raph(frame):
    frame['change'] = frame['close_days'] -frame['close']
    frame = frame.tail(100)
    
    fig, ax1 = plt.subplots()
    
    color = 'tab:red'
    ax1.set_xlabel('time')
    ax1.set_ylabel('polarity score', color=color)
    ax1.plot(frame.index, frame['rel_pol'], color=color)
    ax1.tick_params(axis='y', labelcolor=color)
    
    ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
    
    color = 'tab:blue'
    ax2.set_ylabel('change in $', color=color)  # we already handled the x-label with ax1
    ax2.plot(frame.index, frame['change'], color=color)
    ax2.tick_params(axis='y', labelcolor=color)
    
    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    plt.show()

# Preprocessing

In [3]:
def gather_window_news():
    pass

def avg_news(df,pop):
    period = len(df)
    total_news = 0
    for i in df['text']:
        if not isinstance(i, float):
            total_news = total_news+len(i)
    buckets = period/pop
    return total_news/buckets
    
def prepare_data(path,column):
    path = path.sort_values(by=['time'])
    path = path.mask(path.eq('None')).dropna()
    path = path.drop_duplicates()
    path['date'] = pd.to_datetime(path['time']).dt.date
    path['time'] = pd.to_datetime(path['time']).dt.time
    path['date'] = pd.to_datetime(path['date'])
    path = path[['time','date',column]]
    return(path)


def load_text_data(path,column):
    df = sort_news_by_time(prepare_data(pd.read_csv(path),column),column)
    df['time'] = pd.to_datetime(df['time'])
    #print(df.dtypes)
    return df

def merge_frame(df,data,col):
    data = data.set_index('time')
    frame = df.join(data,on=col,how = 'left')
    #frame['pol'] = frame['pol'].fillna (0)
    return frame

def load_price_data(path):
    data = pd.read_csv(path)
    data = data[['time','open','close']]
    data = data.sort_values(by=['time'])
    #data = data.where(data['time']>'2019-01-01')
    data['time'] = pd.to_datetime(data['time'])
    #data = data.set_index('time')
    #print(data.dtypes)
    data = data.dropna()
    return data


'''Sort the news by time and date'''
def sort_news_by_time(frame,col):
    company = {}
    new_frame = {}
    words = None
    prev = None
    for i in frame.values:
        #print(i)
        time,key,val=str(i[0]),str(i[1]),i[2]
        #print(i)
        if (key>='2018-12-29' and key<='2020-10-01'):
            if prev == key and time<'21:00:00':
                words.append(val)
            else:
                if prev is not None:
                    company[prev] = words
                words = []
                prev = key
                words.append(val)
        company[prev] = words
    #print(company.keys())
    
    new_frame['time'] = company.keys()
    new_frame['text'] = company.values()
    return pd.DataFrame.from_dict(new_frame).iloc[1:,:]

def cleaning(sentences):
    nlp = spacy.load('en_core_web_sm',disable=['ner','parser'])
    company_stock_names = ['fb', 'facebook','googl','google', 'amzn', 'amazon', 'aapl', 'apple', 'amgn', 'amgen', 'aligntechnology', 'msft', 'microsoft','nflx','netflix','aal','americanairlines']
    day_names = ['monday','tuesday','wednesday','friday','saturday','sunday']
    month_names = ['january','february','march','april','may','june','july','august','september','october','november','december']
    time_period_names = ['today','tomorrow','yesterday','day','week','month','year','daily','weekly','monthly','yearly']
    cleaned_sentences = []
    for sentence in sentences:
        doc = nlp(sentence)
        # Lemmatizing + Removing stopwords + Removing non-alphabetical characters + Removing company stock names
        # + Removing useless frequent words (day names, time period names, company stock names)
        tokens = [token.lemma_ for token in doc if
                  not token.is_stop 
                  and str(token) not in company_stock_names
                  and str(token) not in day_names
                  and str(token) not in month_names
                  and str(token) not in time_period_names
                 ]
        # Remove non-alphabetic characters
        tokens = [re.sub("[^A-Za-z']+", '', str(token)).lower() for token in tokens]
        tokens = [token for token in tokens if len(token)>1]
        # Since Word2Vec uses surrounding words in a sentence, a sentence that contains less than 3 tokens is not useful
        if len(tokens) > 2:
            cleaned_sentences.append(tokens)
        else:
            cleaned_sentences.append(['blank_to_keep_the_index_coherent_with_the_sentiment_feature'])            
    return cleaned_sentences

# Sentiment Analysis

In [4]:
'''Calculate sentiment with weighted approch using vader'''  
def weight_sentiment_scores(news): 
    count = 0
    polarity = 0
    # Create a SentimentIntensityAnalyzer object. 
    sid_obj = SentimentIntensityAnalyzer() 
    weight = len(news)
    # polarity_scores method of SentimentIntensityAnalyzer 
    # oject gives a sentiment dictionary. 
    # which contains pos, neg, neu, and compound scores. 
    for i in range(len(news)):
        
        #print(i)
        if not isinstance(news[i],float):
            weight = weight - i
            sens = sent_tokenize(news[i])
            weight = weight - i
            
            #for j in sens:
                #if comp.lower() in j.lower():
            sentiment_dict = sid_obj.polarity_scores(news[i]) 
            polarity = polarity+(sentiment_dict['compound']*weight)
            count = count+weight
            
    if count:
        return polarity/count
    else:
        return 0.0
    
'''Calculate sentiment with weighted approch using textblob'''    
def weight_sentiment_textblob(news):
    count = 0
    polarity = 0
    weight = len(news)
    for i in news:
        sen = TextBlob(i)
        if sen.sentiment.polarity != 0.0:
            polarity = polarity+(sen.sentiment.polarity*weight)
            count = count+weight
    if count:
        return polarity/count
    else:
        return 0.0
    
'''Calculate sentiment with weighted approch using pysentiment'''   
def weight_sentiment_pysentiment(news):
    lm = ps.LM()
    count = 0
    polarity = 0
    weight = len(news)
    for i in news:
        #print(i)
        tokens = lm.tokenize(i)
        score = lm.get_score(tokens)
        if score['Subjectivity']!= 0.0:
            #print(score['Polarity'])
            polarity = polarity+score['Polarity']*weight
            count = count+weight
    if count:
        return polarity/count
    else:
        return 0.0
    
'''update the vader lexicon with news corpus'''
def update_vader(vader,corpus):
    return vader.lexicon.update(corpus)

sid_obj = SentimentIntensityAnalyzer() 

'''Scale the scores of the words in Sentibignomics'''
for i in senti_bignomics.senti_bignomics:
    if not type(senti_bignomics.senti_bignomics[i]) == float:
        senti_bignomics.senti_bignomics[i] = 4*float(senti_bignomics.senti_bignomics[i][0])
update_vader(sid_obj,senti_bignomics.senti_bignomics)    

'''Calculate the error'''
def get_error(y_test,y_pred):
    return np.sqrt(np.mean(np.square(((y_test - y_pred) / y_test)), axis=0))*100


'''Calculate sentiment using window'''    
def window_sentiment(data,window):
    news_list = []
    news = []
     
    data['news'] = data['pol']
    for i in range(0,window+1):
        #print(data['news'].shift(i).fillna('[]'))
        data['news'] = data['news']+data['pol'].shift(i)
    
    #print(len(news_list))
    return data['news']/(window)  

#def update_vader(vader,corpus):
 #   return vader.lexicon.update(corpus)
def change_sentiment(frame,window):
    change = []
    frame['a']=frame['close'].shift(window)-frame['close']
    #print(frame)
    for i in frame['a']:
        if i >= 0:
            change.append(1)
        else:
            change.append(0)
        '''
        if i > 0:
            change.append(1)
        elif i<0:
            change.append(-1)
        else:
            change.append(0) 
        '''
    frame['change'] = change
    #print(frame)
    return frame

def get_sentiment(frame):
    pol_list = []
    for i in frame['text']:
        if not isinstance(i,float):
            pol_list.append(weight_sentiment_scores(i))
        else:
            pol_list.append(0)
    frame['pol'] = pol_list
    return frame

'''Calculate sentiment using vader without weight'''
def sentiment_scores(news): 
    count = 0
    polarity = 0
    # Create a SentimentIntensityAnalyzer object. 
    
    
    for i in news:
        
        if not isinstance(i,float):
            
            sentiment_dict = sid_obj.polarity_scores(i) 
            polarity = polarity+(sentiment_dict['compound'])
            count = count+1
            
    if count:
        return polarity/count
    else:
        return 0.0

# Event Extraction

In [None]:
# Word count dictionary
word_freq = {}
for sent in cleaned_articles:
    for i in sent:
        if i in word_freq.keys():
            word_freq[i] += 1
        else:
            word_freq[i] = 1
len(word_freq)

In [None]:
word_freq

In [None]:
# Most frequent words
sorted(word_freq, key=word_freq.get, reverse=True)[:100]

In [5]:
# Event-extraction

def create_event_vector(data):
    event_vector = {}
    for idx,article in enumerate(data):
        event_vector[idx] = {}
        for event in constants.FINANCIAL_EVENTS:
            event_vector[idx][event] = 0
        for token in article:
            for event in constants.FINANCIAL_EVENTS:
                if token in constants.FINANCIAL_EVENTS[event]:
                    if event in event_vector[idx]:
                        event_vector[idx][event] += 1
                    else:
                        event_vector[idx][event] = 1
    return event_vector
                

def convert_event_vector(event_vector):
    for idx in event_vector:
        for event in event_vector[idx]:
            if event_vector[idx][event] != 0:
                event_vector[idx][event] = math.log(event_vector[idx][event])
            else:
                event_vector[idx][event] = -999
    return event_vector

def get_predicted_event(event_vector):
    predicted_events = []
    for article in event_vector:
        max_val = max(event_vector[article].items(), key=operator.itemgetter(1))[1]
        if max_val > 0:
            key_with_max_val = max(event_vector[article].items(), key=operator.itemgetter(1))[0]
            predicted_events.append(key_with_max_val)
        else:
            predicted_events.append('none')
    return predicted_events

def get_weighted_scores(predicted_events):
    weighted_scores = []
    for event in predicted_events:
        if event != 'none':
            weighted_scores.append(constants.FINANCIAL_EVENTS_WEIGHTS[event])
        else:
            weighted_scores.append(1)
    return weighted_scores

# Classification

In [7]:
def classification_model(df):
    model = RandomForestClassifier()
    df = df.drop(['text'], axis=1)
    
    labels = df.change
    
    #print(labels)
    df = df.drop(['change'], axis=1)
    df = df[['close','rel_pol']]
    X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(df, labels, df.index, test_size=0.30, random_state=0,shuffle=False)
    print(y_train.value_counts(normalize = False))
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print('Classification report:\n ',classification_report(y_test,y_pred))
    print('confusion_matrix:\n',confusion_matrix(y_test,y_pred))
    #print(pd.crosstab(y_test,y_pred))


'''Get the prediction'''    
def get_prediction(data,no_news):
    data_new = data
    if no_news:
        "Specify input and output variable"
        x_data = data_new.drop(['close_days','rel_pol'], axis=1)
        y_data = data_new['close_days']
    else:
        x_data = data_new.drop(['close_days'], axis=1)
        y_data = data_new['close_days']
    
    x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3,shuffle=False)
    
    lr_model=LinearRegression()
    
    lr_model.fit(x_train,y_train)
    
    y_pred=lr_model.predict(x_test)
    y_pred_df = pd.DataFrame(y_pred, index= y_test.index)
    #print(x_data.columns)
    plot_graph(y_test,y_pred_df)
    print('r2-score:',r2_score(y_test,y_pred))
    print('mean_squared_error:',mean_squared_error(y_test,y_pred))
    return get_error(y_test,y_pred)

def nested_classification_report(y_true,y_pred):
    print('Classification report:\n ',classification_report(y_true,y_pred))
    print('confusion_matrix:\n',confusion_matrix(y_true,y_pred))
    return accuracy_score(y_true,y_pred)

def optimized_classifier(df):
    model = RandomForestClassifier()
    df = df.drop(['text'], axis=1)
    labels = df.change
    df = df.drop(['change'], axis=1)
    #df = df[['close','rel_pol']]
    #df = df[['close']]
    df = df[['rel_pol']]
    
    tuning_grid = {
        'max_features': ["sqrt", "log2"],
        'bootstrap': [True, False]
    }

    # RF Model
    random_forest_tuned = RandomizedSearchCV(model, tuning_grid, n_iter=100, scoring="f1_micro", n_jobs=-1, cv=5, verbose=2, random_state=42)
    
    # Cross-validation version : 
    #scores = cross_validate(best_random_forest, df, labels, cv=5, scoring=["f1_micro"], return_train_score=True)
    #scores = cross_validate(best_random_forest, df, labels, cv=5, scoring=make_scorer(nested_classification_report))
    #print(scores)
    
    # Single-validation version
    x_train, x_test, y_train, y_test = train_test_split(df, labels, test_size=0.3,shuffle=False)
    _ = random_forest_tuned.fit(x_train, y_train)
    best_random_forest = random_forest_tuned.best_estimator_
    
    
    #print('x_test :\n',x_test)
    #print('y_test :\n',y_test)
    #print('y_pred :\n',best_random_forest.predict(x_test))
    
    print('Classification report:\n ',classification_report(y_test,best_random_forest.predict(x_test)))
    
    print('f1-micro :',f1_score(y_test,best_random_forest.predict(x_test),average='micro'))
    print('precision :',precision_score(y_test,best_random_forest.predict(x_test),average='micro'))
    print('recall :',recall_score(y_test,best_random_forest.predict(x_test),average='micro'))
    
    print('confusion_matrix:\n',confusion_matrix(y_test,best_random_forest.predict(x_test)))
    tn, fp, fn, tp = confusion_matrix(y_test,best_random_forest.predict(x_test)).ravel()
    print('TN={},FP={},FN={},TP={}'.format(tn,fp,fn,tp))


# Runner

In [9]:
for pop in [2,7,12,17,22,27,32,37,42,47,52,57]:
    #text = load_text_data("/home/salim/Coding/Masters Project/Dataset/stocktwits_by_company_name/AAL_tweet_score.csv",'text')
    text = load_text_data("/home/salim/Coding/Masters Project/Dataset/news_by_company/AAL .csv",'messages')
    price = load_price_data("/home/salim/Coding/Masters Project/Dataset/price_data_by_company/AAL.csv") #Note that it's 0/1 now, not -1/0/1
    frame = merge_frame(price,text,'time')
    frame = frame.sort_values(by=['time'])
    frame = frame.set_index('time')
    frame = get_sentiment(frame)
    frame = frame.tail(458)
    l = []
    for i in frame['text']:
        if not isinstance(i,float):
            l.append(i[0])
        else:
            l.append('')
    frame['text'] = l
    frame['rel_pol'] = window_sentiment(frame,pop)
    frame['rel_pol'] = frame['rel_pol'].fillna(0.0)
    frame = frame['2019-02-01':'2020-10-01']
    print('PoP :',pop)
    print('Avg news count within the pop:',avg_news(frame, pop))
    #frame = frame.head(441)
    #frame['rel_pol'] = frame['rel_pol']
    frame['rel_pol'] = frame['rel_pol']  / frame['rel_pol'].abs().max()*4
    #frame['close_days'] = frame['close'].shift(-1)
    frame = change_sentiment(frame,-1)
    frame = frame.dropna()
    #print(frame['text'].tolist())
    
    #Adding the weighted event scores to the sentiment feature
    # Uncomment this to use event-extraction
    '''
    cleaned_articles = cleaning(frame['text'])
    event_vector = create_event_vector(cleaned_articles)
    predicted_events = get_predicted_event(event_vector)
    weighted_scores = get_weighted_scores(predicted_events)
    frame['rel_pol'] = frame['rel_pol'] * weighted_scores
    '''
    frame = frame[['close','rel_pol','text','change']]
    #print(frame)
    
    #classification_model(frame)
    optimized_classifier(frame)
    
    
    #print(frame['rel_pol'].tolist())
    #correlation_raph(frame)
    #print('RMSPE:',get_prediction(frame,False))
    print('-'*25)

PoP : 2
Avg news count within the pop: 110.86935866983373
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  20 | elapsed:    0.3s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.6s finished


Classification report:
                precision    recall  f1-score   support

           0       0.54      0.84      0.66        69
           1       0.39      0.12      0.19        57

    accuracy                           0.52       126
   macro avg       0.46      0.48      0.42       126
weighted avg       0.47      0.52      0.44       126

f1-micro : 0.5158730158730159
precision : 0.5158730158730159
recall : 0.5158730158730159
confusion_matrix:
 [[58 11]
 [50  7]]
TN=58,FP=11,FN=50,TP=7
-------------------------
PoP : 7
Avg news count within the pop: 388.042755344418
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  20 | elapsed:    0.3s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.6s finished


Classification report:
                precision    recall  f1-score   support

           0       0.57      0.74      0.65        69
           1       0.51      0.33      0.40        57

    accuracy                           0.56       126
   macro avg       0.54      0.54      0.52       126
weighted avg       0.55      0.56      0.54       126

f1-micro : 0.5555555555555556
precision : 0.5555555555555556
recall : 0.5555555555555556
confusion_matrix:
 [[51 18]
 [38 19]]
TN=51,FP=18,FN=38,TP=19
-------------------------
PoP : 12
Avg news count within the pop: 665.2161520190024
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  20 | elapsed:    0.3s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.6s finished


Classification report:
                precision    recall  f1-score   support

           0       0.52      0.64      0.58        69
           1       0.40      0.30      0.34        57

    accuracy                           0.48       126
   macro avg       0.46      0.47      0.46       126
weighted avg       0.47      0.48      0.47       126

f1-micro : 0.48412698412698413
precision : 0.48412698412698413
recall : 0.48412698412698413
confusion_matrix:
 [[44 25]
 [40 17]]
TN=44,FP=25,FN=40,TP=17
-------------------------
PoP : 17
Avg news count within the pop: 942.3895486935867
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  20 | elapsed:    0.3s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.6s finished


Classification report:
                precision    recall  f1-score   support

           0       0.49      0.57      0.52        69
           1       0.35      0.28      0.31        57

    accuracy                           0.44       126
   macro avg       0.42      0.42      0.42       126
weighted avg       0.42      0.44      0.43       126

f1-micro : 0.43650793650793657
precision : 0.4365079365079365
recall : 0.4365079365079365
confusion_matrix:
 [[39 30]
 [41 16]]
TN=39,FP=30,FN=41,TP=16
-------------------------
PoP : 22
Avg news count within the pop: 1219.562945368171
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  20 | elapsed:    0.3s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.6s finished


Classification report:
                precision    recall  f1-score   support

           0       0.53      0.74      0.61        69
           1       0.38      0.19      0.26        57

    accuracy                           0.49       126
   macro avg       0.45      0.47      0.44       126
weighted avg       0.46      0.49      0.45       126

f1-micro : 0.49206349206349204
precision : 0.49206349206349204
recall : 0.49206349206349204
confusion_matrix:
 [[51 18]
 [46 11]]
TN=51,FP=18,FN=46,TP=11
-------------------------
PoP : 27
Avg news count within the pop: 1496.7363420427553
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  20 | elapsed:    0.3s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.6s finished


Classification report:
                precision    recall  f1-score   support

           0       0.54      0.71      0.61        68
           1       0.44      0.28      0.34        57

    accuracy                           0.51       125
   macro avg       0.49      0.49      0.48       125
weighted avg       0.50      0.51      0.49       125

f1-micro : 0.512
precision : 0.512
recall : 0.512
confusion_matrix:
 [[48 20]
 [41 16]]
TN=48,FP=20,FN=41,TP=16
-------------------------
PoP : 32
Avg news count within the pop: 1773.9097387173397
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  20 | elapsed:    0.3s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.6s finished


Classification report:
                precision    recall  f1-score   support

           0       0.57      0.84      0.68        68
           1       0.52      0.22      0.31        55

    accuracy                           0.56       123
   macro avg       0.55      0.53      0.49       123
weighted avg       0.55      0.56      0.51       123

f1-micro : 0.5609756097560976
precision : 0.5609756097560976
recall : 0.5609756097560976
confusion_matrix:
 [[57 11]
 [43 12]]
TN=57,FP=11,FN=43,TP=12
-------------------------
PoP : 37
Avg news count within the pop: 2051.083135391924
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  20 | elapsed:    0.3s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.6s finished


Classification report:
                precision    recall  f1-score   support

           0       0.54      0.78      0.63        68
           1       0.35      0.15      0.21        54

    accuracy                           0.50       122
   macro avg       0.44      0.46      0.42       122
weighted avg       0.45      0.50      0.45       122

f1-micro : 0.5
precision : 0.5
recall : 0.5
confusion_matrix:
 [[53 15]
 [46  8]]
TN=53,FP=15,FN=46,TP=8
-------------------------
PoP : 42
Avg news count within the pop: 2328.2565320665085
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  20 | elapsed:    0.3s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.6s finished


Classification report:
                precision    recall  f1-score   support

           0       0.55      0.78      0.65        67
           1       0.42      0.21      0.28        53

    accuracy                           0.53       120
   macro avg       0.49      0.49      0.46       120
weighted avg       0.50      0.53      0.48       120

f1-micro : 0.525
precision : 0.525
recall : 0.525
confusion_matrix:
 [[52 15]
 [42 11]]
TN=52,FP=15,FN=42,TP=11
-------------------------
PoP : 47
Avg news count within the pop: 2605.4299287410927
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  20 | elapsed:    0.3s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.6s finished


Classification report:
                precision    recall  f1-score   support

           0       0.57      0.61      0.59        67
           1       0.45      0.40      0.42        52

    accuracy                           0.52       119
   macro avg       0.51      0.51      0.51       119
weighted avg       0.52      0.52      0.52       119

f1-micro : 0.5210084033613446
precision : 0.5210084033613446
recall : 0.5210084033613446
confusion_matrix:
 [[41 26]
 [31 21]]
TN=41,FP=26,FN=31,TP=21
-------------------------
PoP : 52
Avg news count within the pop: 2882.603325415677
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  20 | elapsed:    0.3s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.6s finished


Classification report:
                precision    recall  f1-score   support

           0       0.60      0.80      0.69        66
           1       0.55      0.31      0.40        51

    accuracy                           0.59       117
   macro avg       0.58      0.56      0.54       117
weighted avg       0.58      0.59      0.56       117

f1-micro : 0.5897435897435898
precision : 0.5897435897435898
recall : 0.5897435897435898
confusion_matrix:
 [[53 13]
 [35 16]]
TN=53,FP=13,FN=35,TP=16
-------------------------
PoP : 57
Avg news count within the pop: 3159.776722090261
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  20 | elapsed:    0.3s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.6s finished


Classification report:
                precision    recall  f1-score   support

           0       0.54      0.65      0.59        66
           1       0.36      0.26      0.30        50

    accuracy                           0.48       116
   macro avg       0.45      0.46      0.45       116
weighted avg       0.46      0.48      0.47       116

f1-micro : 0.4827586206896552
precision : 0.4827586206896552
recall : 0.4827586206896552
confusion_matrix:
 [[43 23]
 [37 13]]
TN=43,FP=23,FN=37,TP=13
-------------------------


In [22]:
frame

Unnamed: 0_level_0,close,rel_pol,text,change
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-03-12,184.18,0.794556,,1
2019-03-13,187.35,7.704619,top health care stockstop health care stocks j...,1
2019-03-14,187.76,1.167740,,1
2019-03-15,191.24,1.167740,,1
2019-03-18,191.31,9.445689,top health care stockstop health care stocks j...,1
...,...,...,...,...
2020-09-24,240.32,-0.320250,,1
2020-09-25,243.82,-0.320250,,1
2020-09-28,247.03,-0.156313,,1
2020-09-29,248.30,-0.156313,,1
