# Count time-based sentiment polarity distribution

# Load Data

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('preprocessed_news_218posts.csv', sep='|')

In [4]:
df.head(1)

Unnamed: 0,item_id,date,title,content,sentiment,summary,top_key_freq,tokens,tokens_rm_stopwords,entities,token_pos,link,photo_link,category
0,aipl_20220929_1,2022-09-29,台灣國際漁業展、智慧農業週開幕 逾15國參展,外交部政次田中光今天出席2022年「台灣國際漁業展」與「台灣智慧農業週」聯合開幕儀式，他表示...,0.68,"['外交部政次田中光今天出席2022年「台灣國際漁業展」與「台灣智慧農業週」聯合開幕儀式',...","[('展覽', 8), ('台灣', 6), ('廠商', 6), ('參展', 6), (...","['外交部', '政次', '田中光', '今天', '出席', '2022年', '「',...","['外交部', '政次', '田中光', '出席', '台灣', '國際', '漁業展', ...","{(657, 658, 'GPE', '台'), (143, 146, 'PERSON', ...","[('外交部', 'Nc'), ('政次', 'Na'), ('田中光', 'Nb'), (...",https://www.cna.com.tw/news/aipl/202209290375....,https://imgcdn.cna.com.tw/www/WebPhotos/200/20...,政治


# Filter dataframe

## All-in-one functions: filter_dataFrame and filter_dataFrame_fullText


In [5]:
from datetime import datetime, timedelta

def filter_dataFrame(user_keywords, cond, cate, weeks):
    """Select recent news whose ``tokens_rm_stopwords`` text matches the keywords.

    Reads the module-level DataFrame ``df``. The time window ends at the
    latest value of ``df.date`` and starts ``weeks`` weeks before it; dates
    are compared as 'YYYY-MM-DD' strings.

    Args:
        user_keywords: list of keyword strings. Each keyword is matched as a
            literal substring of the raw ``tokens_rm_stopwords`` text.
        cond: 'and' (every keyword must occur) or 'or' (any keyword occurs).
        cate: category to keep; "全部" disables the category filter.
        weeks: length of the time window, in weeks.

    Returns:
        The rows of ``df`` that satisfy all conditions.

    Raises:
        ValueError: if ``cond`` is neither 'and' nor 'or'.
    """
    import re  # local import: only needed to escape keywords for the 'or' regex

    # Fail loudly instead of hitting an unbound local at the return statement.
    if cond not in ('and', 'or'):
        raise ValueError("cond must be 'and' or 'or', got {!r}".format(cond))

    # end date: the date of the latest record of news
    end_date = df.date.max()

    # start date
    start_date = (datetime.strptime(end_date, '%Y-%m-%d').date() - timedelta(weeks=weeks)).strftime('%Y-%m-%d')

    # Build the row mask step by step instead of repeating it in four branches.
    mask = (df.date >= start_date) & (df.date <= end_date)
    if cate != "全部":
        mask = mask & (df.category == cate)

    text = df.tokens_rm_stopwords
    if cond == 'and':
        # Non-string cells (e.g. NaN) never match instead of raising TypeError.
        keyword_hit = text.apply(
            lambda row: isinstance(row, str) and all((qk in row) for qk in user_keywords))
    else:
        # Escape each keyword so regex metacharacters ('+', '(', '?', ...) are
        # matched literally; na=False keeps the mask strictly boolean.
        queryKey = '|'.join(re.escape(qk) for qk in user_keywords)
        keyword_hit = text.str.contains(queryKey, na=False)

    return df[mask & keyword_hit]

In [6]:
def filter_dataFrame_fullText(user_keywords, cond, cate,weeks):
    """Select recent news whose full ``content`` text matches the keywords.

    Reads the module-level DataFrame ``df``. The time window ends at the
    latest value of ``df.date`` and starts ``weeks`` weeks before it; dates
    are compared as 'YYYY-MM-DD' strings.

    Args:
        user_keywords: list of keyword strings. Each keyword is matched as a
            literal substring of the article ``content``.
        cond: 'and' (every keyword must occur) or 'or' (any keyword occurs).
        cate: category to keep; "全部" disables the category filter.
        weeks: length of the time window, in weeks.

    Returns:
        The rows of ``df`` that satisfy all conditions.

    Raises:
        ValueError: if ``cond`` is neither 'and' nor 'or'.
    """
    import re  # local import: only needed to escape keywords for the 'or' regex

    # Fail loudly instead of hitting an unbound local at the return statement.
    if cond not in ('and', 'or'):
        raise ValueError("cond must be 'and' or 'or', got {!r}".format(cond))

    # end date: the date of the latest record of news
    end_date = df.date.max()

    # start date
    start_date = (datetime.strptime(end_date, '%Y-%m-%d').date() - timedelta(weeks=weeks)).strftime('%Y-%m-%d')

    # Build the row mask step by step instead of repeating it in four branches.
    mask = (df.date >= start_date) & (df.date <= end_date)
    if cate != "全部":
        mask = mask & (df.category == cate)

    text = df.content
    if cond == 'and':
        # Non-string cells (e.g. NaN) never match instead of raising TypeError.
        keyword_hit = text.apply(
            lambda row: isinstance(row, str) and all((qk in row) for qk in user_keywords))
    else:
        # Escape each keyword so regex metacharacters ('+', '(', '?', ...) are
        # matched literally; na=False keeps the mask strictly boolean.
        queryKey = '|'.join(re.escape(qk) for qk in user_keywords)
        keyword_hit = text.str.contains(queryKey, na=False)

    return df[mask & keyword_hit]

In [9]:
user_keywords = ['智慧']
cond = 'and'
cate = '全部'
weeks = 4
df_query = filter_dataFrame(user_keywords, cond, cate, weeks)

In [10]:
len(df_query)

11

# Count the number of articles with Positive, Negative, and Neutral polarity

## All-in-one function: get_article_sentiment

In [11]:
def get_article_sentiment(df_query):
    """Count articles by sentiment polarity and express the counts as percentages.

    An article is 'Positive' when its sentiment score is >= 0.75, 'Negative'
    when it is <= 0.4, and 'Neutral' otherwise.

    Args:
        df_query: DataFrame with a ``sentiment`` column whose values can be
            converted with ``float()``.

    Returns:
        A tuple ``(sentiCount, sentiPercnt)`` of dicts keyed by 'Positive',
        'Negative' and 'Neutral'. Percentages are truncated to integers, so
        they may sum to less than 100. For an empty ``df_query`` every count
        and percentage is 0.
    """
    sentiCount = {'Positive': 0, 'Negative': 0, 'Neutral': 0}
    for senti in df_query.sentiment:
        # determine sentiment polarity
        score = float(senti)
        if score >= 0.75:
            sentiCount['Positive'] += 1
        elif score <= 0.4:
            sentiCount['Negative'] += 1
        else:
            sentiCount['Neutral'] += 1

    numberOfArticle = len(df_query)
    if numberOfArticle == 0:
        # No matching article: avoid dividing by zero and report 0% everywhere.
        return sentiCount, {polar: 0 for polar in sentiCount}

    # calculate percentage values (truncated towards zero)
    sentiPercnt = {polar: int(count / numberOfArticle * 100)
                   for polar, count in sentiCount.items()}
    return sentiCount, sentiPercnt


In [12]:
get_article_sentiment(df_query)

({'Positive': 4, 'Negative': 6, 'Neutral': 1},
 {'Positive': 36, 'Negative': 54, 'Neutral': 9})

# Get frequency of news with negative and positive sentiment for line chart

Count the sentiment polarity distribution

In [13]:
# This is what we used to calculate the daily frequency of a keyword in our previous app
pd.DataFrame({'date_index':pd.to_datetime( df_query.date ),'freq':[1 for _ in range(len(df_query))]})

Unnamed: 0,date_index,freq
0,2022-09-29,1
3,2022-09-29,1
19,2022-09-29,1
24,2022-09-28,1
39,2022-09-22,1
44,2022-09-29,1
84,2022-09-29,1
99,2022-09-29,1
136,2022-09-29,1
168,2022-09-29,1


In [14]:
# Now we modify the line above so that an article counts as 1 only when its sentiment is positive (>= 0.7 in this walkthrough).
pd.DataFrame({'date_index':pd.to_datetime( df_query.date ),'pos':[ (lambda x: 1 if x >= 0.7 else 0)(s) for s in df_query.sentiment]})

Unnamed: 0,date_index,pos
0,2022-09-29,0
3,2022-09-29,0
19,2022-09-29,0
24,2022-09-28,1
39,2022-09-22,1
44,2022-09-29,1
84,2022-09-29,1
99,2022-09-29,0
136,2022-09-29,0
168,2022-09-29,0


In [12]:
# What is the following lambda function? It is a little bit hard to understand, so let's take it apart step by step.

In [15]:
[ (lambda x: 1 if x >= 0.7 else 0)(s) for s in df_query.sentiment]

[0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0]

In [16]:
lambda x: 1 if x >= 0.7 else 0

<function __main__.<lambda>(x)>

In [17]:
(lambda x: 1 if x >= 0.7 else 0)(0.9)

1

In [18]:
(lambda x: 1 if x >= 0.7 else 0)(0.2)

0

In [19]:
pd.DataFrame({'date_index':pd.to_datetime( df_query.date ),'pos':[ (lambda x: 1 if x >= 0.7 else 0)(s) for s in df_query.sentiment]})

Unnamed: 0,date_index,pos
0,2022-09-29,0
3,2022-09-29,0
19,2022-09-29,0
24,2022-09-28,1
39,2022-09-22,1
44,2022-09-29,1
84,2022-09-29,1
99,2022-09-29,0
136,2022-09-29,0
168,2022-09-29,0


In [20]:
{'date_index':pd.to_datetime( df_query.date ),'pos':[ (lambda x: 1 if x >= 0.7 else 0)(s) for s in df_query.sentiment]}

{'date_index': 0     2022-09-29
 3     2022-09-29
 19    2022-09-29
 24    2022-09-28
 39    2022-09-22
 44    2022-09-29
 84    2022-09-29
 99    2022-09-29
 136   2022-09-29
 168   2022-09-29
 184   2022-09-29
 Name: date, dtype: datetime64[ns],
 'pos': [0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0]}

In [21]:
query_freq = pd.DataFrame({'date_index':pd.to_datetime( df_query.date ),'pos':[ (lambda x: 1 if x >= 0.7 else 0)(s) for s in df_query.sentiment]})

In [22]:
data = query_freq.groupby(pd.Grouper(key='date_index',freq='D')).sum()

In [23]:
data

Unnamed: 0_level_0,pos
date_index,Unnamed: 1_level_1
2022-09-22,1
2022-09-23,0
2022-09-24,0
2022-09-25,0
2022-09-26,0
2022-09-27,0
2022-09-28,1
2022-09-29,2


In [24]:
# Convert the daily totals into the [{'x': date, 'y': count}, ...] records shown below.
time_data = [
    {'x': day.strftime('%Y-%m-%d'), 'y': int(count)}
    for day, count in zip(data.index, data['pos'])
]

In [25]:
time_data

[{'x': '2022-09-22', 'y': 1},
 {'x': '2022-09-23', 'y': 0},
 {'x': '2022-09-24', 'y': 0},
 {'x': '2022-09-25', 'y': 0},
 {'x': '2022-09-26', 'y': 0},
 {'x': '2022-09-27', 'y': 0},
 {'x': '2022-09-28', 'y': 1},
 {'x': '2022-09-29', 'y': 2}]

## All-in-one function: Get frequency of news with negative and positive sentiment 

In [26]:
def get_key_time_based_sentiment(df_query, freq_type='D'):
    """Count positive and negative articles per time bucket.

    An article counts as positive when its sentiment is >= 0.6 and as
    negative when it is <= 0.4. Articles are grouped on their ``date``
    column with ``pd.Grouper(freq=freq_type)``.

    Args:
        df_query: DataFrame with ``date`` and ``sentiment`` columns.
        freq_type: pandas frequency alias used for grouping (default 'D').

    Returns:
        A tuple ``(data_pos, data_neg)``; each is a list of
        ``{'x': 'YYYY-MM-DD', 'y': count}`` dicts, one per time bucket.
    """
    timestamps = pd.to_datetime(df_query.date)

    def _bucket_counts(column, is_hit):
        # Flag each article with 1/0, sum the flags per bucket, emit chart points.
        flags = [1 if is_hit(score) else 0 for score in df_query.sentiment]
        flagged = pd.DataFrame({'date_index': timestamps, column: flags})
        totals = flagged.groupby(pd.Grouper(key='date_index', freq=freq_type)).sum()
        return [{'x': stamp.strftime('%Y-%m-%d'), 'y': int(total)}
                for stamp, total in zip(totals.index, totals[column])]

    data_pos = _bucket_counts('pos', lambda score: score >= 0.6)
    data_neg = _bucket_counts('neg', lambda score: score <= 0.4)
    return data_pos, data_neg

In [27]:
user_keywords = ['台灣']
cond = 'and'
cate = '全部'
weeks = 4
df_query = filter_dataFrame(user_keywords, cond, cate, weeks)

In [28]:
result = get_key_time_based_sentiment(df_query)

In [29]:
result

([{'x': '2022-09-22', 'y': 0},
  {'x': '2022-09-23', 'y': 0},
  {'x': '2022-09-24', 'y': 0},
  {'x': '2022-09-25', 'y': 0},
  {'x': '2022-09-26', 'y': 0},
  {'x': '2022-09-27', 'y': 1},
  {'x': '2022-09-28', 'y': 4},
  {'x': '2022-09-29', 'y': 15}],
 [{'x': '2022-09-22', 'y': 2},
  {'x': '2022-09-23', 'y': 0},
  {'x': '2022-09-24', 'y': 0},
  {'x': '2022-09-25', 'y': 0},
  {'x': '2022-09-26', 'y': 0},
  {'x': '2022-09-27', 'y': 1},
  {'x': '2022-09-28', 'y': 7},
  {'x': '2022-09-29', 'y': 48}])

In [30]:
data_pos, data_neg = get_key_time_based_sentiment(df_query)

In [31]:
data_pos

[{'x': '2022-09-22', 'y': 0},
 {'x': '2022-09-23', 'y': 0},
 {'x': '2022-09-24', 'y': 0},
 {'x': '2022-09-25', 'y': 0},
 {'x': '2022-09-26', 'y': 0},
 {'x': '2022-09-27', 'y': 1},
 {'x': '2022-09-28', 'y': 4},
 {'x': '2022-09-29', 'y': 15}]

In [31]:
data_neg

[{'x': '2020-03-05', 'y': 1},
 {'x': '2020-03-06', 'y': 0},
 {'x': '2020-03-07', 'y': 0},
 {'x': '2020-03-08', 'y': 0},
 {'x': '2020-03-09', 'y': 0},
 {'x': '2020-03-10', 'y': 3},
 {'x': '2020-03-11', 'y': 4},
 {'x': '2020-03-12', 'y': 5},
 {'x': '2020-03-13', 'y': 50},
 {'x': '2020-03-14', 'y': 17}]

# Django views.py

(1)app name: app_userkey_sentiment


(2)namespace defined in urls.py: 
app_name="namespace_userkey_sentiment"  
or app_name="app_userkey_sentiment"

(3) home.html


In [None]:
from django.http import JsonResponse
from django.shortcuts import render
import pandas as pd
from datetime import datetime, timedelta


# Load news data
# import from app_user_keyword.views and use df later
import app_user_keyword.views as userkeyword_views
def load_df_data():
    """Bind this module's global ``df`` to the ``df`` attribute of app_user_keyword.views."""
    # Reuse the DataFrame held by app_user_keyword.views; the filter function
    # in this module reads it through the global name ``df``.
    global df # global variable
    df = userkeyword_views.df

# Runs once when this module is imported, so ``df`` exists before any view is called.
load_df_data()


def home(request):
    """Render the home page template of the app_userkey_sentiment app."""
    template_name = 'app_userkey_sentiment/home.html'
    return render(request, template_name)

def api_get_userkey_sentiment(request):
    """Return sentiment statistics for news matching the user's keywords as JSON.

    Query parameters (read from ``request.GET``):
        userkey: whitespace-separated keywords.
        cate: news category passed to the filter ("全部" means all categories).
        cond: 'and' or 'or', how multiple keywords are combined.
        weeks: integer number of weeks to look back.

    Returns:
        A JsonResponse with the keys 'sentiCount', 'data_pos' and 'data_neg'.
        When a parameter is missing, ``weeks`` is not an integer, or ``cond``
        is unsupported, a JsonResponse with an 'error' message and HTTP
        status 400 is returned instead of letting the exception surface as a
        server error.
    """
    # request.GET[...] raises a KeyError subclass for a missing parameter and
    # int() raises ValueError for a non-numeric 'weeks'.
    try:
        userkey = request.GET['userkey']
        cate = request.GET['cate']
        cond = request.GET['cond']
        weeks = int(request.GET['weeks'])
    except (KeyError, ValueError):
        return JsonResponse(
            {'error': "Parameters 'userkey', 'cate', 'cond' and an integer 'weeks' are required."},
            status=400)

    # The filter only implements these two modes.
    if cond not in ('and', 'or'):
        return JsonResponse({'error': "Parameter 'cond' must be 'and' or 'or'."}, status=400)

    key = userkey.split()

    # Proceed filtering
    df_query = filter_dataFrame_fullText(key, cond, cate, weeks)
    print(key)
    print(len(df_query))

    # No matching article: answer with empty statistics rather than running
    # the percentage computation on zero articles.
    if len(df_query) == 0:
        return JsonResponse({
            'sentiCount': {'Positive': 0, 'Negative': 0, 'Neutral': 0},
            'data_pos': [],
            'data_neg': [],
        })

    # sentiPercnt is computed by the helper but is not part of the response.
    sentiCount, sentiPercnt = get_article_sentiment(df_query)

    # Daily buckets for windows up to four weeks, weekly buckets beyond that.
    if weeks <= 4:
        freq_type = 'D'
    else:
        freq_type = 'W'

    data_pos, data_neg = get_key_time_based_sentiment( df_query, freq_type )

    response = {
        'sentiCount': sentiCount,
        'data_pos':data_pos,
        'data_neg':data_neg,
    }
    return JsonResponse(response)

def get_article_sentiment(df_query):
    """Count articles by sentiment polarity and express the counts as percentages.

    An article is 'Positive' when its sentiment score is >= 0.75, 'Negative'
    when it is <= 0.4, and 'Neutral' otherwise.

    Args:
        df_query: DataFrame with a ``sentiment`` column whose values can be
            converted with ``float()``.

    Returns:
        A tuple ``(sentiCount, sentiPercnt)`` of dicts keyed by 'Positive',
        'Negative' and 'Neutral'. Percentages are truncated to integers, so
        they may sum to less than 100. For an empty ``df_query`` every count
        and percentage is 0.
    """
    sentiCount = {'Positive': 0, 'Negative': 0, 'Neutral': 0}
    for senti in df_query.sentiment:
        # determine sentiment polarity
        score = float(senti)
        if score >= 0.75:
            sentiCount['Positive'] += 1
        elif score <= 0.4:
            sentiCount['Negative'] += 1
        else:
            sentiCount['Neutral'] += 1

    numberOfArticle = len(df_query)
    if numberOfArticle == 0:
        # No matching article: avoid dividing by zero and report 0% everywhere.
        return sentiCount, {polar: 0 for polar in sentiCount}

    # calculate percentage values (truncated towards zero)
    sentiPercnt = {polar: int(count / numberOfArticle * 100)
                   for polar, count in sentiCount.items()}
    return sentiCount, sentiPercnt


def get_key_time_based_sentiment(df_query, freq_type='D'):
    """Count positive and negative articles per time bucket.

    An article counts as positive when its sentiment is >= 0.6 and as
    negative when it is <= 0.4. Articles are grouped on their ``date``
    column with ``pd.Grouper(freq=freq_type)``.

    Args:
        df_query: DataFrame with ``date`` and ``sentiment`` columns.
        freq_type: pandas frequency alias used for grouping (default 'D').

    Returns:
        A tuple ``(data_pos, data_neg)``; each is a list of
        ``{'x': 'YYYY-MM-DD', 'y': count}`` dicts, one per time bucket.
    """
    timestamps = pd.to_datetime(df_query.date)

    def _bucket_counts(column, is_hit):
        # Flag each article with 1/0, sum the flags per bucket, emit chart points.
        flags = [1 if is_hit(score) else 0 for score in df_query.sentiment]
        flagged = pd.DataFrame({'date_index': timestamps, column: flags})
        totals = flagged.groupby(pd.Grouper(key='date_index', freq=freq_type)).sum()
        return [{'x': stamp.strftime('%Y-%m-%d'), 'y': int(total)}
                for stamp, total in zip(totals.index, totals[column])]

    # positive: sentiment >= 0.6
    data_pos = _bucket_counts('pos', lambda score: score >= 0.6)
    # negative: sentiment <= 0.4
    data_neg = _bucket_counts('neg', lambda score: score <= 0.4)
    return data_pos, data_neg

def filter_dataFrame_fullText(user_keywords, cond, cate,weeks):
    """Select recent news whose full ``content`` text matches the keywords.

    Reads the module-level DataFrame ``df``. The time window ends at the
    latest value of ``df.date`` and starts ``weeks`` weeks before it; dates
    are compared as 'YYYY-MM-DD' strings.

    Args:
        user_keywords: list of keyword strings. Each keyword is matched as a
            literal substring of the article ``content``.
        cond: 'and' (every keyword must occur) or 'or' (any keyword occurs).
        cate: category to keep; "全部" disables the category filter.
        weeks: length of the time window, in weeks.

    Returns:
        The rows of ``df`` that satisfy all conditions.

    Raises:
        ValueError: if ``cond`` is neither 'and' nor 'or'.
    """
    import re  # local import: only needed to escape keywords for the 'or' regex

    # Fail loudly instead of hitting an unbound local at the return statement.
    if cond not in ('and', 'or'):
        raise ValueError("cond must be 'and' or 'or', got {!r}".format(cond))

    # end date: the date of the latest record of news
    end_date = df.date.max()

    # start date
    start_date = (datetime.strptime(end_date, '%Y-%m-%d').date() - timedelta(weeks=weeks)).strftime('%Y-%m-%d')

    # Build the row mask step by step instead of repeating it in four branches.
    mask = (df.date >= start_date) & (df.date <= end_date)
    if cate != "全部":
        mask = mask & (df.category == cate)

    text = df.content
    if cond == 'and':
        # Non-string cells (e.g. NaN) never match instead of raising TypeError.
        keyword_hit = text.apply(
            lambda row: isinstance(row, str) and all((qk in row) for qk in user_keywords))
    else:
        # Escape each keyword so regex metacharacters ('+', '(', '?', ...) are
        # matched literally; na=False keeps the mask strictly boolean.
        queryKey = '|'.join(re.escape(qk) for qk in user_keywords)
        keyword_hit = text.str.contains(queryKey, na=False)

    return df[mask & keyword_hit]

# Printed once when this module is imported, as a marker that loading reached the end of the file.
print("app_userkey_sentiment was loaded!")
