In [1]:
import numpy as np
import pandas as pd
import tushare as ts

from tqdm import tqdm
import warnings
import time

import ast
import jieba
import datetime
from gensim.models import word2vec
from gensim.models import FastText
from glove import Glove, Corpus

pro = ts.pro_api('52119551ebc32c734a42fd61ccf33344fde3c6195b0aacec58d15140')
STARTDATE = '20190101'
ENDDATE = '20190622'

# Import previous data

In [2]:
start = datetime.datetime(2014, 4, 1, 0, 0, 0)
end = datetime.datetime(2019, 6, 22, 0, 0, 0)

delta = end - start

def get_new(news):
    news_text = []
    for i in tqdm(range(0, delta.days + 1, 2)):
        startdate = (start + datetime.timedelta(days=i)).date()
        enddate = (start + datetime.timedelta(days=i+1)).date()
        df = pro.news(src=news, start_date=str(startdate), end_date=str(enddate))
        news_text.append(df)
        time.sleep(0.5)
    return news_text

In [3]:
df_10jqka_list = get_new('10jqka')

100%|██████████| 955/955 [16:03<00:00,  1.13s/it]


In [5]:
df_eastmoney_list = get_new('eastmoney')

100%|██████████| 955/955 [17:27<00:00,  1.46s/it]


In [6]:
df_sina_list = get_new('sina')

100%|██████████| 955/955 [18:19<00:00,  1.48s/it]


In [18]:
df_sina = pd.concat(df_sina_list)
df_10jqka = pd.concat(df_10jqka_list)
df_eastmoney = pd.concat(df_eastmoney_list)

In [19]:
df_sina.to_csv(r'./sina_news.csv')
df_10jqka.to_csv(r'./10jqka_news.csv')
df_eastmoney.to_csv(r'./eastmoney_news.csv')

In [20]:
# text_df = [df_sina, df_10jqka, df_eastmoney, df_yuncaijing]
text_df = [df_sina, df_10jqka, df_eastmoney]

In [21]:
for i, df in enumerate(text_df):
    df['datetime'] = df['datetime'].str.split(' ').apply(lambda x: x[0])
    df[str(i)+'_content'] = df['content']
    df[str(i)+'_title'] = df['title']
    df.drop(columns=["content", "title"], inplace=True)

In [24]:
df_sina.columns.values, df_10jqka.columns.values, df_eastmoney.columns.values,

(array(['datetime', '0_content', '0_title'], dtype=object),
 array(['datetime', '1_content', '1_title'], dtype=object),
 array(['datetime', '2_content', '2_title'], dtype=object))

In [40]:
merge_date_sina = df_sina[['datetime', '0_content']]
merge_date_sina = merge_date_sina.groupby(
    ['datetime'])['0_content'].apply(';'.join).reset_index()

merge_date_sina_title = df_sina[['datetime', '0_title']]
merge_date_sina_title = merge_date_sina_title.groupby(
    ['datetime'])['0_title'].apply(';'.join).reset_index()
merge_date_sina = merge_date_sina.merge(merge_date_sina_title, on="datetime", how="left")
merge_date_sina['0_text'] = merge_date_sina[['0_content', '0_title']].apply(lambda x: ';'.join(x), axis=1)
merge_date_sina.drop(columns=["0_content", "0_title"], inplace=True)


merge_date_10jqka = df_10jqka[['datetime', '1_content']]
merge_date_10jqka['1_content'] = merge_date_10jqka['1_content'].astype(str)
merge_date_10jqka = merge_date_10jqka.groupby(
    ['datetime'])['1_content'].apply(';'.join).reset_index()

merge_date_10jqka_title = df_10jqka[['datetime', '1_title']]
merge_date_10jqka_title = merge_date_10jqka_title.groupby(
    ['datetime'])['1_title'].apply(';'.join).reset_index()
merge_date_10jqka = merge_date_10jqka.merge(merge_date_10jqka_title, on="datetime", how="left")
merge_date_10jqka['1_text'] = merge_date_10jqka[['1_content', '1_title']].apply(lambda x: ';'.join(x), axis=1)
merge_date_10jqka.drop(columns=["1_content", "1_title"], inplace=True)


merge_date_eastmoney = df_eastmoney[['datetime', '2_content']]
merge_date_eastmoney = merge_date_eastmoney.groupby(
    ['datetime'])['2_content'].apply(';'.join).reset_index()

merge_date_eastmoney_title = df_eastmoney[['datetime', '2_title']]
merge_date_eastmoney_title = merge_date_eastmoney_title.groupby(
    ['datetime'])['2_title'].apply(';'.join).reset_index()
merge_date_eastmoney = merge_date_eastmoney.merge(merge_date_eastmoney_title, on="datetime", how="left")
merge_date_eastmoney['2_text'] = merge_date_eastmoney[['2_content', '2_title']].apply(lambda x: ';'.join(x), axis=1)
merge_date_eastmoney.drop(columns=["2_content", "2_title"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [42]:
merge_date_sina = merge_date_sina.merge(merge_date_10jqka, on="datetime", how="left")
merge_date_sina = merge_date_sina.merge(merge_date_eastmoney, on="datetime", how="left")
merge_date_sina.head()

Unnamed: 0,datetime,0_text,1_text,2_text
0,2018-10-09,在岸人民币兑美元（CNY）北京时间23:30收报6.9225元，较周一夜盘收盘涨60点；全天...,,
1,2018-10-11,现货白银扩大涨幅至2%，现报14.56美元/盎司。;纳指再度转涨。;哥伦比亚广播下跌2....,,
2,2018-10-13,意大利副总理Salvini：预算法律可能会在周末之前准备就绪。;【粤港澳大湾区现代化交通运输...,,
3,2018-10-15,摩根大通策略师：波动率上升释放买入标普500指数的信号。;美国纽约联储GDPNowcast模...,,
4,2018-10-17,道指跌幅收窄至不到60点。;特斯拉跌3.2%。;美国财政部雇员因为泄露敏感信息而被捕。;立陶...,,


In [45]:
merge_date_sina = merge_date_sina.fillna('')
merge_date_sina['text'] = merge_date_sina[['0_text', '1_text', '2_text']].apply(lambda x: ';'.join(x), axis=1)
merge_date_sina.drop(columns=['0_text', '1_text', '2_text'], inplace=True)
merge_date_sina.head()

Unnamed: 0,datetime,text
0,2018-10-09,在岸人民币兑美元（CNY）北京时间23:30收报6.9225元，较周一夜盘收盘涨60点；全天...
1,2018-10-11,现货白银扩大涨幅至2%，现报14.56美元/盎司。;纳指再度转涨。;哥伦比亚广播下跌2....
2,2018-10-13,意大利副总理Salvini：预算法律可能会在周末之前准备就绪。;【粤港澳大湾区现代化交通运输...
3,2018-10-15,摩根大通策略师：波动率上升释放买入标普500指数的信号。;美国纽约联储GDPNowcast模...
4,2018-10-17,道指跌幅收窄至不到60点。;特斯拉跌3.2%。;美国财政部雇员因为泄露敏感信息而被捕。;立陶...


In [46]:
text_df = merge_date_sina.copy()

In [47]:
text_df.shape

(129, 2)

In [13]:
df = pro.news(src='10jqka', start_date='20140401', end_date=ENDDATE)

In [15]:
df['datetime'] = df['datetime'].str.split(' ').apply(lambda x: x[0])

In [18]:
df['datetime'].unique()

array(['2019-06-21', '2019-06-20', '2019-06-19', '2019-06-18'],
      dtype=object)

In [19]:
df['datetime'].shape

(1000,)