# After tagging each article as bullish or bearish ([code](https://nbviewer.jupyter.org/github/popolee0513/Data-Science-Skills-Practice/blob/master/Big%20Data%20And%20Business%20Analytics/Stock%20news%20classification/Stock.ipynb)), we want to find the keywords that characterize bullish and bearish articles respectively, so that we can build features from them and make predictions.

In [71]:
import pandas as pd
# Load the tagged articles; later cells read the title, content and tag columns.
data = pd.read_csv("data_with_tag.csv")

In [2]:
# Discard articles tagged "same" so that only "rise" and "fall" remain.
data = data[data["tag"] != "same"]
print(data.shape)
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
print(data["tag"].value_counts())
# NOTE(review): rows with missing values are removed only after the counts
# above are printed. The totals hard-coded in later cells (4806 / 2263 / 2543)
# equal the printed counts, so they assume dropna() removes nothing - confirm.
data = data.dropna()

(4806, 5)
fall    2543
rise    2263
Name: tag, dtype: int64


In [4]:
# Preview the first rows of the filtered article table.
data.head()

Unnamed: 0,post_time,title,content,weekday,tag
0,2016-01-05,新聞大立光月合併營收億元,發文前請先詳閱新聞分類發文規範未依規範發文將受處份連結過長請善用縮網址連結能不能點擊者板規處...,1,fall
1,2016-01-05,新聞罕見大立光去年月營收年減,原文連結必須檢附原文內容罕見大立光去年月營收年減中央社記者韓婷婷台北日電大立光電公布去年月合...,1,fall
2,2016-01-05,新聞大立光業績免驚外資喊到元,原文連結必須檢附原文內容股王大立光今天公告月營收億元月減成創下年月以來單月最低紀錄顯示蘋果光...,1,fall
3,2016-01-05,公告板開始舉辦樂透,大立光請到板按參與樂透一張幣迷你級樂透結束時間,1,fall
4,2016-01-06,新聞賣超差傳通路庫存爆滿,賣超差傳通路庫存爆滿蘋果砍單三成時間年月日上午聚財網新聞記者陳瑞哲報導大立光前月營收大減三成...,2,fall


In [5]:
def get_ngrams(n, data):
    """Count character n-grams over a collection of documents.

    Args:
        n: Length of each n-gram in characters; must be at least 1.
        data: Iterable of strings (for example a pandas Series), one entry
            per document.

    Returns:
        A DataFrame indexed by n-gram with two columns: ``tf``, the total
        number of occurrences across all documents, and ``df``, the number of
        documents that contain the n-gram at least once. Rows appear in order
        of first occurrence.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    from collections import Counter

    if n < 1:
        raise ValueError("n must be a positive integer")

    tf = Counter()
    df = Counter()
    for text in data:
        # Each document is fetched once; a document shorter than n yields no
        # tokens because the range below is empty.
        tokens = [text[i:i + n] for i in range(len(text) - n + 1)]
        # NOTE: stop-word filtering was sketched here originally but never enabled.
        tf.update(tokens)
        # A set counts each n-gram at most once per document.
        df.update(set(tokens))
    tfdf = {token: [count, df[token]] for token, count in tf.items()}
    return pd.DataFrame.from_dict(tfdf, orient='index', columns=['tf', 'df'])

In [6]:
# Treat title and body as a single text per article.
data["all"] = data["title"] + data["content"]
rise = data.loc[data["tag"] == "rise"]
fall = data.loc[data["tag"] == "fall"]

# Character 2-, 3- and 4-gram statistics for the whole corpus ...
data_2_gram, data_3_gram, data_4_gram = (
    get_ngrams(size, data["all"]) for size in (2, 3, 4)
)

# ... for the articles tagged "fall" ...
fall_2_gram, fall_3_gram, fall_4_gram = (
    get_ngrams(size, fall["all"]) for size in (2, 3, 4)
)

# ... and for the articles tagged "rise".
rise_2_gram, rise_3_gram, rise_4_gram = (
    get_ngrams(size, rise["all"]) for size in (2, 3, 4)
)

In [7]:
# Stack the 2-gram and 3-gram tables row-wise; the 4-gram tables are not included.
data_all_gram = pd.concat([data_2_gram, data_3_gram])
rise_all_gram = pd.concat([rise_2_gram, rise_3_gram])
fall_all_gram = pd.concat([fall_2_gram, fall_3_gram])

In [8]:
# Peek at the combined table (the result is not assigned to anything).
data_all_gram.head()

# Keep only n-grams whose total term frequency is at least 40.
data_all_gram = data_all_gram.loc[data_all_gram["tf"] >= 40]
rise_all_gram = rise_all_gram.loc[rise_all_gram["tf"] >= 40]
fall_all_gram = fall_all_gram.loc[fall_all_gram["tf"] >= 40]

# Report how many n-grams survive in each table.
print(data_all_gram.shape)
print(rise_all_gram.shape)
print(fall_all_gram.shape)

(16826, 2)
(7597, 2)
(8793, 2)


In [9]:
def remove_same(df):
    """Drop terms that are redundant with a longer term containing them.

    A term is dropped when some strictly longer term in the frame contains it
    as a substring and their document frequencies differ by at most 1% of the
    larger of the two values.

    Args:
        df: DataFrame indexed by term. The second column (position 1) is read
            as the document frequency.

    Returns:
        A new DataFrame ordered by ascending term length (input order is kept
        among terms of equal length) without the redundant shorter terms. The
        input frame is left unmodified.
    """
    terms = list(df.index)
    # sorted() is stable, so the row order is deterministic, and working on a
    # positional selection avoids adding a temporary column to the caller's frame.
    order = sorted(range(len(terms)), key=lambda pos: len(terms[pos]))
    ordered = df.iloc[order]
    terms = [terms[pos] for pos in order]
    # Read the document frequencies once instead of one iloc lookup per pair.
    doc_freqs = ordered.iloc[:, 1].tolist()

    same_drop = set()
    for i, shorter in enumerate(terms):
        shorter_df = doc_freqs[i]
        for j in range(i + 1, len(terms)):
            longer = terms[j]
            # Row i must be strictly shorter than row j (e.g. a 2-gram inside
            # a 3-gram) and be contained in it.
            if len(shorter) < len(longer) and shorter in longer:
                longer_df = doc_freqs[j]
                # The two document frequencies differ by no more than 1%.
                if abs(shorter_df - longer_df) <= max(shorter_df, longer_df) * 0.01:
                    same_drop.add(shorter)
                    break
    return ordered.drop(index=list(same_drop))

In [10]:
# Prune redundant shorter n-grams from the corpus, rise and fall tables in turn.
data_all_gram, rise_all_gram, fall_all_gram = map(
    remove_same, (data_all_gram, rise_all_gram, fall_all_gram)
)

In [11]:
import numpy as np
# Weight = (1 + ln tf) * ln(N / df).
# NOTE(review): N is hard-coded per table (4806 / 2263 / 2543); these equal the
# article counts printed after filtering the tags - presumably the number of
# documents behind each table, confirm they still hold after dropna().
data_all_gram["tfidf"] = (1 + np.log(data_all_gram["tf"])) * np.log(4806 / data_all_gram["df"])
rise_all_gram["tfidf"] = (1 + np.log(rise_all_gram["tf"])) * np.log(2263 / rise_all_gram["df"])
fall_all_gram["tfidf"] = (1 + np.log(fall_all_gram["tf"])) * np.log(2543 / fall_all_gram["df"])

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [18]:
def get_final_df(df, docs, all_df, all_docs,feature_count,usage):
    """Score the terms of one class against the whole corpus and keep the best.

    Args:
        df: Per-class term table indexed by term with columns ``tf``, ``df``
            and ``tfidf``.
        docs: Number of documents in the class.
        all_df: Whole-corpus term table indexed by term with exactly three
            columns, read positionally as term frequency, document frequency
            and tf-idf. It is not modified.
        all_docs: Number of documents in the whole corpus.
        feature_count: Number of top-ranked terms to return.
        usage: Name of the column to rank by in descending order, e.g.
            ``"mi*tfidf"``, ``"df_chi"``, ``"midf"`` or ``"lift"``.

    Returns:
        A DataFrame holding the ``feature_count`` highest-ranked terms with the
        class columns, the corpus columns (``all_tf``, ``all_df``,
        ``all_tf-idf``) and the derived scores ``midf``, ``tf_ev``, ``df_ev``,
        ``tf_chi``, ``df_chi``, ``lift`` and ``mi*tfidf``.
    """
    # Rename on a copy: assigning to all_df.columns directly would rename the
    # columns of the caller's frame as a side effect.
    corpus = all_df.copy()
    corpus.columns = ['all_tf', 'all_df', 'all_tf-idf']
    merged = pd.merge(df, corpus, left_index=True, right_index=True, how='left')

    # NOTE(review): this equals log(lift) - log(all_docs), i.e. the all_docs
    # factor of pointwise mutual information is absent. The constant offset
    # does not change the 'midf' ranking but does change 'mi*tfidf' - confirm
    # whether that is intended.
    merged['midf'] = np.log(merged['df'] / (merged['all_df'] * docs))
    # Expected class counts if the term were spread evenly over the corpus.
    merged['tf_ev'] = merged['all_tf'] / all_docs * docs
    merged['df_ev'] = merged['all_df'] / all_docs * docs
    # Signed chi-square terms: positive when the class count exceeds the expectation.
    merged['tf_chi'] = ((merged['tf'] - merged['tf_ev']) ** 2 / merged['tf_ev']) * np.sign(merged['tf'] - merged['tf_ev'])
    merged['df_chi'] = ((merged['df'] - merged['df_ev']) ** 2 / merged['df_ev']) * np.sign(merged['df'] - merged['df_ev'])
    merged["lift"] = (merged['df'] / docs) / (merged['all_df'] / all_docs)
    merged["mi*tfidf"] = merged["midf"] * merged["tfidf"]

    return merged.sort_values(usage, ascending=False)[:feature_count]

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter hides all warning categories, not just
# a specific one - confirm that is intended.
import warnings
warnings.filterwarnings('ignore')



# Sweep every ranking criterion and keyword budget, train a logistic regression
# on keyword-presence features and report the hold-out accuracy.
# NOTE(review): the keyword tables are built from all articles before the
# train/test split below, so the reported accuracy may be optimistic.
documents = data["all"].tolist()  # fetched once instead of one .iloc lookup per cell
for criterion in ["mi*tfidf","df_chi","midf","lift"]:
    for top_k in [1000,2000,3000,4000]:
        fall = get_final_df(fall_all_gram, 2543, data_all_gram, 4806, top_k, criterion)
        rise = get_final_df(rise_all_gram, 2263, data_all_gram, 4806, top_k, criterion)
        # Keywords ranked highly for both classes are removed from both lists.
        drop = set(fall.index).intersection(rise.index)
        fall = fall.drop(list(drop))
        rise = rise.drop(list(drop))
        total = list(fall.index) + list(rise.index)
        # Binary presence matrix: one row per article, one column per keyword.
        feature = np.zeros((data.shape[0], len(total)))
        for row, text in enumerate(documents):
            for col, term in enumerate(total):
                if term in text:
                    feature[row, col] = 1
        # NOTE: stratification (stratify=data["tag"]) is currently disabled.
        x_train, x_test, y_train, y_test = train_test_split(
            feature, data["tag"], test_size=0.15, random_state=1, shuffle=True
        )
        logistic = LogisticRegression(random_state=0, max_iter=100000, C=0.1)
        logistic.fit(x_train, y_train)
        y_pred = logistic.predict(x_test)
        print("Under %d keyswords and in %s mode,the accuracy=%.5f" %(top_k,criterion,accuracy_score(y_test, y_pred)))

Under 1000 keyswords and in mi*tfidf mode,the accuracy=0.61026
Under 2000 keyswords and in mi*tfidf mode,the accuracy=0.65049
Under 3000 keyswords and in mi*tfidf mode,the accuracy=0.68793
Under 4000 keyswords and in mi*tfidf mode,the accuracy=0.66713
Under 1000 keyswords and in df_chi mode,the accuracy=0.72399
Under 2000 keyswords and in df_chi mode,the accuracy=0.71429
Under 3000 keyswords and in df_chi mode,the accuracy=0.69071
Under 4000 keyswords and in df_chi mode,the accuracy=0.64771
Under 1000 keyswords and in midf mode,the accuracy=0.73232
Under 2000 keyswords and in midf mode,the accuracy=0.72399
Under 3000 keyswords and in midf mode,the accuracy=0.69487
Under 4000 keyswords and in midf mode,the accuracy=0.65049
Under 1000 keyswords and in lift mode,the accuracy=0.73370
Under 2000 keyswords and in lift mode,the accuracy=0.72677
Under 3000 keyswords and in lift mode,the accuracy=0.69626
Under 4000 keyswords and in lift mode,the accuracy=0.65049


In [19]:
# Top 1000 keywords per class ranked by lift.
fall = get_final_df(
    df=fall_all_gram, docs=2543,
    all_df=data_all_gram, all_docs=4806,
    feature_count=1000, usage="lift",
)
rise = get_final_df(
    df=rise_all_gram, docs=2263,
    all_df=data_all_gram, all_docs=4806,
    feature_count=1000, usage="lift",
)

In [20]:
# Keywords that made both lists are removed from both.
drop = set(fall.index) & set(rise.index)
fall = fall.drop(index=list(drop))
rise = rise.drop(index=list(drop))

In [21]:
# Final keyword list: fall keywords first, then rise keywords.
total = [*fall.index, *rise.index]
print(len(total))

2000


In [22]:
# Binary presence matrix: feature[k, col] is 1 when keyword total[col] occurs
# in the text of article k, else 0.
feature = np.zeros((data.shape[0], len(total)))
# The texts are fetched once instead of one .iloc lookup per matrix cell.
for k, text in enumerate(data["all"].tolist()):
    for col, term in enumerate(total):
        if term in text:
            feature[k, col] = 1

In [69]:
from imblearn.over_sampling import SMOTE
# Hold out 15% of the articles for testing.
x_train, x_test, y_train, y_test = train_test_split(
    feature, data["tag"], test_size=0.15, random_state=2, shuffle=True
)

# NOTE: despite its name, `forest` is a logistic-regression model.
forest = LogisticRegression(C=0.19, max_iter=100, random_state=0)
forest.fit(x_train, y_train)
y_train_pred = forest.predict(x_train)
y_pred = forest.predict(x_test)
# The first line printed is the training accuracy, the second the test accuracy.
print('Accuracy (forest): %.5f' % accuracy_score(y_train, y_train_pred))
print('Accuracy (forest): %.5f' % accuracy_score(y_test, y_pred))

Accuracy (forest): 0.87243
Accuracy (forest): 0.74619
