In [1]:
import pandas as pd
import numpy as np
import pandas.io.data as web
from pandas.tseries.offsets import BDay
from sklearn.preprocessing import normalize as Normal
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
import statsmodels.api as sm
import os

The pandas.io.data module is moved to a separate package (pandas-datareader) and will be removed from pandas in a future version.
After installing the pandas-datareader package (https://github.com/pydata/pandas-datareader), you can change the import ``from pandas.io import data, wb`` to ``from pandas_datareader import data, wb``.


## Class and Functions

In [2]:
class KeywordAnalyzer:
    
    # constructor: pass in the keyword data frame, and the list of tickers associated with company names
    def __init__(self, keywords_df, ticker_df ):
        self.__Keywords=keywords_df.copy()
        self.__Keywords['Ticker']=self.__Keywords['Company Name'].apply(lambda x:ticker_df[ticker_df.Name==x]['Ticker'].tolist()[0])
        self.__Keywords['Ticker']=self.__Keywords['Ticker'].apply(lambda x:x.split(' ')[0])
        try:
            self.__Keywords['Date Filed']=self.__Keywords['Date Filed'].apply(lambda x: pd.to_datetime(x, format='%Y-%M-%d'))
        except:
            pass
        
        #initialize response dataframe
        self.__Y=pd.DataFrame(index=keywords_df.index)
        #initialize return time series
        self.__TS=pd.DataFrame()
        
        
    # display       
    def get_X(self):
        return self.__Keywords
    
    
    # get Y data as response
    def get_Y(self, kind='Return'):
        if(kind in self.__Y.columns.tolist()): #already calculate
            pass
        else:
            self.__Y[kind]=self.__Keywords.apply(lambda x: get_Ticker_Y(x.loc['Ticker'], x.loc['Date Filed'], kind),axis=1)
            
        return self.__Y
            
    # get stock return time series
    def get_TS(self, window=10):
        if(str(window-1) in self.__TS.columns.tolist()): #already called
            pass
        else:
            #tmp_df=pd.DataFrame(index=self.__TS.index, columns=np.arange(window,-1,-1))
            for i,row in self.__Keywords.iterrows():
                tmp=get_Ticker_TS(row.Ticker, row['Date Filed'], window)
                tmp.name=i
                self.__TS=self.__TS.append(tmp)
             
        return (self.__TS)
    
    def prep_Classifier(self, kind='Return', normalize=True):
        tmp=self.__Keywords.drop(['Ticker','CIK','Company Name','Date Filed'],axis=1).copy()
        if(normalize):
            TS_mat=Normal(self.__TS.as_matrix(),axis=0)
            KW_mat=Normal(tmp.as_matrix(),axis=0)
        else:
            TS_mat=self.__TS.as_matrix()
            KW_mat=tmp.as_matrix()
                
        label=(self.__Y[kind].as_matrix().T>0)*2-1
        
        return TS_mat, KW_mat, label
        

In [3]:
#helper function
def get_Ticker_Y(ticker, date, kind, window=22, window2=5):
    #get trading day
    if(date==date+BDay(0)): # if date is a trading day
        T0=date
        T1=date+BDay(1)
    else:
        T0=date-BDay(1)
        T1=date+BDay(1)
    
    
    if(kind=='Return'):# get T+1 return
        try:
            price = web.DataReader(ticker, 'yahoo', T0, T1)
        except:
            try:
                price = web.DataReader(ticker, 'google', T0, T1)
            except:
                return 0
        
        try:
            return price.Close[1]/price.Close[0]-1
        except:
            return 0
        
    elif(kind=='Return_Z'): # get T+1 Z score of return
        try:
            price = web.DataReader(ticker, 'yahoo', date-BDay(window), T1)
        except:
            try:
                price = web.DataReader(ticker, 'google', date-BDay(window), T1)
            except:
                return 0
        
        daily_return=price.Close.pct_change(1)
        return float((daily_return.tail(1)-daily_return.mean())/daily_return.std())
    
    
    elif(kind=='Vol_Ratio'): # get T+1 
        try:
            price = web.DataReader(ticker, 'yahoo', date-BDay(window), T1+BDay(window2-1))
        except:
            try:
                price = web.DataReader(ticker, 'google', date-BDay(window), T1+BDay(window2-1))
            except:
                return 0
            
        daily_return=price.Close.pct_change(1)    
        return float(daily_return.tail(5).std(ddof=0)/daily_return.head(22).std(ddof=0)-1)

        

In [4]:
# helper function
def get_Ticker_TS(ticker, date, window):
    if(date==date+BDay(0)): # if date is a trading day
        T0=date
    else:
        T0=date-BDay(1)
        
        
    try:
        price = web.DataReader(ticker, 'yahoo', T0-BDay(window+3), T0)
    except:
        try:
            price = web.DataReader(ticker, 'google', T0-BDay(window+3), T0)
        except:
            return 0
        
    r=price.Close.pct_change(1).tail(window)
    r.index=['Lag '+ str(i) for i in np.arange(window-1,-1,-1)]
    return r
    
    

## Data Preparation

In [20]:
#Setup working directory 
os.chdir("C:/Users/ziyi/Desktop/Inde project/Independent Study Anseri-20170120T004329Z/Independent Study Anseri")

#Setup Item Parameters 

#Setup Output Parameters
addX1='8K data/X1.csv'
addX2='8K data/X2.csv'
addY='8K data/Y.csv'

In [6]:
#Import Data
df=pd.read_csv("8K data/events_ts_Info.csv", index_col=0)
df2=pd.read_csv("8K data/list_40.csv", index_col=0)
df['Date Filed']=df['Date Filed'].apply(lambda x: pd.to_datetime(x, format='%Y-%m-%d').date())

In [7]:
KA=KeywordAnalyzer(df,df2)

In [8]:
# pull stock return as label
Y=KA.get_Y(kind='Vol_Ratio')

#Pull stock return time series as X
KA.get_TS(window=10)

X1, X2,Y= KA.prep_Classifier(kind='Vol_Ratio',normalize=False)

pd.DataFrame(X1).to_csv(addX1)
pd.DataFrame(X2).to_csv(addX2)
pd.DataFrame(Y).to_csv(addY)

## Linear Regression

In [104]:
X2=pd.DataFrame(X2, columns=df.columns[3:])

In [106]:
X2.sum(axis=0)

ITEM 1.01    106
ITEM 1.02      2
ITEM 1.03      0
ITEM 1.04      0
ITEM 2.01     10
ITEM 2.02    309
ITEM 2.03     51
ITEM 2.04      0
ITEM 2.05      3
ITEM 2.06      2
ITEM 3.01      2
ITEM 3.02     11
ITEM 3.03      0
ITEM 4.01      0
ITEM 4.02      0
ITEM 5.01      0
ITEM 5.02    162
ITEM 5.03     23
ITEM 5.04      1
ITEM 5.05      1
ITEM 5.06      0
ITEM 5.07     80
ITEM 5.08      0
ITEM 6.01      0
ITEM 6.02      0
ITEM 6.03      0
ITEM 6.04      0
ITEM 6.05      0
ITEM 7.01    299
ITEM 8.01    215
ITEM 9.01    743
dtype: int64

In [None]:
df_temp=df.drop(['CIK','Company Name','Date Filed'],axis=1).copy()
df_temp.sum(axis=0)

In [None]:
X2=Normal(df_temp.as_matrix(),axis=1)

In [107]:
X2=X2.iloc[:,[0,5,6,16,21,28,29,30]]

In [108]:
model = sm.OLS(Y,X2)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.892
Model:                            OLS   Adj. R-squared:                  0.891
Method:                 Least Squares   F-statistic:                     1006.
Date:                Tue, 24 Jan 2017   Prob (F-statistic):               0.00
Time:                        19:57:59   Log-Likelihood:                -301.27
No. Observations:                 983   AIC:                             618.5
Df Residuals:                     975   BIC:                             657.7
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
ITEM 1.01      0.3904      0.045      8.610      0.0

## SVM (Normalized)

### Rough Grid

In [160]:
X1.shape

(979, 10)

In [162]:
# Grid method to tune hyper parameters, which is the penalty weight in linear svm
# we only use linear model
t_sample=700

skf = StratifiedKFold(n_splits=10)
parameters = {'C':np.exp(np.arange(-5,15,0.5))}

svc = svm.SVC(kernel='linear')
clf = GridSearchCV(svc, parameters,cv=skf)
clf.fit(X1[:t_sample,:], Y[:t_sample])

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
       error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': array([ 1998.1959 ,  2100.64559,  2208.34799,  2321.57241,  2440.60198,
        2565.73432,  2697.28233,  2835.57495,  2980.95799,  3133.79497])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [165]:
print(clf.best_params_)
best_power=np.log(clf.best_params_['C'])
print('Log is ', best_power)

{'C': 1998.1958951041172}
Log is  7.6


### Fine grid

In [166]:
parameters = {'C':np.exp(np.arange(best_power-0.2,best_power+0.25,0.05))}

clf = GridSearchCV(svc, parameters,cv=skf)
clf.fit(X1[:t_sample,:], Y[:t_sample])

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
       error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': array([ 1635.98443,  1719.86315,  1808.04241,  1900.74273,  1998.1959 ,
        2100.64559,  2208.34799,  2321.57241,  2440.60198,  2565.73432])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [167]:
print(clf.best_params_)
best_power=np.log(clf.best_params_['C'])
print('Log is ', best_power)

{'C': 1635.9844299959257}
Log is  7.4


### Out of sample score

In [169]:
svc = svm.SVC(kernel='linear',C=np.exp(best_power))
svc.fit(X1[:t_sample,:],Y[:t_sample])
pred=svc.predict(X1[t_sample:,:])

In [171]:
confusion_matrix(Y[t_sample:], pred)

array([[170,  11],
       [ 83,  15]])

In [172]:
roc_auc_score(Y[t_sample:], pred)

0.5461438719134063

### SVM Pipeline--Text Only

In [175]:
def SVM_pipeline(X,Y, test_ratio):
    t_sample=int(np.floor(X.shape[0]*test_ratio))
    
    #rough grid
    skf = StratifiedKFold(n_splits=10)
    parameters = {'C':np.exp(np.arange(-5,10,1))}
    svc = svm.SVC(kernel='linear')
    clf = GridSearchCV(svc, parameters,cv=skf)
    clf.fit(X1[:t_sample,:], Y[:t_sample])
    best_power=np.log(clf.best_params_['C'])
    
    #fine grid
    parameters = {'C':np.exp(np.arange(best_power-0.5,best_power+0.6,0.1))}
    clf = GridSearchCV(svc, parameters,cv=skf)
    clf.fit(X[:t_sample,:], Y[:t_sample])
    best_power=np.log(clf.best_params_['C'])
    
    #construct model and predict
    svc = svm.SVC(kernel='linear',C=np.exp(best_power))
    svc.fit(X1[:t_sample,:],Y[:t_sample])
    pred=svc.predict(X1[t_sample:,:])
    c_mat=confusion_matrix(Y[t_sample:], pred)
    auc=roc_auc_score(Y[t_sample:], pred)
    
    return pred, c_mat, auc

In [174]:
pred, c_mat, auc=SVM_pipeline(X2,Y, 0.7)



In [176]:
c_mat

array([[174,  17],
       [ 84,  19]])

In [177]:
auc

0.54773039190769068

### TS+Text

In [184]:
pred, c_mat, auc=SVM_pipeline(np.concatenate([X1,X2],axis=1),Y, 0.7)

In [185]:
c_mat

array([[170,  21],
       [ 83,  20]])

In [186]:
auc

0.54211355665124783

## SVM (Non-Normalized)

In [196]:
X1, X2,Y= KA.prep_Classifier(normalize=False)

In [197]:
pd.DataFrame(X1).to_csv('X1.csv')
pd.DataFrame(X2).to_csv('X2.csv')
pd.DataFrame(Y).to_csv('Y.csv')

In [198]:
# Time series only
pred, c_mat, auc=SVM_pipeline(X1,Y, 0.7)

In [199]:
c_mat

array([[170,  21],
       [ 83,  20]])

In [200]:
auc

0.54211355665124783

In [201]:
# Text only
pred, c_mat, auc=SVM_pipeline(X2,Y, 0.7)

In [202]:
c_mat

array([[180,  11],
       [ 89,  14]])

In [203]:
auc

0.5391653535302191

In [204]:
#Combined
pred, c_mat, auc=SVM_pipeline(np.concatenate([X1,X2],axis=1),Y, 0.7)

In [205]:
c_mat

array([[172,  19],
       [ 83,  20]])

In [206]:
auc

0.54734915874548873