In [None]:
import pandas as pd
import seaborn as sns
import numpy as np

In [None]:
# IGI 
# ADBL *
# AHPC
# AIL *
# AKJCL *
# AKPL *
# BARUN -
# BPCL -
# NLG -
# PICL
# NICLBSL
# RLI
# NIFRA

# HIDCL

# Download the NABIL company-history page and parse every HTML table on it.
# Each parsed table uses its column at position 1 as the index, with date
# parsing switched on.
url = 'http://nepalstockinfo.com/companyhistory/NABIL'
dfs = pd.read_html(
    url,
    index_col=1,
    parse_dates=True,
)

In [None]:
# Pick the table at position 16 of the scraped list as the working frame.
# NOTE(review): the position depends on the page layout; presumably this is
# the price-history table — confirm if the site changes.
nbl_df = dfs[16]
nbl_df.head()

In [None]:
# Last rows of the table in the order it was scraped.
nbl_df.tail()

In [None]:
# Order the rows by 'Date', earliest first (ascending is the default), so the
# rolling and shift operations further down run forward in time.
nbl_df = nbl_df.sort_values(by='Date')
nbl_df.head()

In [None]:
# Show the final 20 rows of the frame.
nbl_df.tail(20)

In [None]:
# Line plot of the 'Price' column against the frame's index.
sns.lineplot(data=nbl_df['Price'])

In [None]:
# Add a 100-row moving average of 'Price'. min_periods=0 lets the first rows
# average over however many observations exist so far instead of giving NaN.
nbl_df['100ma'] = nbl_df['Price'].rolling(100, min_periods=0).mean()
nbl_df.head()

In [None]:
# Remove the columns that are not used in the rest of the analysis.
# The labels are passed through `columns=` because pandas 2.0 made `axis`
# keyword-only in DataFrame.drop: the old positional form `drop([...], 1, ...)`
# raises TypeError there. The result is rebound rather than dropped in place.
nbl_df = nbl_df.drop(
    columns=['S.N', 'Prev Price', 'Change', 'Max Price', 'Min Price',
             'Transaction', 'Volume', 'Amount', '100ma'],
)
nbl_df.head()

In [None]:
# Pairwise correlation between the columns of nbl_df, computed on raw values.
nbl_df_corr = nbl_df.corr()
nbl_df_corr.head()

In [None]:
# Correlation of row-to-row percentage changes instead of raw values.
# This rebinds nbl_df_corr, replacing the raw-value correlation.
nbl_df_corr = nbl_df.pct_change().corr()
nbl_df_corr.head()

In [None]:
def process_data_for_labels(df, hm_days=7):
    """Add the forward-looking return column used to derive trade labels.

    Writes ``df['nbl_shift']``: the fractional change of ``'Price'`` between
    each row and the row ``hm_days`` positions later, i.e.
    ``(Price[t + hm_days] - Price[t]) / Price[t]``.

    The earlier version looped over ``range(hm_days + 1)`` but assigned every
    iteration to the same column, so only the final horizon survived. That
    horizon is now computed once, directly.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'Price'`` column. Modified in place.
    hm_days : int, default 7
        Number of rows to look ahead.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with NaNs replaced by 0 and ``'nbl_shift'``
        added (or overwritten).
    """
    df.fillna(0, inplace=True)

    # A negative shift pulls the future price back onto the current row.
    future_price = df['Price'].shift(-hm_days)
    df['nbl_shift'] = (future_price - df['Price']) / df['Price']

    # The last hm_days rows have no future price; their NaN becomes 0.
    df.fillna(0, inplace=True)
    return df

# Adds 'nbl_shift' to nbl_df in place; the returned frame is the cell output.
process_data_for_labels(nbl_df)    

In [None]:
def buy_sell_hold(*args, requirement=0.01):
    """Map one or more fractional price changes to a trade label.

    The changes are examined in the order given; the first one whose
    magnitude exceeds ``requirement`` decides the label.

    Parameters
    ----------
    *args : float
        Fractional price changes (0.02 means +2%).
    requirement : float, default 0.01
        Symmetric threshold. A change above ``+requirement`` is a buy, a
        change below ``-requirement`` is a sell.

    Returns
    -------
    int
        1 for buy, -1 for sell, 0 for hold (no change crossed the threshold,
        or no changes were given).
    """
    for change in args:
        if change > requirement:
            return 1  # buy
        # The sell threshold is the negative of the buy threshold. Comparing
        # against +requirement would label every change below +1% as a sell
        # and make "hold" practically unreachable.
        if change < -requirement:
            return -1  # sell
    return 0  # hold

In [None]:
from collections import Counter
def extract_featuresets(df):
    """Build the feature matrix and label vector for the classifier.

    Steps:
      1. ``process_data_for_labels`` adds the forward return ``'nbl_shift'``.
      2. ``buy_sell_hold`` turns each forward return into ``'label_target'``.
      3. Rows containing infinities are dropped.
      4. Features are the row-to-row percentage changes of every column
         except ``'nbl_shift'`` and ``'label_target'``. Those two are
         excluded because they are derived from future prices / are the
         label itself, and would leak the answer into ``X``.

    Prints the label distribution as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'Price'`` column. It is modified in place by the
        labelling steps.

    Returns
    -------
    X : numpy.ndarray
        Feature matrix, one row per row of the returned frame.
    y : numpy.ndarray
        Labels taken from ``'label_target'``.
    df : pandas.DataFrame
        The labelled frame after rows with infinities were removed.
    """
    label_cols = ('nbl_shift', 'label_target')

    df = process_data_for_labels(df)
    df['label_target'] = list(map(buy_sell_hold, df['nbl_shift']))

    # Report how the labels are distributed.
    str_vals = [str(label) for label in df['label_target'].values.tolist()]
    print('Data spread:', Counter(str_vals))

    # Turn infinities into NaN so dropna() can remove those rows.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    df.fillna(0, inplace=True)
    df = df.replace([np.inf, -np.inf], np.nan)
    df.dropna(inplace=True)

    # Normalise the input columns only; never the label-derived ones.
    feature_cols = [col for col in df.columns if col not in label_cols]
    df_vals = df[feature_cols].pct_change()
    df_vals = df_vals.replace([np.inf, -np.inf], 0)
    df_vals.fillna(0, inplace=True)

    X = df_vals.values
    y = df['label_target'].values

    return X, y, df
                                          
# Prints the label distribution; the returned (X, y, df) tuple is the cell output.
extract_featuresets(nbl_df)

In [None]:
#from sklearn import svm, cross_validation, neighbors
from sklearn import svm, neighbors
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import VotingClassifier,RandomForestClassifier

#The svm import is for a Support Vector Machine, train_test_split (from sklearn.model_selection,
#which replaced the old cross_validation module) will let us easily create shuffled training and
#testing samples, and neighbors is for K Nearest Neighbors. 

#The voting classifier is just what it sounds like. Basically, it's a classifier that will let us combine
#many classifiers, and allow them to each get a "vote" on what they think the class of the featuresets is. 
#The random forest classifier is just another classifier. We're going to use three classifiers in our voting 
#classifier.

In [None]:
def do_ml(df, test_size=0.25, random_state=42):
    """Train a K-nearest-neighbours classifier on the labelled data and score it.

    Prints the distribution of the predicted labels as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Price data, passed straight to ``extract_featuresets``.
    test_size : float, default 0.25
        Fraction of the samples held out for scoring.
    random_state : int or None, default 42
        Seed for the shuffled train/test split, so repeated runs give the
        same score. Pass ``None`` for a different random split on each call.

    Returns
    -------
    float
        Accuracy of the classifier on the held-out samples.
    """
    X, y, df = extract_featuresets(df)

    # NOTE(review): this is a shuffled split over time-ordered rows — confirm
    # that is acceptable for the intended evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # X holds the percentage-change features, y the buy / sell / hold target.
    clf = neighbors.KNeighborsClassifier()
    clf.fit(X_train, y_train)

    # Score on the held-out samples, then show what the model predicted there.
    confidence = clf.score(X_test, y_test)
    predictions = clf.predict(X_test)

    print('Predicted spread:', Counter(predictions))

    return confidence

# Train and evaluate on nbl_df; the returned score is the cell output.
do_ml(nbl_df)