<a href="https://colab.research.google.com/github/sjdee/Market-Analysis-Techniques/blob/master/NaiveBayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import classification_report
import numpy as np
import warnings
warnings.filterwarnings("ignore")


def run_classifier(pDf, minify_data, less_columns, print_data):
  """Fit and score Gaussian and Multinomial Naive Bayes for every label column.

  The last ten columns of the working frame are used as label columns. For
  each of them the data is split 70/30 (``random_state=0``), the features
  are min-max scaled to [0, 1] with a scaler fitted on the training split
  only, and both classifiers are trained and evaluated. Every evaluation is
  passed to ``accumulate_data`` (Gaussian first, then Multinomial), which
  prints the report and appends rows to the list returned here.

  Rows with missing values are dropped from a new frame, so the DataFrame
  supplied by the caller is never modified.

  Args:
    pDf: pandas DataFrame containing the feature and label columns.
    minify_data: forwarded to ``accumulate_data`` to choose the row layout.
    less_columns: when true, restrict the frame to seven named feature
      columns plus ``day1`` .. ``day10``; otherwise the columns at
      positions ``5:-12`` are used as features.
    print_data: accepted for interface compatibility; not used here.

  Returns:
    The list of report rows filled in by ``accumulate_data``.
  """
  if less_columns:
    selected_columns = [
        'BEST_ANALYST_RATING', 'RETURN_ON_ASSET', 'BEST_TARGET_PRICE',
        'CUR_MKT_CAP', 'SHORT_INT', 'TOT_BUY_REC', 'TOT_SELL_REC',
        'day1', 'day2', 'day3', 'day4', 'day5',
        'day6', 'day7', 'day8', 'day9', 'day10',
    ]
    # dropna() returns a new frame; the caller's data stays as it was.
    df = pDf.loc[:, selected_columns].dropna()
    features = df.iloc[:, :-10]
  else:
    df = pDf.dropna()
    features = df.iloc[:, 5:-12]
  labels = df.iloc[:, -10:]

  report_data = []

  # Classifier classes are instantiated afresh for every label column.
  models = (
      ("Naives Bayes (Gaussian)", GaussianNB),
      ("Naives Bayes (Multinomial)", MultinomialNB),
  )
  # Order matters: accumulate_data reads f1_scores by position.
  f1_averages = ('macro', 'micro', 'weighted')

  for day_name, day_labels in labels.items():
    # Fixed random_state so every label column gets the same row split.
    X_train, X_test, y_train, y_test = train_test_split(
        features, day_labels, test_size=0.3, random_state=0)

    # Fit the scaler on the training split only, then apply it to both.
    scaler = MinMaxScaler(feature_range=(0, 1)).fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    for model_name, make_model in models:
      y_pred = make_model().fit(X_train, y_train).predict(X_test)

      accuracy = metrics.accuracy_score(y_test, y_pred)
      report = classification_report(y_test, y_pred)
      f1_scores = [
          metrics.f1_score(y_test, y_pred, average=average)
          for average in f1_averages
      ]

      accumulate_data(report_data, report, accuracy, day_name, model_name,
                      minify_data, f1_scores)

  return report_data

In [0]:
def accumulate_data(report_data, report, accuracy, day_name, model_name, minify_data, f1_scores):
  """Print a classification report and append its figures to ``report_data``.

  The textual report is parsed line by line: the first two lines and the
  last five entries of ``report.split('\n')`` are skipped, and every
  remaining line is treated as one per-class row whose columns are
  separated by runs of five spaces.

  Args:
    report_data: list that receives the new row dictionaries (mutated).
    report: text report as produced by ``classification_report``.
    accuracy: accuracy score stored on every row.
    day_name: name of the label column, e.g. ``"day1"``.
    model_name: display name of the classifier.
    minify_data: when true, append a single summary row holding accuracy,
      the three F1 averages, the recall of class ``0.0`` (``sell_recall``)
      and the precision of class ``2.0`` (``buy_precison``); otherwise
      append one row per class with precision, recall, f1 and support.
    f1_scores: sequence ``[macro, micro, weighted]`` of F1 scores; only
      read in the minified layout.

  Returns:
    ``report_data``, the same list object that was passed in.
  """
  print(day_name)
  print(report)

  # NOTE: splitting on exactly five spaces depends on the column widths of
  # the report text; field 0 is the empty string left of the label padding.
  class_rows = [line.split('     ') for line in report.split('\n')[2:-5]]

  if minify_data:
    row = {
        'day': day_name.replace("day", ""),
        'accuracy': accuracy,
        'f1score_macro': f1_scores[0],
        'f1score_micro': f1_scores[1],
        'f1score_weigthed': f1_scores[2],
        'model': model_name,
    }

    for fields in class_rows:
      class_label = float(fields[1])
      # recall of the "sell" class
      if class_label == 0.0:
        row['sell_recall'] = float(fields[3])
      # precision of the "buy" class
      if class_label == 2.0:
        row['buy_precison'] = float(fields[2])

    report_data.append(row)

  else:
    for fields in class_rows:
      report_data.append({
          'model': model_name,
          'accuracy': accuracy,
          # Use the day passed in; the enclosing loop's variables of the
          # caller are not visible from this function.
          'day': day_name,
          'class': fields[1],
          'precision': float(fields[2]),
          'recall': float(fields[3]),
          'f1_score': float(fields[4]),
          'support': float(fields[5]),
      })

  return report_data

In [0]:
# Sector names used to select rows when the classifier is run per sector.
sectors = [
    'Communication Services',
    'Consumer Discretionary',
    'Consumer Staples',
    'Energy',
    'Financials',
    'Health Care',
    'Industrials',
    'Information Technology',
    'Materials',
    'Real Estate',
    'Utilities',
]

In [0]:
# Background reading on Naive Bayes:
# https://www.analyticsvidhya.com/blog/2017/09/naive-bayes-explained/

# Notebook shell command: install/upgrade PyDrive with quiet output.
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate the Colab user, then build the PyDrive client from the
#    application-default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)


# Google Drive ID of the input workbook.
file_id = '18pa4iuqvz2SX5RYrUdn09bDU8eNm2hqI'


# 2. Load the file by ID and store its content locally under the name
#    'sp500_transformation_input.xlsx'.
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('sp500_transformation_input.xlsx')

In [0]:
import pandas as pd

# Read the workbook saved by the Drive download step into a DataFrame.
df = pd.read_excel('sp500_transformation_input.xlsx')

In [0]:
# Index the rows by sector (in place) so that df.loc[<sector name>] selects
# the rows belonging to one sector.
df.set_index('Sector', inplace=True)

In [0]:
# Run configuration read by the classification cell.
less_columns = True        # run_classifier keeps only its reduced column set
minify = True              # accumulate_data appends one summary row per call
check_each_sector = False  # False: one run over all rows; True: one run per sector
print_data = False         # forwarded to run_classifier, which does not read it

In [0]:
from google.colab import files

# Run the classifier either once per sector or once over the whole data
# set; every resulting report is written to CSV and passed to
# files.download.
if check_each_sector:

  for sector_name in sectors:
    print('Running classifier by sector.')

    # df is indexed by 'Sector', so this picks the rows of one sector.
    df_sectorised = df.loc[sector_name]
    report = run_classifier(df_sectorised, minify, less_columns, print_data)

    file_name = f'{sector_name}NaiveBayes_classification_report.csv'
    dataframe = pd.DataFrame.from_dict(report)
    dataframe.to_csv(file_name, index=False)
    files.download(file_name)

else:
  print('Running classifier on all.')

  report = run_classifier(df, minify, less_columns, print_data)

  file_name = 'NaiveBayes_classification_report.csv'
  dataframe = pd.DataFrame.from_dict(report)
  dataframe.to_csv(file_name, index=False)
  files.download(file_name)

Running classifier on all.
day1
              precision    recall  f1-score   support

         0.0       0.45      0.04      0.08     52535
         2.0       0.59      0.96      0.73     73687

    accuracy                           0.58    126222
   macro avg       0.52      0.50      0.40    126222
weighted avg       0.53      0.58      0.46    126222

day1
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00     52535
         2.0       0.58      1.00      0.74     73687

    accuracy                           0.58    126222
   macro avg       0.29      0.50      0.37    126222
weighted avg       0.34      0.58      0.43    126222

day2
              precision    recall  f1-score   support

         0.0       0.73      0.97      0.83     91528
         2.0       0.33      0.04      0.07     34694

    accuracy                           0.71    126222
   macro avg       0.53      0.50      0.45    126222
weighted avg       0.62      0.7