**Model to classify 'buy' or 'sell' stocks**

In [2]:
import pandas as pd
import os
import pandas_profiling
from pandas_profiling import ProfileReport
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.metrics import silhouette_score, silhouette_samples
import sklearn.metrics
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score, log_loss, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification

import scipy

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
df1=pd.read_csv("2014_Financial_Data.csv")
df2=pd.read_csv("2015_Financial_Data.csv")
df3=pd.read_csv("2016_Financial_Data.csv")
df4=pd.read_csv("2017_Financial_Data.csv")
df5=pd.read_csv("2018_Financial_Data.csv")

In [4]:
df1['Year']=2014
df2['Year']=2015
df3['Year']=2016
df4['Year']=2017
df5['Year']=2018

In [5]:
### Drop the Stock Column
df1 = df1.drop(df1.columns[0], axis = 1)
df2 = df2.drop(df2.columns[0], axis = 1)
df3 = df3.drop(df3.columns[0], axis = 1)
df4 = df4.drop(df4.columns[0], axis = 1)
df5 = df5.drop(df5.columns[0], axis = 1)

In [7]:
### Change Data Type
df1['Class'] = df1['Class'].astype(object)
df2['Class'] = df2['Class'].astype(object)
df3['Class'] = df3['Class'].astype(object)
df4['Class'] = df4['Class'].astype(object)
df5['Class'] = df5['Class'].astype(object)

In [8]:
df5['Class'] = df5['Class'].astype(object)
df1.rename(columns={"2015 PRICE VAR [%]": "PRICE_VAR"},inplace=True)
df2.rename(columns={"2016 PRICE VAR [%]": "PRICE_VAR"},inplace=True)
df3.rename(columns={"2017 PRICE VAR [%]": "PRICE_VAR"},inplace=True)
df4.rename(columns={"2018 PRICE VAR [%]": "PRICE_VAR"},inplace=True)
df5.rename(columns={"2019 PRICE VAR [%]": "PRICE_VAR"},inplace=True)

In [9]:
df1.drop(['Sector'], axis=1, inplace=True)
df2.drop(['Sector'], axis=1, inplace=True)
df3.drop(['Sector'], axis=1, inplace=True)
df4.drop(['Sector'], axis=1, inplace=True)
df5.drop(['Sector'], axis=1, inplace=True)

In [10]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=20, weights='distance', metric='nan_euclidean', copy=True)

In [11]:
df1_clean = imputer.fit_transform(df1)
df1_clean = pd.DataFrame(df1_clean)
df1_clean.columns = list(df1)

In [12]:
df2_clean = imputer.fit_transform(df2)
df2_clean = pd.DataFrame(df2_clean)
df2_clean.columns = list(df2)
df3_clean = imputer.fit_transform(df3)
df3_clean = pd.DataFrame(df3_clean)
df3_clean.columns = list(df3)
df4_clean = imputer.fit_transform(df4)
df4_clean = pd.DataFrame(df4_clean)
df4_clean.columns = list(df4)
df5_clean = imputer.fit_transform(df5)
df5_clean = pd.DataFrame(df5_clean)
df5_clean.columns = list(df5)


In [13]:
##### Check Missing Value Again
df1_clean.isnull().sum()
df2_clean.isnull().sum()
df3_clean.isnull().sum()
df4_clean.isnull().sum()
df5_clean.isnull().sum()

Revenue                 0
Revenue Growth          0
Cost of Revenue         0
Gross Profit            0
R&D Expenses            0
                       ..
R&D Expense Growth      0
SG&A Expenses Growth    0
PRICE_VAR               0
Class                   0
Year                    0
Length: 224, dtype: int64

Revenue                 0
Revenue Growth          0
Cost of Revenue         0
Gross Profit            0
R&D Expenses            0
                       ..
R&D Expense Growth      0
SG&A Expenses Growth    0
PRICE_VAR               0
Class                   0
Year                    0
Length: 224, dtype: int64

Revenue                 0
Revenue Growth          0
Cost of Revenue         0
Gross Profit            0
R&D Expenses            0
                       ..
R&D Expense Growth      0
SG&A Expenses Growth    0
PRICE_VAR               0
Class                   0
Year                    0
Length: 224, dtype: int64

Revenue                 0
Revenue Growth          0
Cost of Revenue         0
Gross Profit            0
R&D Expenses            0
                       ..
R&D Expense Growth      0
SG&A Expenses Growth    0
PRICE_VAR               0
Class                   0
Year                    0
Length: 224, dtype: int64

Revenue                 0
Revenue Growth          0
Cost of Revenue         0
Gross Profit            0
R&D Expenses            0
                       ..
R&D Expense Growth      0
SG&A Expenses Growth    0
PRICE_VAR               0
Class                   0
Year                    0
Length: 224, dtype: int64

In [62]:
data = pd.concat([df1_clean, df2_clean, df3_clean, df4_clean, df5_clean])

In [63]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22077 entries, 0 to 4391
Columns: 224 entries, Revenue to Year
dtypes: float64(224)
memory usage: 37.9 MB


In [64]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
X=data.drop('Class', axis=1)
y=data['Class']
svc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
model = SelectFromModel(svc, prefit=True)
X_new = model.transform(data.drop('Class', axis=1))
print(X_new.shape)
feature_idx = model.get_support()
feature_name = X.columns[feature_idx]
feature_name=list(feature_name)

(22077, 49)


  f"X has feature names, but {self.__class__.__name__} was fitted without"


In [65]:
feature_name.append('Class')
data_info=data[feature_name]
data=data_info
data.columns

Index(['EBITDA Margin', 'EBIT Margin', 'Profit Margin',
       'Earnings Before Tax Margin', 'Net Profit Margin', 'priceToSalesRatio',
       'priceEarningsRatio', 'priceToFreeCashFlowsRatio',
       'priceToOperatingCashFlowsRatio', 'priceCashFlowRatio',
       'priceEarningsToGrowthRatio', 'priceSalesRatio',
       'enterpriseValueMultiple', 'ebtperEBIT', 'eBTperEBIT',
       'inventoryTurnover', 'fixedAssetTurnover', 'currentRatio', 'quickRatio',
       'daysOfSalesOutstanding', 'operatingCycle', 'daysOfPayablesOutstanding',
       'cashConversionCycle', 'interestCoverage', 'cashFlowToDebtRatio',
       'operatingCashFlowSalesRatio', 'cashFlowCoverageRatios',
       'capitalExpenditureCoverageRatios',
       'dividendpaidAndCapexCoverageRatios', 'dividendPayoutRatio',
       'EV to Sales', 'Enterprise Value over EBITDA',
       'EV to Operating cash flow', 'EV to Free cash flow', 'Earnings Yield',
       'Net Debt to EBITDA', 'Interest Coverage', 'Income Quality',
       'Graham Num

In [66]:
# Data missing information 
data_info=pd.DataFrame(data.dtypes).T.rename(index={0:'column type'})
data_info=data_info.append(pd.DataFrame(data.isnull().sum()).T.rename(index={0:'null values (nb)'}))
data_info=data_info.append(pd.DataFrame(data.isnull().sum()/data.shape[0]*100).T.
                         rename(index={0:'null values (%)'}))
display(data_info)

Unnamed: 0,EBITDA Margin,EBIT Margin,Profit Margin,Earnings Before Tax Margin,Net Profit Margin,priceToSalesRatio,priceEarningsRatio,priceToFreeCashFlowsRatio,priceToOperatingCashFlowsRatio,priceCashFlowRatio,...,Days of Inventory on Hand,Inventory Turnover,Operating Income Growth,EPS Diluted Growth,Operating Cash Flow growth,Free Cash Flow growth,R&D Expense Growth,PRICE_VAR,Year,Class
column type,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,...,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
null values (nb),0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
null values (%),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [67]:
# Train Year: 2014 - 2016
# Test Year:  2017 - 2018
all_year = set(data['Year'].unique())
test_year = set(range(2017,2018,1))
train_year = all_year - test_year

len(train_year), len(test_year), len(all_year)

train = data[data['Year'].isin(train_year)]
test = data[data['Year'].isin(test_year)]

(4, 1, 5)

In [69]:
class_ratio = len(train[train['Class']==1]) / len(train.index)
class_ratio

len(test) / len(data)
len(train) / len(data)

0.6303090494829702

0.224668206730987

0.775331793269013

In [70]:
data.drop(['Year','PRICE_VAR'], axis=1, inplace=True)
train.drop(['Year','PRICE_VAR'], axis=1, inplace=True)
test.drop(['Year','PRICE_VAR'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [71]:
test2=test.copy()
print(test.columns)
print(test2.columns)
print(data.columns)

Index(['EBITDA Margin', 'EBIT Margin', 'Profit Margin',
       'Earnings Before Tax Margin', 'Net Profit Margin', 'priceToSalesRatio',
       'priceEarningsRatio', 'priceToFreeCashFlowsRatio',
       'priceToOperatingCashFlowsRatio', 'priceCashFlowRatio',
       'priceEarningsToGrowthRatio', 'priceSalesRatio',
       'enterpriseValueMultiple', 'ebtperEBIT', 'eBTperEBIT',
       'inventoryTurnover', 'fixedAssetTurnover', 'currentRatio', 'quickRatio',
       'daysOfSalesOutstanding', 'operatingCycle', 'daysOfPayablesOutstanding',
       'cashConversionCycle', 'interestCoverage', 'cashFlowToDebtRatio',
       'operatingCashFlowSalesRatio', 'cashFlowCoverageRatios',
       'capitalExpenditureCoverageRatios',
       'dividendpaidAndCapexCoverageRatios', 'dividendPayoutRatio',
       'EV to Sales', 'Enterprise Value over EBITDA',
       'EV to Operating cash flow', 'EV to Free cash flow', 'Earnings Yield',
       'Net Debt to EBITDA', 'Interest Coverage', 'Income Quality',
       'Graham Num

In [55]:
data['Class']= data['Class'].astype('int64')
train['Class']= train['Class'].astype('int64')
test['Class']= test['Class'].astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


# **MLP Classifier**

In [72]:
from sklearn.neural_network import MLPClassifier
X_train=train.drop('Class', axis=1)
Y_train=train['Class']
clf = MLPClassifier(random_state=1, hidden_layer_sizes=150,activation='logistic', solver='lbfgs',alpha=0.2, learning_rate='adaptive',max_iter=300).fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [73]:
pred=clf.predict(test2.drop('Class', axis=1))

In [74]:
confusion_matrix(pred, test['Class'])

array([[1204,  501],
       [2386,  869]])

In [75]:
from sklearn.metrics import roc_auc_score, classification_report
roc_auc_score(test['Class'], pred)

0.4848413069556554

In [76]:
print(classification_report(pred,test['Class']))

              precision    recall  f1-score   support

         0.0       0.34      0.71      0.45      1705
         1.0       0.63      0.27      0.38      3255

    accuracy                           0.42      4960
   macro avg       0.48      0.49      0.42      4960
weighted avg       0.53      0.42      0.40      4960



# Decision Tree Classifier

In [77]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=0)
clf = clf.fit(X_train,Y_train)
#Predict the response for test dataset
y_pred = clf.predict(test2.drop('Class', axis=1))

In [78]:
print(classification_report(y_pred,test['Class']))

              precision    recall  f1-score   support

         0.0       0.44      0.72      0.55      2178
         1.0       0.56      0.28      0.37      2782

    accuracy                           0.47      4960
   macro avg       0.50      0.50      0.46      4960
weighted avg       0.51      0.47      0.45      4960



In [79]:
roc_auc_score(test['Class'], y_pred)

0.49979057804525956

In [80]:
confusion_matrix(y_pred, test['Class'])

array([[1576,  602],
       [2014,  768]])

# **Random Forest Classifier**

In [81]:
from sklearn.ensemble import RandomForestClassifier
#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100)
#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,Y_train)
y_pred = clf.predict(test2.drop('Class', axis=1))

RandomForestClassifier()

In [82]:
print(classification_report(y_pred,test['Class']))

              precision    recall  f1-score   support

         0.0       0.23      0.75      0.36      1106
         1.0       0.80      0.28      0.42      3854

    accuracy                           0.39      4960
   macro avg       0.52      0.52      0.39      4960
weighted avg       0.67      0.39      0.41      4960



In [83]:
confusion_matrix(pred, test['Class'])

array([[1204,  501],
       [2386,  869]])

In [84]:
roc_auc_score(test['Class'], y_pred)

0.5168859158652379

# **Logistic Regression**

In [85]:
from sklearn.linear_model import LogisticRegression

# instantiate the model (using the default parameters)
logreg = LogisticRegression()

# fit the model with data
logreg.fit(X_train,Y_train)

#
y_pred=logreg.predict(test2.drop('Class', axis=1))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [86]:
print(classification_report(y_pred,test['Class']))

              precision    recall  f1-score   support

         0.0       0.00      1.00      0.01        13
         1.0       1.00      0.28      0.43      4947

    accuracy                           0.28      4960
   macro avg       0.50      0.64      0.22      4960
weighted avg       1.00      0.28      0.43      4960



In [87]:
confusion_matrix(pred, test['Class'])

array([[1204,  501],
       [2386,  869]])

In [88]:
roc_auc_score(test['Class'], y_pred)

0.5018105849582173

# **Ada Boost**

In [89]:
from sklearn.ensemble import AdaBoostClassifier
# Create adaboost classifer object
abc = AdaBoostClassifier(n_estimators=50,
                         learning_rate=1)
# Train Adaboost Classifer
model = abc.fit(X_train, Y_train)

#Predict the response for test dataset
y_pred = model.predict(test2.drop('Class', axis=1))

In [90]:
train_x=X_train
train_y=Y_train
test_x=test2.drop('Class', axis=1)
test_y=test['Class']

In [91]:
print(classification_report(y_pred,test['Class']))

              precision    recall  f1-score   support

         0.0       0.35      0.73      0.47      1740
         1.0       0.65      0.28      0.39      3220

    accuracy                           0.44      4960
   macro avg       0.50      0.50      0.43      4960
weighted avg       0.55      0.44      0.42      4960



In [92]:
confusion_matrix(pred, test['Class'])

array([[1204,  501],
       [2386,  869]])

In [93]:
roc_auc_score(test['Class'], y_pred)

0.502321940507899

GB Classifier

In [94]:
from sklearn.linear_model import SGDClassifier, SGDRegressor,LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
basicmodel = GradientBoostingClassifier()
basicmodel = basicmodel.fit(train_x, train_y)
preds1 = basicmodel.predict(test_x)
acc1=accuracy_score(test_y, preds1)
print("Accuracy is {}".format(acc1*100))
print(classification_report(test_y, preds1))
print(confusion_matrix(test_y, preds1))

Accuracy is 42.9233870967742
              precision    recall  f1-score   support

         0.0       0.72      0.35      0.47      3590
         1.0       0.27      0.64      0.38      1370

    accuracy                           0.43      4960
   macro avg       0.49      0.49      0.43      4960
weighted avg       0.59      0.43      0.45      4960

[[1255 2335]
 [ 496  874]]


SGD Classifier

In [95]:
basicmodel= SGDClassifier(loss='modified_huber', max_iter=5, random_state=0, shuffle=True)
basicmodel = basicmodel.fit(train_x, train_y)
preds1 = basicmodel.predict(test_x)
acc1=accuracy_score(test_y, preds1)
print("Accuracy is {}".format(acc1*100))
print(classification_report(test_y, preds1))
print(confusion_matrix(test_y, preds1))

Accuracy is 47.66129032258065
              precision    recall  f1-score   support

         0.0       0.72      0.45      0.56      3590
         1.0       0.27      0.54      0.36      1370

    accuracy                           0.48      4960
   macro avg       0.50      0.49      0.46      4960
weighted avg       0.60      0.48      0.50      4960

[[1630 1960]
 [ 636  734]]




Support Vector Classifier

In [None]:
from sklearn import svm
clf = svm.SVC(kernel='linear', C = 1.0)
clf= clf.fit(train_x,train_y)
preds1=clf.predict(test_x)
acc1=accuracy_score(test_y, preds1)
print("Accuracy is {}".format(acc1*100))
print(classification_report(test_y, preds1))
print(confusion_matrix(test_y, preds1))