
# Importing libraries

In [None]:
# suppress display of warnings
import warnings
warnings.filterwarnings("ignore")

# 'Pandas' is used for data manipulation and analysis
import pandas as pd 

# 'Numpy' is used for mathematical operations on large, multi-dimensional arrays and matrices
import numpy as np

# 'Matplotlib' is a data visualization library for 2D and 3D plots, built on numpy
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# 'Seaborn' is based on matplotlib; used for plotting statistical graphics
import seaborn as sns

# import 'is_string_dtype' to check if the type of input is string  
from pandas.api.types import is_string_dtype

# import various functions to perform classification
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.tree import export_graphviz
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVC

# import various functions to perform regression
from sklearn.linear_model import SGDClassifier
import statsmodels
import statsmodels.api as sm
from sklearn import linear_model

#importing library for scaling data
from sklearn.preprocessing import MinMaxScaler

# display all columns of the dataframe
pd.options.display.max_columns = None

In [None]:
#setting the plot size using rcParams
plt.rcParams['figure.figsize'] = [15,8]

In [None]:
#importing data set for training the models
df=pd.read_csv("../input/share-market-prediction/Training_.csv")

# Understanding the data set

In [None]:
#printing the first 5 records from the training data set
df.head()

In [None]:
#checking the number of rows and columns in the training data set
df.shape

There are total 4378 rows and 7 columns in the training dataset

In [None]:
#"info" gives us the column names and their data types along with null values if any in the columns
df.info()

From the above output we can see that there are 6 values missing in each column except "S.No". We need to handle these missing values.

In [None]:
# the describe() returns the statistical summary of the numeric variables
df.describe()

The statistical summary contains the count (number of non-null rows), mean, standard deviation, minimum value, quartiles and maximum value of each numeric column. For example, the column named Close contains 4372 values with a maximum value of 91000, a mean value of 26466 and a minimum value of 8040.

# DATA PREPARATION FOR MODEL BUILDING

In [None]:
#The problem statement says that any day on which the closing price is 2% higher than the opening price
#is considered a "buy" day for intraday traders; every other day is a "don't buy" day.
#'greater' stores the change from Open to Close relative to Open (as a fraction)
#and 'percent' stores the same change expressed as a percentage.
df['greater']=(df['Close']-df['Open'])/df['Open']
df['percent']=df['greater']*100

In [None]:
#defining the target variable
#Setting the values in target variable as 1 where the percent is greater than or equal to 2
#rest of the values (including rows where 'percent' is missing, because NaN >= 2 is False) are set as 0
df['target'] = np.where(df['percent']>=2, 1, 0)
   

In [None]:
#plotting the countplot for the target column
#the Series is passed explicitly as 'x': recent seaborn releases no longer
#interpret the first positional argument as the x variable
sns.countplot(x=df['target'],palette='rainbow')
plt.title("Countplot of the target column")
plt.show()

We can see that the days which are considered good for intraday traders are far fewer than the days considered bad for intraday traders.

# Exploratory Data Analysis

In [None]:
# plot the histogram of numeric variables
# the hist() function considers the numeric variables only, by default
df.hist(xrot = 20, )

# adjust the subplots
plt.tight_layout()

# display the plot
plt.show()  

We can see from the above graphs that the Open, High, Low, Close, Adj Close and Volume columns are all right-skewed. The greater and percent columns are approximately normally distributed, and the target column is categorical in nature.

In [None]:
# Pairplot of numeric variables

# select the columns for the pairplot
columns= ["Open", "Close", "High", "Low", "Volume"]

# draw the pairplot such that the diagonal should be density plot and the other graphs should be scatter plot
# 'height' sets the size of each facet; it replaces the old 'size' argument,
# which seaborn renamed and no longer accepts in current releases
sns.pairplot(df[columns], height=2, kind= "scatter", diag_kind="kde")

# display the plot
plt.show()

We can see from the plots that open,close,high and low are directly proportional to each other.

In [None]:
# draw the boxplot for target and the opening price
sns.boxplot(y="Open", x="target", data= df)

# set the title of the plot and the fontsize
plt.title("Open price versus target variable", fontsize=15)

# set the xlabel and the fontsize
plt.xlabel("Target", fontsize=15)

# set the ylabel and the fontsize
plt.ylabel("Open", fontsize=15)

# display the plot
plt.show()

We can see that the day which is good for intraday traders for trading the opening price is between the range 11000 to 28000 approximately.

In [None]:
# draw the boxplot for target and the closing price
sns.boxplot(y="Close", x="target", data= df)

# set the title of the plot and the fontsize
plt.title("Close price versus target variable", fontsize=15)

# set the xlabel and the fontsize
plt.xlabel("Target", fontsize=15)

# set the ylabel and the fontsize
plt.ylabel("Close", fontsize=15)

# display the plot
plt.show()

From the plot we can see that the median closing price lies above 20000 on the days which are not good for intraday trading and below 20000 on the days which are good for intraday trading.

In [None]:
sns.boxplot(y="High", x="target", data= df)

# set the title of the plot and the fontsize
plt.title("High price versus target variable", fontsize=15)

# set the xlabel and the fontsize
plt.xlabel("Target", fontsize=15)

# set the ylabel and the fontsize
plt.ylabel("High", fontsize=15)

# display the plot
plt.show()

In [None]:
sns.boxplot(y="Low", x="target", data= df)

# set the title of the plot and the fontsize
plt.title("Low price versus target variable", fontsize=15)

# set the xlabel and the fontsize
plt.xlabel("Target", fontsize=15)

# set the ylabel and the fontsize
plt.ylabel("Low", fontsize=15)

# display the plot
plt.show()

Since open,close,high and low prices are directly proportional,therefore we see that the boxplot is same for all the variables against the target variable.

In [None]:
# draw the boxplot of 'greater' (change from Open to Close relative to Open) for each target class
sns.boxplot(y="greater", x="target", data= df)

# set the title of the plot and the fontsize
# (the title now names the plotted variable; it previously said "Open price")
plt.title("Relative price change (greater) versus target variable", fontsize=15)

# set the xlabel and the fontsize
plt.xlabel("Target", fontsize=15)

# set the ylabel and the fontsize
plt.ylabel("greater", fontsize=15)

# display the plot
plt.show()

From the plot we can see that the day which is not good for trading the greater value(closing price versus opening price calculation) has got many outliers below the minimum value, whereas the day which is good for trading the outliers lies above maximum value in the boxplot.

# Finding Outliers

In [None]:
#Boxplot of the opening price to look for outliers
sns.boxplot(df.Open)
plt.show()

In [None]:
#Boxplot of the closing price to look for outliers
sns.boxplot(df.Close)
plt.show()

In [None]:
sns.boxplot(df.Low)
plt.show()

In [None]:
sns.boxplot(df.High)
plt.show()

In [None]:
sns.boxplot(df['Adj Close'])
plt.show()

In [None]:
sns.boxplot(df['Volume'])
plt.show()

In [None]:
sns.boxplot(df.greater)
plt.show()

In [None]:
#Putting all features in one variable df_features
df_features=df[['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume','greater', 'percent','target']]

In [None]:
from scipy.stats.mstats import winsorize

In [None]:
# Winsorize every price/volume column: the lowest 1% and the highest 10% of the
# values are replaced by the nearest value that is kept.
# Only the observed (non-null) values are handed to winsorize(). The missing rows
# have not been dropped yet at this point, and with its default nan_policy
# winsorize() counts NaNs as data and may overwrite them, which would silently fill
# the missing rows and shift the upper cut-off.
for column in ['Open', 'Close', 'Low', 'High', 'Adj Close', 'Volume']:
    observed = df[column].notna()
    df.loc[observed, column] = np.asarray(
        winsorize(df.loc[observed, column].to_numpy(), limits=(0.01, 0.1))
    )

# Viewing the columns after adjusting the outliers 

In [None]:
#Checking for outliers after handling outliers 
sns.boxplot(df['Open'])
plt.show()

In [None]:
#Checking for outliers after handling outliers 
sns.boxplot(df['Close'])
plt.show()

In [None]:
#Checking for outliers after handling outliers 
sns.boxplot(df['Low'])
plt.show()

In [None]:
#Checking for outliers after handling outliers 
sns.boxplot(df['High'])
plt.show()

In [None]:
#Checking for outliers after handling outliers 
sns.boxplot(df['Volume'])
plt.show()

In [None]:
#Checking for outliers after handling outliers 
sns.boxplot(df['greater'])
plt.show()

Hence we can see from the plots that the outliers have been handled to a great extent.

# Finding the missing values

In [None]:
# boolean mask of the missing cells, computed once and reused below
null_mask = df.isnull()

# number of missing values per variable, largest first
Total = null_mask.sum().sort_values(ascending = False)

# the same counts expressed as a percentage of the number of rows, largest first
Percent = ((Total*100)/null_mask.count()).sort_values(ascending = False)

# put both measures side by side in one summary table
missing_data = pd.concat([Total, Percent], axis = 1, keys = ['Total', 'Percentage of Missing Values'])
missing_data

In [None]:
# plot heatmap to check null values
# 'cbar = False' does not show the color axis 
sns.heatmap(df.isnull(), cbar=False)

# display the plot
plt.show()

We can see from the graph above that the missing data in each column comes from the same rows. Since the number of such rows is small, we delete those rows.

In [None]:
#deleting the rows with missing data
df=df.dropna(axis=0)


In [None]:
# plot heatmap to check null values
# 'cbar = False' does not show the color axis 
sns.heatmap(df.isnull(), cbar=False)

# display the plot
plt.show()

# Preparing the data for model building

In [None]:
#Creating a dataframe X which contains all the features
#NOTE(review): 'percent' and 'greater' are computed from Open/Close and 'target' is defined
#directly from 'percent' (target = percent >= 2), so these two features reveal the target
#to the classifiers - confirm that this is intended
X=df[['Open', 'High', 'Low', 'Close', 'Adj Close','percent','greater','Volume']]

In [None]:
scaler = MinMaxScaler()
scaler.fit(X)

In [None]:
# scale every feature to the [0, 1] range
# the column names and the row index are carried over to the scaled frame so that
# later cells (e.g. the decision-tree plot, which reads X_train.columns) see the real
# feature names and X stays aligned with the target by index
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

In [None]:
#creating another dataframe to store the target variable
y=pd.DataFrame(df['target'])

In [None]:
# hold out 20% of the rows as a test set; 'random_state' makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=10)

# report the dimensions of each of the four subsets
for subset_name, subset in (("X_train ", X_train), ("X_test ", X_test),
                            ("y_train ", y_train), ("y_test ", y_test)):
    print(subset_name, subset.shape)

# Creating generalised functions

In [None]:
# create a generalized function to calculate the metrics values for test set
def get_test_report(model):
    """Return the classification report of ``model`` on the test set.

    Parameters
    ----------
    model : fitted estimator
        Any fitted classifier that exposes ``predict``.

    Returns
    -------
    The value returned by ``classification_report`` for the test labels and
    the predictions of ``model``.

    Notes
    -----
    Reads the notebook-level ``X_test`` and ``y_test``.
    """
    # predict with the model that was passed in instead of reading the global
    # 'y_pred', which may hold the predictions of a different model when the
    # notebook cells are run out of order
    test_predictions = model.predict(X_test)

    # return the performance measures on test set
    return(classification_report(y_test, test_predictions))

In [None]:
# create a generalized function to calculate the kappa score for test set
def kappa_score(model):
    """Return Cohen's kappa of ``model`` on the test set.

    Parameters
    ----------
    model : fitted estimator
        Any fitted classifier that exposes ``predict``.

    Returns
    -------
    The value returned by ``cohen_kappa_score`` for the test labels and the
    predictions of ``model``.

    Notes
    -----
    Reads the notebook-level ``X_test`` and ``y_test``.
    """
    # predict with the model that was passed in instead of reading the global
    # 'y_pred', which may belong to a different model
    test_predictions = model.predict(X_test)

    # return the kappa score on test set
    return(cohen_kappa_score(y_test, test_predictions))

In [None]:
# define a function to plot a confusion matrix for the model
def plot_confusion_matrix(model):
    """Plot the 2x2 confusion matrix of ``model`` on the test set.

    Parameters
    ----------
    model : fitted estimator
        Any fitted binary classifier that exposes ``predict`` and predicts
        the labels 0 and 1.

    Notes
    -----
    Reads the notebook-level ``X_test`` and ``y_test`` and draws on the
    current matplotlib figure.
    """
    # predict with the model that was passed in instead of reading the global 'y_pred'
    test_predictions = model.predict(X_test)

    # create a confusion matrix
    # 'labels' pins the matrix to 2x2 even if one class is absent from both the
    # test labels and the predictions, so the fixed row/column names always fit
    cm = confusion_matrix(y_test, test_predictions, labels = [0, 1])
    conf_matrix = pd.DataFrame(data = cm,columns = ['Predicted:0','Predicted:1'], index = ['Actual:0','Actual:1'])

    # plot a heatmap to visualize the confusion matrix
    sns.heatmap(conf_matrix, annot = True, fmt = 'd', cmap = ListedColormap(['lightskyblue']), cbar = False, 
                linewidths = 0.1, annot_kws = {'size':25})

    # set the font size of x-axis ticks using 'fontsize'
    plt.xticks(fontsize = 20)

    # set the font size of y-axis ticks using 'fontsize'
    plt.yticks(fontsize = 20)

    # display the plot
    plt.show()

In [None]:
# define a function to plot the ROC curve and print the ROC-AUC score
def plot_roc(model):
    """Plot the ROC curve of ``model`` on the test set and annotate its AUC.

    Parameters
    ----------
    model : fitted estimator
        Any fitted binary classifier that exposes ``predict``.

    Notes
    -----
    Reads the notebook-level ``X_test`` and ``y_test`` and draws on the
    current matplotlib figure.
    """
    # predict with the model that was passed in instead of reading the global 'y_pred'
    # NOTE(review): hard class labels give a ROC curve with a single operating point;
    # continuous scores (decision_function / predict_proba) would give the full curve.
    # Labels are kept here so the AUC matches the one stored by update_score_card().
    test_predictions = model.predict(X_test)

    # the roc_curve() returns the values for false positive rate, true positive rate and threshold
    fpr, tpr, thresholds = roc_curve(y_test, test_predictions)

    # plot the ROC curve
    plt.plot(fpr, tpr)

    # set limits for x and y axes
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])

    # plot the straight line showing worst prediction for the model
    plt.plot([0, 1], [0, 1],'r--')

    # add plot and axes labels
    # set text size using 'fontsize'
    plt.title('ROC Curve', fontsize = 15)
    plt.xlabel('False positive rate (1-Specificity)', fontsize = 15)
    plt.ylabel('True positive rate (Sensitivity)', fontsize = 15)

    # add the AUC score to the plot
    # the label is built as one string; passing a tuple made matplotlib print
    # the tuple's repr, parentheses and quotes included
    auc_value = round(roc_auc_score(y_test, test_predictions), 4)
    plt.text(x = 0.02, y = 0.9, s = f'AUC Score: {auc_value}')

    # plot the grid
    plt.grid(True)

In [None]:
# create an empty dataframe to store the scores for various classification algorithms
score_card = pd.DataFrame(columns=['Model', 'AUC Score', 'Precision Score', 'Recall Score', 'Accuracy Score',
                                   'Kappa Score', 'f1-score'])

def update_score_card(model_name):
    """Add one row of test-set metrics to the global ``score_card`` table.

    The metrics are computed from the notebook-level ``y_test`` and ``y_pred``,
    so ``y_pred`` must hold the predictions of the model named by
    ``model_name`` when this function is called.

    Parameters
    ----------
    model_name : str
        Label stored in the 'Model' column of the new row.

    Returns
    -------
    pandas.DataFrame
        The updated ``score_card``.
    """
    # assign 'score_card' as global variable
    global score_card

    # collect the metrics of the current predictions in a one-row dataframe
    new_row = pd.DataFrame([{'Model': model_name,
                             'AUC Score' : roc_auc_score(y_test, y_pred),
                             'Precision Score': metrics.precision_score(y_test, y_pred),
                             'Recall Score': metrics.recall_score(y_test, y_pred),
                             'Accuracy Score': metrics.accuracy_score(y_test, y_pred),
                             'Kappa Score': cohen_kappa_score(y_test, y_pred),
                             'f1-score': metrics.f1_score(y_test, y_pred)}])

    # DataFrame.append() was removed in pandas 2.0; pd.concat() is its replacement
    # 'ignore_index = True' renumbers the rows 0..n-1
    score_card = pd.concat([score_card, new_row], ignore_index = True)
    return(score_card)

# Model 1(Logistic regression)

In [None]:
# instantiate the 'SGDClassifier' to build model using SGD
# to perform logistic regression, consider the log-loss function
# 'random_state' fixes the random seed of the estimator so that repeated runs give the same model
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' and reject 'log' -
# confirm against the installed version
SGD = SGDClassifier(loss = 'log', random_state = 10)

# fit the model on scaled training data
logreg_with_SGD = SGD.fit(X_train, y_train)

In [None]:
# use predict() to predict the class labels of target variable
y_pred = logreg_with_SGD.predict(X_test)

In [None]:
# call the function to plot the confusion matrix
plot_confusion_matrix(logreg_with_SGD)

In [None]:
# compute the performance measures on test data
test_report = get_test_report(logreg_with_SGD)

# print the performace measures
print(test_report)

In [None]:
# compute kappa score on test set
kappa_value = kappa_score(logreg_with_SGD)

# print the kappa value
print(kappa_value)

In [None]:
# call the function 'plot_roc' to plot the ROC curve
plot_roc(logreg_with_SGD)

In [None]:
# use the function 'update_score_card' to store the performance measures
update_score_card(model_name = 'Logistic Regression (SGD)')

# Model 2(Support vector machine)

In [None]:
# build the model
svclassifier = SVC(kernel = 'linear')

# fit the model
svc_model=svclassifier.fit(X_train, y_train)


In [None]:
# predict the values
y_pred = svclassifier.predict(X_test)

In [None]:
# call the function to plot the confusion matrix
plot_confusion_matrix(svc_model)

In [None]:
# compute the performance measures on test data
test_report = get_test_report(svc_model)

# print the performace measures
print(test_report)

In [None]:
# compute kappa score on test set
kappa_value = kappa_score(svc_model)

# print the kappa value
print(kappa_value)

In [None]:
plot_roc(svc_model)

In [None]:
update_score_card(model_name='SVM')

# Model 3(Support vector machine using kernel(rbf))

In [None]:
# build the model
svclassifier = SVC(kernel='rbf')
# fit the model
svm_rbf=svclassifier.fit(X_train, y_train)

In [None]:
# predict the values
y_pred= svclassifier.predict(X_test)


In [None]:
plot_confusion_matrix(svm_rbf)

In [None]:
# compute kappa score on test set
kappa_value = kappa_score(svm_rbf)

# print the kappa value
print(kappa_value)

In [None]:
update_score_card(model_name='SVM with rbf')

# Model 4 (Support vector machine using kernel(sigmoid))

In [None]:
# build the model
svclassifier = SVC(kernel='sigmoid')
# fit the model
svm_sigmoid=svclassifier.fit(X_train, y_train)

In [None]:
# predict the values
y_pred  = svclassifier.predict(X_test)

In [None]:
# call the function to plot the confusion matrix
plot_confusion_matrix(svm_sigmoid)

In [None]:
# compute the performance measures on test data
test_report = get_test_report(svm_sigmoid)

# print the performace measures
print(test_report)

In [None]:
# compute kappa score on test set
kappa_value = kappa_score(svm_sigmoid)

# print the kappa value
print(kappa_value)

In [None]:
plot_roc(svm_sigmoid)

In [None]:
update_score_card(model_name='SVM Sigmoid')

# Model 5(Support vector machine using kernel(polynomial))

In [None]:
# build the model
svclassifier = SVC(kernel='poly')
# fit the model
svm_poly=svclassifier.fit(X_train, y_train)


In [None]:
# predict the values
y_pred  = svclassifier.predict(X_test)

In [None]:
# call the function to plot the confusion matrix
plot_confusion_matrix(svm_poly)

In [None]:
# compute the performance measures on test data
test_report = get_test_report(svm_poly)

# print the performace measures
print(test_report)

In [None]:
# compute kappa score on test set
kappa_value = kappa_score(svm_poly)

# print the kappa value
print(kappa_value)

In [None]:
plot_roc(svm_poly)

In [None]:
update_score_card(model_name='SVM using polynomial kernel')

# Model 6(Support vector machine using kernel with degree 2)

In [None]:
# build the model
svclassifier_Poly = SVC(kernel='poly', degree = 2, gamma = 'auto')
# fit the model
svm=svclassifier_Poly.fit(X_train, y_train)

In [None]:
# predict the values
y_pred  = svclassifier_Poly.predict(X_test)

In [None]:
plot_confusion_matrix(svm)

In [None]:
test_report=get_test_report(svm)
print(test_report)

In [None]:
# compute kappa score on test set
kappa_value = kappa_score(svm)

# print the kappa value
print(kappa_value)

In [None]:
plot_roc(svm)

In [None]:
update_score_card(model_name='SVM with kernel(ploynomial) with degree 2')

# Model 7(Support vector machine using grid search)

In [None]:
# degree: Degree of the polynomial
# C: value of C parameter or regularisation parameter
# gamma: kernel coefficient setting; the two options 'auto' and 'scale' are searched
param_grid = { 
    'degree': [2,4,6,8,10], 
    'gamma' : ['auto','scale' ],
    'C': [0.5, 1,2,2.5]
}

In [None]:
# 5-fold cross-validated grid search over 'param_grid', scored by accuracy
# NOTE: the estimator searched here is the polynomial-kernel SVC 'svclassifier_Poly';
# the variable name 'CV_rfc' does not reflect that
CV_rfc = GridSearchCV(estimator= svclassifier_Poly, param_grid=param_grid, scoring='accuracy', cv= 5)
# fit the model
CV_rfc.fit(X_train, y_train)


In [None]:
# find the best parameters
CV_rfc.best_params_

In [None]:
# build the model with the best parameters found by the grid search
# 'best_params_' holds the winning 'degree', 'gamma' and 'C'; unpacking it keeps this
# cell in sync with the search result instead of relying on hand-copied values
svclassifier_Poly_Grid = SVC(kernel='poly', **CV_rfc.best_params_)
# fit the model
svm1=svclassifier_Poly_Grid.fit(X_train, y_train)

In [None]:
# predict the values
y_pred= svclassifier_Poly_Grid.predict(X_test)

In [None]:
plot_confusion_matrix(svm1)

In [None]:
test_report=get_test_report(svm1)
print(test_report)

In [None]:
# compute kappa score on test set
kappa_value = kappa_score(svm1)

# print the kappa value
print(kappa_value)

In [None]:
plot_roc(svm1)

In [None]:
update_score_card(model_name='SVM with grid search CV ')

# Model 8(Decision tree model)

In [None]:
# instantiate the 'DecisionTreeClassifier' object using 'entropy' criterion
decision_tree = DecisionTreeClassifier(criterion = 'entropy', random_state = 10)

# fit the model using fit() on train data
decision_tree_model = decision_tree.fit(X_train, y_train)

In [None]:
labels=X_train.columns

#plot the decisin tree
fig=plt.figure(figsize=(20,20))
z=tree.plot_tree(decision_tree_model,
                feature_names=labels,
                class_names=['0','1'],
                filled=True)

In [None]:
y_pred=decision_tree_model.predict(X_test)

In [None]:
plot_confusion_matrix(decision_tree_model)

In [None]:
kappa_score(decision_tree_model)

In [None]:
test_report=get_test_report(decision_tree_model)
print(test_report)

In [None]:
plot_roc(decision_tree)

In [None]:
update_score_card(model_name='Decision Tree')

# Model 9(K-nearest neighbour)

In [None]:
classifier=KNeighborsClassifier(n_neighbors=5)
KNN=classifier.fit(X_train,y_train)

In [None]:
y_pred=classifier.predict(X_test)

In [None]:
plot_confusion_matrix(KNN)

In [None]:
kappa_score(KNN)

In [None]:
test_report=get_test_report(KNN)
print(test_report)

In [None]:
plot_roc(KNN)

In [None]:
update_score_card(model_name='KNN')

# Conclusion:

In total, 9 models have been built to predict whether it is a good day for an intraday trader or not. Out of all the models, the decision tree model is the best with 100% accuracy. Even its F1 score, which is the harmonic mean of precision and recall, is 1.

We can even see that the kappa score of decision tree model is 1 which means that the predicted data and the actual data totally agree with each other.

The AUC score i.e. the area under the curve of decision tree model is 1.We even use this metrics to choose between different models.As the metric value is 1 and it is the best among other models we choose this model to do prediction on the test data.

If we see the confusion matrix then the type 1 error and type 2 error both are 0,it confirms that the model has got no errors while predicting the day on which the intraday traders should trade or not.

However, when predicting on the test dataset it was observed that the support vector machine tuned with GridSearchCV, which is model number 6 in our result table, gave better output. It gave an accuracy of 81.6%, whereas the decision tree model gave an accuracy of 81.2% on the test data.

We used the support vector machine with gridsearch Cv model to do prediction  because it is the second best model having accuracy of 99.65% and F1 score of 98.08%.

Even its AUC score is 99.23%, which is better than the rest of the models. In its confusion matrix the type 1 and type 2 errors are 2 and 1 respectively, which is quite good. Its kappa score is 0.9790, which means that the chance-corrected agreement between the predicted data and the actual data is 0.979 on a scale where 1 is perfect agreement.

Hence, this model was used to do prediction on the test data.

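A possible reason why the support vector machine worked better than the decision tree model on the test data, even though the decision tree model had better metrics when the models were trained on the training data, is overfitting. Note also that the features 'greater' and 'percent' determine the target directly, which explains the near-perfect metrics on the training data set; on the test data these two features are derived from a predicted closing price.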

# Working with the test data set

In [None]:
#loading the test data set
df_test=pd.read_csv("../input/share-market-prediction/Test_.csv")

In [None]:
#displaying the first 5 records of the test dataset
df_test.head()

On loading the test data we found that the Close price column is missing. Hence we need to predict that column first in order to predict whether an intraday trader should invest or not on a particular day.

 # Creating model to find closing price

In [None]:
df.head()

In [None]:
df1=df.drop(['greater','percent','target'],axis=1)

In [None]:
# keep only the numeric columns (select_dtypes with include=np.number), then discard
# the row identifier 'S.No' and the regression target 'Close' so only predictors remain
df_numeric_features = df1.select_dtypes(include=np.number).drop(['S.No','Close'],axis=1)

In [None]:
df_numeric_features = sm.add_constant(df_numeric_features)
# separate the independent and dependent variables
X = df_numeric_features

# extract the target variable from the data set
y = df1['Close']

# split data into train subset and test subset for predictor and target variables
# 'test_size' returns the proportion of data to be included in the test set
# set 'random_state' to generate the same dataset each time you run the code 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)

# check the dimensions of the train & test subset for 
# print dimension of predictors train set
print("The shape of X_train is:",X_train.shape)

# print dimension of predictors test set
print("The shape of X_test is:",X_test.shape)

# print dimension of target train set
print("The shape of y_train is:",y_train.shape)

# print dimension of target test set
print("The shape of y_test is:",y_test.shape)

In [None]:
# build a full model using OLS()
linreg_full_model = sm.OLS(y_train, X_train).fit()



In [None]:
# predict the closing price ('Close') for the held-out rows using predict()
linreg_full_model_predictions = linreg_full_model.predict(X_test)

In [None]:
# this model was fitted on the untransformed 'Close' price, so the predictions are used as they are
predicted=linreg_full_model_predictions 

# the observed 'Close' values of the held-out rows
actual= y_test

In [None]:
from statsmodels.tools.eval_measures import rmse
# NOTE: despite the '_withlog_' in the variable names, the three values below belong to
# 'linreg_full_model', i.e. the model fitted on the untransformed 'Close' price
# calculate rmse using rmse()
linreg_full_model_withlog_rmse = rmse(actual, predicted)

# calculate R-squared using rsquared (computed by statsmodels on the data the model was fitted on)
linreg_full_model_withlog_rsquared = linreg_full_model.rsquared

# calculate Adjusted R-Squared using rsquared_adj
linreg_full_model_withlog_rsquared_adj = linreg_full_model.rsquared_adj 

In [None]:
# create the result table for all accuracy scores
# accuracy measures considered for model comparison are RMSE, R-squared value and Adjusted R-squared value
cols = ['Model', 'RMSE', 'R-Squared', 'Adj. R-Squared']

# create an empty dataframe of the columns
# columns: specifies the columns to be selected
result_tabulation = pd.DataFrame(columns = cols)

# compile the required information
# the metrics at this point come from the model fitted on the untransformed 'Close'
# price, so the row is labelled accordingly (it used to say "with log of target variable");
# the variable name is kept unchanged
linreg_full_model_withlog_metrics = pd.Series({'Model': "Linreg full model",
                     'RMSE':linreg_full_model_withlog_rmse,
                     'R-Squared': linreg_full_model_withlog_rsquared,
                     'Adj. R-Squared': linreg_full_model_withlog_rsquared_adj     
                   })

# add the row to the result table
# DataFrame.append() was removed in pandas 2.0; pd.concat() is its replacement
# the Series is turned into a one-row dataframe first; ignore_index=True renumbers the rows
result_tabulation = pd.concat([result_tabulation, linreg_full_model_withlog_metrics.to_frame().T],
                              ignore_index = True)

# print the result table
result_tabulation

In [None]:
df1['log']=np.log(df1['Close'])

In [None]:
df_numeric_features = sm.add_constant(df_numeric_features)
# separate the independent and dependent variables
X = df_numeric_features

# extract the target variable from the data set
y = df1['log']

# split data into train subset and test subset for predictor and target variables
# 'test_size' returns the proportion of data to be included in the test set
# set 'random_state' to generate the same dataset each time you run the code 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)

# check the dimensions of the train & test subset for 
# print dimension of predictors train set
print("The shape of X_train is:",X_train.shape)

# print dimension of predictors test set
print("The shape of X_test is:",X_test.shape)

# print dimension of target train set
print("The shape of y_train is:",y_train.shape)

# print dimension of target test set
print("The shape of y_test is:",y_test.shape)

In [None]:
# build a full model using OLS()
linreg_full_model_withlog = sm.OLS(y_train, X_train).fit()

In [None]:
# predict the log of the closing price for the held-out rows using predict()
linreg_full_model_withlog_predictions = linreg_full_model_withlog.predict(X_test)

In [None]:
# take the exponential of the predictions using np.exp() to bring them back
# from the log scale to the price scale
predicted=np.exp(linreg_full_model_withlog_predictions) 

# for this model y_test holds log(Close), so it is back-transformed as well;
# otherwise the RMSE would compare prices with log-prices
actual= np.exp(y_test)

In [None]:
# calculate rmse using rmse()
linreg_full_model_withlog_rmse = rmse(actual, predicted)

# calculate R-squared using rsquared
linreg_full_model_withlog_rsquared = linreg_full_model_withlog.rsquared

# calculate Adjusted R-Squared using rsquared_adj
linreg_full_model_withlog_rsquared_adj = linreg_full_model_withlog.rsquared_adj 

In [None]:
# add the log-target model to the result table created for the first regression model
# accuracy measures considered for model comparison are RMSE, R-squared value and Adjusted R-squared value
cols = ['Model', 'RMSE', 'R-Squared', 'Adj. R-Squared']

# 'result_tabulation' is deliberately NOT re-created here: re-creating it discarded
# the row of the first model, which made a comparison of the two models impossible

# compile the required information
linreg_full_model_withlog_metrics = pd.Series({'Model': "Linreg full model with log of target variable ",
                     'RMSE':linreg_full_model_withlog_rmse,
                     'R-Squared': linreg_full_model_withlog_rsquared,
                     'Adj. R-Squared': linreg_full_model_withlog_rsquared_adj     
                   })

# add the row to the result table
# DataFrame.append() was removed in pandas 2.0; pd.concat() is its replacement
# the Series is turned into a one-row dataframe first; ignore_index=True renumbers the rows
result_tabulation = pd.concat([result_tabulation, linreg_full_model_withlog_metrics.to_frame().T],
                              ignore_index = True)

# print the result table
result_tabulation

# Predicting the closing price of the test data

In [None]:
#preparing the dataset
df_test1=df_test.drop('S.No',axis=1)

In [None]:
#adding the intercept column, because the statsmodels model was fitted with one
#has_constant='add' forces the column to be added; with the default ('skip') no intercept
#is added when some test column happens to be constant, and the prediction would then
#no longer line up with the fitted coefficients
df_test1=sm.add_constant(df_test1, has_constant='add')

In [None]:
predict=linreg_full_model.predict(df_test1)

In [None]:
#converting the predicted values into list
v=np.array(predict).tolist()

In [None]:
#inserting the predicted closing prices into the test dataframe as column "Close" (at column position 2)
df_test.insert(2,column="Close",value=v)
df_test.head()

In [None]:
#Finding the rows in which volume=0
#making a dataframe out of it
df3=df_test[df_test['Volume']==0]

In [None]:
df3.shape

In [None]:
#On the days when volume is 0 the opening and closing price should be same as no one traded on that day
#using "np.where" function to set the closing value price of such days when volume=0
df_test['Close'] = np.where(df_test['Volume'] == 0,df_test['Open'], df_test['Close'])

In [None]:
#creating the greater and percent column in test data as its required for model prediction
df_test['greater']=(df_test['Close']-df_test['Open'])/df_test['Open']
df_test['percent']=df_test['greater']*100

In [None]:
df_test_features=df_test.drop(['S.No'],axis=1)

In [None]:
scaler = MinMaxScaler()
scaler.fit(df_test_features)

In [None]:
# Scale the test features exactly like the training features:
#  * same column order as the training matrix X (the test frame orders its columns
#    differently, and the classifier matches features by position)
#  * same min/max statistics as the training data (a scaler re-fitted on the test
#    data would map the same raw value to a different scaled value)
feature_order = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'percent', 'greater', 'Volume']
train_scaler = MinMaxScaler().fit(df[feature_order])
df_test_features = pd.DataFrame(train_scaler.transform(df_test[feature_order]),
                                columns=feature_order, index=df_test.index)

In [None]:
#predicting using the best model
df_test_predict=svm1.predict(df_test_features)

In [None]:
#converting the predicted values into list
v1=np.array(df_test_predict).tolist()

In [None]:
#inserting the predicted class labels into the test dataframe as column "Flag" (at column position 2)
df_test.insert(2,column="Flag",value=v1)


In [None]:
#Relabelling the "Flag" column: 1 is replaced with 'Buy' and 0 with "Don't Buy"
df_test['Flag']=df_test['Flag'].replace(1,'Buy')
df_test['Flag']=df_test['Flag'].replace(0,"Don't Buy")


In [None]:
upload=df_test.drop([ 'Open', 'Close', 'High', 'Low', 'Adj Close', 'Volume',
       'greater', 'percent'],axis=1)
upload.head()

In [None]:
upload.to_csv("predicted_flag9.csv",index=False)