In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/stockdata/new.csv
/kaggle/input/stockdata/top.csv
/kaggle/input/stockdata/all.csv
/kaggle/input/stockdata/Data/modIndex.csv
/kaggle/input/stockdata/Data/Index.csv
/kaggle/input/stockdata/Data/RiskFreeRate.csv
/kaggle/input/stockdata/Data/Equity.csv
/kaggle/input/stockdata/Data/Stock/fc500770.csv
/kaggle/input/stockdata/Data/Stock/532540.csv
/kaggle/input/stockdata/Data/Stock/fc500325.csv
/kaggle/input/stockdata/Data/Stock/fc532540.csv
/kaggle/input/stockdata/Data/Stock/533155.csv
/kaggle/input/stockdata/Data/Stock/500180.csv
/kaggle/input/stockdata/Data/Stock/fc500085.csv
/kaggle/input/stockdata/Data/Stock/fc533155.csv
/kaggle/input/stockdata/Data/Stock/fc533260.csv
/kaggle/input/stockdata/Data/Stock/gr500124.csv
/kaggle/input/stockdata/Data/Stock/gr500182.csv
/kaggle/input/stockdata/Data/Stock/gr500209.csv
/kaggle/input/stockdata/Data/Stock/fc532210.csv
/kaggle/input/stockdata/Data/Stock/532538.csv
/kaggle/input/stockdata/Data/Stock/fc500680.csv
/kaggle/input/stockdata/D

In [2]:
import pandas as pd
import numpy as np 
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from math import sqrt
from sklearn.model_selection import GridSearchCV
import time 
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import f_classif
from sklearn.metrics import confusion_matrix
import math
from sklearn.metrics import precision_score, make_scorer
import traceback
import prettytable
import io

In [3]:
import warnings; warnings.simplefilter('ignore')

In [4]:
path = "/kaggle/input/stockdata"
path

'/kaggle/input/stockdata'

# PreProcessing Data    

In [5]:
def pre_process_data(data,null_threshold):
    """
    Drops Date and Unix Date columns from the data.
    Drops the columns which has null values more than specified null_threshold.
    Replaces infinite values with NAN.
    Drops the rows which has null values.

    Parameters
    ----------
    data : dataframe

    null_threshold : numeric
        numeric value describing the amount of null values that can be present.

    Returns
    -------
    data : dataframe
        an updated dataframe after performing all the opertaions.
    """
    
    data.drop(columns=['Unix Date','Date'],axis=1,inplace=True)
    total = data.shape[0]
    for col in data.columns:
        if null_threshold * total / 100 < data[col].isnull().sum():
            data.drop(columns=[col],axis=1,inplace=True)
    data.replace([np.inf, -np.inf], np.nan, inplace=True)
    data.dropna(axis=0,inplace=True)
    return data

# Removing columns based on dependent column


In [6]:
def dependent_column(data,column):
    """
    Removes all the Next Day columns.
    Removes all the non Growth Rate Columns (GR)
    add the predictor column to list of columns.

    Parameters
    ----------
    data : dataframe

    column : string
        name of the predictor column 

    Returns
    -------
    data : dataframe
        an updated dataframe after performing all the opertaions.
    column : string
        name of the predictor column
    """
    cols = [col for col in data.columns if "next" not in col.lower() and col.lower().endswith("gr")]
    cols.append(column)
    data = data[cols]
    return (data,column)

# Parameters Tuning

In [7]:
def best_params_logistic(X,Y):
    custom_scorer = make_scorer(metrics.f1_score, greater_is_better=True,pos_label = 1)
    penalty = ['l1', 'l2','none']
    C = [0.001,0.01,0.1,1,10,100]
    hyperparameters = dict(C = C, penalty = penalty)
    logistic = LogisticRegression()
    try:
        gridsearch = GridSearchCV(logistic, hyperparameters, cv=5,verbose=0,scoring = custom_scorer)
        best_model = gridsearch.fit(X, Y)
    except:
        hyperparameters["penalty"] = ['none','l2']
        gridsearch = GridSearchCV(logistic, hyperparameters, cv=5, verbose=0,scoring = custom_scorer)
        best_model = gridsearch.fit(X, Y)
    best_p = best_model.best_estimator_.get_params()["penalty"]
    best_c = best_model.best_estimator_.get_params()["C"]
    return best_p,best_c

# Confusion Matrix

### true negatives  {0,0},
### false negatives {1,0},
### true positives  {1,1},
### false positives {0,1}.

In [8]:
def create_confusion_matrix(y_pred,y_true):
    
    cm = confusion_matrix(y_true,y_pred)
    accuracy = metrics.accuracy_score(y_true,y_pred)
    precision = metrics.precision_score(y_true,y_pred)
    recall = metrics.recall_score(y_true,y_pred)
    f1_score = metrics.f1_score(y_true,y_pred)
    return {"accuracy":accuracy,"precision":precision,"recall":recall,"f1_score":f1_score,"confusion matrix":cm}


# Logistic Regression

In [9]:
def logistic_regression(df,column, threshold,C, penalty):
    df["Target"] = df["Next Day Close Price GR"].apply(lambda x : 1 if x >= threshold else 0)
    X = df[df.columns[:-2]]
    Y = df["Target"]
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
    logmodel = LogisticRegression(penalty = penalty, C = C,random_state = 0)
    logmodel.fit(X_train, y_train)
    y_pred = logmodel.predict(X_test)

    confidence = logmodel.score(X_test, y_test)
    rmse = sqrt(metrics.mean_squared_error(y_test, y_pred))
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    confusion_mat =  create_confusion_matrix(y_pred,y_test)
    myres = {"root_mean_squared_error":rmse,"mean_absolute_error":mae,"mean_squared_error":mse,"confidence":confidence}
    myres.update(confusion_mat)
    return myres

In [10]:
def logistic_model(df,column = "Next Day Close Price GR"):
    
    threshold = [0.01,0.02,0.03,0.04,0.05]
    solution = list()
    for t in threshold:
        df["Target"] = df["Next Day Close Price GR"].apply(lambda x : 1 if x >= t else 0)
        X = df.drop(columns=["Target",column])
        Y = df["Target"]

        P_values = [0.01,0.05,0.1,0.15,0.2,0.25]
        F_values = [0.1,0.5,1,2,5,10,100]

        f_col_values, p_col_values = f_classif(X,Y)
        f_col_values = dict(zip(X.columns,f_col_values))
        p_col_values = dict(zip(X.columns,p_col_values))
        
        for p in P_values:
            mycols = list()
            for col,val in p_col_values.items():
                if val <= p:
                    mycols.append(col)
            if mycols == []:
                continue
            X = df[mycols]
            X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
            best_p,best_c = best_params_logistic(X_train,y_train)
            result = logistic_regression(df,column = "Next Day Close Price GR",threshold = t,C = best_c, penalty = best_p)
            result.update({"penalty":best_p,"C":best_c,"p_f_value":"p_"+str(p),"threshold":t})
            result.update({"features":mycols})
            solution.append(result)

        for f in F_values:
            mycols = list()
            for col,val in f_col_values.items():
                if val >= f:
                    mycols.append(col)
            if mycols == []:
                continue
            X = df[mycols]
            X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
            best_p,best_c = best_params_logistic(X_train,y_train)
            result = logistic_regression(df,column = "Next Day Close Price GR",threshold = t,C = best_c, penalty = best_p)
            result.update({"penalty":best_p,"C":best_c,"p_f_value":"f_"+str(f),"threshold":t})
            result.update({"features":mycols})
            solution.append(result)

    return solution

In [11]:
total = pd.read_csv(os.path.join(path,"all.csv"))

In [12]:
for _, row in total.iterrows():
    try:
        security_id = str(row['security id'])
        df = pd.read_csv(os.path.join(path,'Data/Stock/'+'gr'+str(security_id)+'.csv'))
        print(security_id)
        df = pre_process_data(df,60)
        column = "Next Day Close Price GR"
        df,column = dependent_column(df,column)
        result = logistic_model(df,column)
        result_df = pd.DataFrame(result)
        result_df.to_csv("logistic_"+str(security_id)+".csv",index=None)
    except Exception as e:
        traceback.print_exc() 


500112
500325
532540
500209
532174
507685
530965
500182
532210
500180
500680
506395
500770
500085
501425
532899
537291
500790
500825
533155
533287
533260
539921
542602


Traceback (most recent call last):
  File "<ipython-input-12-4a5f9dec9f6c>", line 4, in <module>
    df = pd.read_csv(os.path.join(path,'Data/Stock/'+'gr'+str(security_id)+'.csv'))
  File "/opt/conda/lib/python3.7/site-packages/pandas/io/parsers.py", line 605, in read_csv
    return _read(filepath_or_buffer, kwds)
  File "/opt/conda/lib/python3.7/site-packages/pandas/io/parsers.py", line 457, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
  File "/opt/conda/lib/python3.7/site-packages/pandas/io/parsers.py", line 814, in __init__
    self._engine = self._make_engine(self.engine)
  File "/opt/conda/lib/python3.7/site-packages/pandas/io/parsers.py", line 1045, in _make_engine
    return mapping[engine](self.f, **self.options)  # type: ignore[call-arg]
  File "/opt/conda/lib/python3.7/site-packages/pandas/io/parsers.py", line 1862, in __init__
    self._open_handles(src, kwds)
  File "/opt/conda/lib/python3.7/site-packages/pandas/io/parsers.py", line 1363, in _open_handle

532538
500387


Traceback (most recent call last):
  File "<ipython-input-12-4a5f9dec9f6c>", line 4, in <module>
    df = pd.read_csv(os.path.join(path,'Data/Stock/'+'gr'+str(security_id)+'.csv'))
  File "/opt/conda/lib/python3.7/site-packages/pandas/io/parsers.py", line 605, in read_csv
    return _read(filepath_or_buffer, kwds)
  File "/opt/conda/lib/python3.7/site-packages/pandas/io/parsers.py", line 457, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
  File "/opt/conda/lib/python3.7/site-packages/pandas/io/parsers.py", line 814, in __init__
    self._engine = self._make_engine(self.engine)
  File "/opt/conda/lib/python3.7/site-packages/pandas/io/parsers.py", line 1045, in _make_engine
    return mapping[engine](self.f, **self.options)  # type: ignore[call-arg]
  File "/opt/conda/lib/python3.7/site-packages/pandas/io/parsers.py", line 1862, in __init__
    self._open_handles(src, kwds)
  File "/opt/conda/lib/python3.7/site-packages/pandas/io/parsers.py", line 1363, in _open_handle

532689
532706
532163
524715
532488
500124
