In [None]:
%%capture
import pandas as pd
import numpy as np 
import os
import time 
import re
import datetime
import statsmodels.api as sm
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.linear_model import Ridge,Lasso,ElasticNet,LassoCV,RidgeCV,LinearRegression

from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel, SequentialFeatureSelector
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVR,SVR
from sklearn import preprocessing
from keras.models import Sequential
from keras.layers import Dense,LSTM,Dropout
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
from sklearn import metrics
from sklearn.neighbors import DistanceMetric
from sklearn.metrics import precision_score, make_scorer,f1_score
import traceback
import multiprocessing
from multiprocessing.pool import ThreadPool
import warnings
warnings.filterwarnings("ignore")


In [None]:
def pre_process_data(df, null_threshold):
    """
    Clean a feature frame and return the result as a new dataframe.

    Steps, in this order:
      1. Drop the 'Date' column.
      2. Drop every column whose number of null values is greater than
         ``null_threshold`` percent of the number of rows.
      3. Replace +inf / -inf with NaN.
      4. Drop every row that still contains a null value.

    The null count in step 2 is taken before step 3, so infinite values do
    not count towards the threshold of a column.

    Parameters
    ----------
    df : dataframe
        input data; must contain a 'Date' column. It is left unmodified.

    null_threshold : numeric
        maximum percentage (0-100) of null values a column may contain
        before it is dropped.

    Returns
    -------
    df : dataframe
        a new dataframe after performing all the operations.

    Raises
    ------
    KeyError
        if ``df`` has no 'Date' column.
    """
    # Work on derived frames only: the caller's dataframe must not change.
    cleaned = df.drop(columns=['Date'])

    max_nulls = null_threshold * cleaned.shape[0] / 100
    null_counts = cleaned.isnull().sum()
    # Positional boolean mask (to_numpy) avoids label alignment on columns.
    keep = (null_counts <= max_nulls).to_numpy()
    cleaned = cleaned.loc[:, keep]

    cleaned = cleaned.replace([np.inf, -np.inf], np.nan)
    return cleaned.dropna(axis=0)

In [None]:
def error_metrics(y_true, y_pred):
    """
    Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        observed target values.

    y_pred : array-like
        predicted target values.

    Returns
    -------
    dict
        keys "root_mean_squared_error", "mean_absolute_error",
        "mean_squared_error" and "r2_score".
    """
    squared_error = metrics.mean_squared_error(y_true, y_pred)
    return {
        # RMSE is the square root of the MSE computed above.
        "root_mean_squared_error": squared_error ** 0.5,
        "mean_absolute_error": metrics.mean_absolute_error(y_true, y_pred),
        "mean_squared_error": squared_error,
        "r2_score": metrics.r2_score(y_true, y_pred),
    }

In [None]:
def split_dataset(X,Y,t):
    """
    Split X and Y sequentially (no shuffling) into a leading and a trailing part.

    The first ``int(len(X) * t)`` items form the first part; the items from
    there up to position ``len(X)`` form the second part. The same positions
    are used for both X and Y.

    Returns
    -------
    tuple
        (X first part, X second part, Y first part, Y second part)
    """
    total = len(X)
    cut = int(total * t)
    head = slice(None, cut)
    tail = slice(cut, total)
    return (X[head], X[tail], Y[head], Y[tail])

In [None]:
def remove_next_columns(df,column):
    """
    Drop every column whose name contains "next" (case-insensitive), except
    the target ``column``, which is kept as the last column.

    The target is excluded from the kept list before being appended, so it
    appears exactly once even when its name does not contain "next".

    Parameters
    ----------
    df : dataframe
        input data; column names must be strings.

    column : str
        name of the target column to keep.

    Returns
    -------
    tuple
        (filtered dataframe, column)

    Raises
    ------
    KeyError
        if ``column`` is not a column of ``df``.
    """
    cols = [col for col in df.columns if "next" not in col.lower() and col != column]
    cols.append(column)
    return (df[cols], column)

def remove_cp_columns(df):
    """
    Return ``df`` without the columns whose name starts with "cp"
    (case-insensitive). Column names must be strings.
    """
    kept = []
    for name in df.columns:
        if name.lower().startswith("cp"):
            continue
        kept.append(name)
    return df[kept]

def remove_previous_columns(df,column):
    """
    Drop every column whose name starts with "previous" (case-insensitive),
    except the target ``column``, which is kept as the last column.

    The target is excluded from the kept list before being appended, so it
    appears exactly once. Without that exclusion, a target that does not
    start with "previous" would be selected twice.

    Parameters
    ----------
    df : dataframe
        input data; column names must be strings.

    column : str
        name of the target column to keep.

    Returns
    -------
    dataframe
        the filtered dataframe.

    Raises
    ------
    KeyError
        if ``column`` is not a column of ``df``.
    """
    cols = [
        col for col in df.columns
        if not col.lower().startswith("previous") and col != column
    ]
    cols.append(column)
    return df[cols]

def remove_max_avg_min_columns(df):
    """
    Return ``df`` without the columns whose name starts with "max", "avg"
    or "min" (case-insensitive). Column names must be strings.
    """
    excluded_prefixes = ("max", "avg", "min")
    kept = [
        name for name in df.columns
        if not name.lower().startswith(excluded_prefixes)
    ]
    return df[kept]

In [None]:
def build_lstm(input_shape,optimizer,activation,dropout_rate,neurons,layers):
    """
    Build and compile a stacked-LSTM Keras model with a single output unit.

    Architecture, in order: LSTM (returning sequences) -> Dropout -> LSTM ->
    Dropout -> ``layers`` repetitions of (Dense -> Dropout) -> Dense(1).

    Parameters
    ----------
    input_shape : tuple
        input shape passed to the first LSTM layer.
    optimizer : str or optimizer instance
        optimizer passed to ``model.compile``.
    activation : str
        activation of the final single-unit Dense layer.
    dropout_rate : float
        rate used by every Dropout layer.
    neurons : int
        number of units in each LSTM layer and each hidden Dense layer.
    layers : int
        number of hidden (Dense -> Dropout) pairs after the LSTM layers.

    Returns
    -------
    model
        the compiled Sequential model (loss: mean squared error).
    """
    stack = [
        LSTM(neurons, return_sequences=True, input_shape=input_shape),
        Dropout(dropout_rate),
        LSTM(neurons),
        Dropout(dropout_rate),
    ]
    for _ in range(layers):
        stack.append(Dense(neurons))
        stack.append(Dropout(dropout_rate))
    stack.append(Dense(units=1, activation=activation))

    model = Sequential(stack)
    # NOTE(review): "accuracy" is tracked although the loss is a regression
    # loss; confirm whether a regression metric was intended.
    model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=["accuracy"])
    return model

def split_dataset(X,Y,t):
    """
    Sequentially split X and Y at position ``int(len(X) * t)``.

    NOTE: a function with this name is also defined in an earlier cell; this
    later definition is the one in effect once all cells have run.

    Returns
    -------
    tuple
        (X before the cut, X from the cut up to len(X),
         Y before the cut, Y from the cut up to len(X))
    """
    n_first = int(len(X) * t)
    n_second = len(X) - n_first
    stop = n_first + n_second
    x_parts = (X[:n_first], X[n_first:stop])
    y_parts = (Y[:n_first], Y[n_first:stop])
    return x_parts + y_parts

def _make_windows(x, y, units):
    # Cut x into overlapping windows of `units` consecutive rows and pair each
    # window with the y value located right after the window.
    n_windows = max(x.shape[0] - units, 0)
    if n_windows == 0:
        # Not enough rows for a single window: return correctly shaped empty
        # arrays instead of failing on a 1-D empty array.
        return np.empty((0, units, x.shape[1])), np.empty((0, 1))
    x_windows = np.array([x.iloc[i:i + units, :].to_numpy() for i in range(n_windows)])
    y_values = np.array([y.iloc[i + units] for i in range(n_windows)])
    return x_windows, np.reshape(y_values, (y_values.shape[0], 1))


def reshape_data(x_train, x_test, y_train, y_test,units = 30):
    """
    Turn tabular train/test data into sliding windows for a sequence model.

    For each split, window ``i`` holds rows ``i .. i + units - 1`` of the
    feature frame and is paired with the target at row ``i + units``, so a
    split with ``n`` rows yields ``max(n - units, 0)`` windows.

    Parameters
    ----------
    x_train, x_test : dataframe
        feature frames (indexed positionally via ``.iloc``).
    y_train, y_test : series
        targets aligned positionally with the matching feature frame.
    units : int
        number of consecutive rows per window.

    Returns
    -------
    tuple
        (x_train windows, x_test windows, y_train, y_test) as numpy arrays;
        feature arrays have shape (windows, units, n_features) and target
        arrays have shape (windows, 1). A split with no more than ``units``
        rows yields arrays with zero windows.
    """
    my_x_train, my_y_train = _make_windows(x_train, y_train, units)
    my_x_test, my_y_test = _make_windows(x_test, y_test, units)
    return (my_x_train, my_x_test, my_y_train, my_y_test)

def rnnregression(df,column):
    """
    Train an LSTM regressor on ``df`` to predict ``column`` and evaluate it.

    The data is split sequentially (first 70% for training), windowed into
    sequences of 30 rows, and fitted for 25 epochs with batch size 32.

    Parameters
    ----------
    df : dataframe
        features plus the target column.
    column : str
        name of the target column.

    Returns
    -------
    dict
        the error metrics from ``error_metrics`` plus "actual" and
        "predicted" lists for the windowed test split.
    """
    features = df.drop(columns=[column])
    target = df[column]

    splits = split_dataset(features, target, 0.70)
    X_train, X_test, Y_train, Y_test = reshape_data(*splits, units=30)

    # (rows per window, number of features)
    window_shape = (X_train.shape[1], X_train.shape[2])
    model = build_lstm(window_shape, "adam", 'relu', 0.2, 32, 2)
    model.fit(X_train, Y_train, epochs=25, batch_size=32, verbose=0)
    Y_pred = model.predict(X_test)

    result = error_metrics(Y_test, Y_pred)
    # Both arrays are indexed per row; take the first element of each row.
    result.update({"actual": [row[0] for row in Y_test]})
    result.update({"predicted": [row[0] for row in Y_pred]})
    return result

In [None]:
def run_linear(X_train, X_test, Y_train, Y_test,num,col):
    """
    Fit sequential feature selection + linear regression and evaluate it.

    Parameters
    ----------
    X_train, X_test : dataframe
        training / test features.
    Y_train, Y_test : series
        training / test targets.
    num : int or float
        passed to ``SequentialFeatureSelector`` as ``n_features_to_select``.
    col : str
        name of the predicted column, stored in the result.

    Returns
    -------
    dict
        error metrics plus "selected_features", "numoffeatures",
        "predicted_column", "model" ("linear"), "actual" and "predicted".
    """
    selector = SequentialFeatureSelector(LinearRegression(), n_jobs=None, n_features_to_select=num)
    steps = [
        ("feature_selection", selector),
        ("linear_regression", LinearRegression()),
    ]
    model = Pipeline(steps)
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)

    # Boolean support mask of the fitted selector -> names of the kept columns.
    support = model.named_steps["feature_selection"].get_support()
    chosen = X_train.columns[support].tolist()

    result = error_metrics(Y_test, Y_pred)
    result.update({
        "selected_features": chosen,
        "numoffeatures": len(chosen),
        "predicted_column": col,
        "model": "linear",
        "actual": Y_test.values.tolist(),
        "predicted": Y_pred.tolist(),
    })
    return result

In [None]:
def run_knn(X_train, X_test, Y_train, Y_test,num,col):
    """
    Fit sequential feature selection + k-nearest-neighbours regression and
    evaluate it.

    Parameters
    ----------
    X_train, X_test : dataframe
        training / test features.
    Y_train, Y_test : series
        training / test targets.
    num : int or float
        passed to ``SequentialFeatureSelector`` as ``n_features_to_select``.
    col : str
        name of the predicted column, stored in the result.

    Returns
    -------
    dict
        error metrics plus "selected_features", "numoffeatures",
        "predicted_column", "model" ("knn"), "actual" and "predicted".
    """
    selector = SequentialFeatureSelector(KNeighborsRegressor(), n_jobs=-1, n_features_to_select=num)
    steps = [
        ("feature_selection", selector),
        ("knn_regression", KNeighborsRegressor()),
    ]
    model = Pipeline(steps)
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)

    # Boolean support mask of the fitted selector -> names of the kept columns.
    support = model.named_steps["feature_selection"].get_support()
    chosen = X_train.columns[support].tolist()

    result = error_metrics(Y_test, Y_pred)
    result.update({
        "selected_features": chosen,
        "numoffeatures": len(chosen),
        "predicted_column": col,
        "model": "knn",
        "actual": Y_test.values.tolist(),
        "predicted": Y_pred.tolist(),
    })
    return result

In [None]:
def run_svr(X_train, X_test, Y_train, Y_test,num,col):
    """
    Fit sequential feature selection + support vector regression and
    evaluate it.

    Parameters
    ----------
    X_train, X_test : dataframe
        training / test features.
    Y_train, Y_test : series
        training / test targets.
    num : int or float
        passed to ``SequentialFeatureSelector`` as ``n_features_to_select``.
    col : str
        name of the predicted column, stored in the result.

    Returns
    -------
    dict
        error metrics plus "selected_features", "numoffeatures",
        "predicted_column", "model" ("svr"), "actual" and "predicted".
    """
    # The pipeline and its final step are named after the estimator they hold
    # (SVR), matching the naming used by the linear and KNN runners.
    svr_pipeline = Pipeline([("feature_selection",SequentialFeatureSelector(SVR(),n_jobs=-1,n_features_to_select=num)),("svr_regression",SVR())])
    svr_pipeline.fit(X_train,Y_train)
    Y_pred = svr_pipeline.predict(X_test)
    result = error_metrics(Y_test,Y_pred)
    selected_features = X_train.columns[svr_pipeline["feature_selection"].get_support()].tolist()
    result.update({"selected_features":selected_features})
    result.update({"numoffeatures":len(selected_features)})
    result.update({"predicted_column":col})
    result.update({"model":"svr"})
    result.update({"actual":Y_test.values.tolist()})
    result.update({"predicted":Y_pred.tolist()})
    return result

In [None]:
def run_models(df,col):
    """
    Train and evaluate the linear, KNN and SVR pipelines for one target column.

    The number of days is parsed from the first run of digits in ``col``; rows
    whose 'Date' lies within that many days of the first or last date are
    excluded. The remaining rows are cleaned with ``pre_process_data``
    (60% null threshold), coerced to numeric, stripped of the other "next"
    columns and split sequentially 70/30 before the models are run.

    Parameters
    ----------
    df : dataframe
        input data with a datetime 'Date' column, a 'Close Price' column and
        the target column. It is not modified.
    col : str
        name of the target column; must contain a number of days.

    Returns
    -------
    list of dict
        one result per model, in the order linear, knn, svr. Each holds the
        fields produced by the model runner plus "close" (close prices) and
        "date" (YYYY-MM-DD strings) for the test rows.

    Raises
    ------
    IndexError
        if ``col`` contains no digits.
    """
    ref = df.copy()
    days = int(re.findall(r"\d+",col)[0])
    print(col,days)
    start = df['Date'].iloc[0] + datetime.timedelta(days = days)
    end = df['Date'].iloc[-1] - datetime.timedelta(days = days)
    # Explicit copy: the cleaning below must never act on a view of the
    # frame that is shared with other callers.
    df = df[df.Date.between(start,end)].copy()
    df = pre_process_data(df, 60)
    df[df.columns] = (df[df.columns].astype(str)).apply(pd.to_numeric, errors='coerce')
    df,column = remove_next_columns(df,col)
    X = df.drop(columns=[column])
    Y = df[column]
    X_train, X_test, Y_train, Y_test = split_dataset(X, Y,0.70)
    # Passed to SequentialFeatureSelector as n_features_to_select.
    num = 0.33

    # Close price and date of the test rows, looked up once in the untouched
    # copy (the cleaned frame no longer has a 'Date' column).
    test_rows = ref.loc[X_test.index]
    close = test_rows['Close Price'].values.tolist()
    dates = test_rows['Date'].apply(lambda row : row.strftime('%Y-%m-%d')).values.tolist()

    result = []
    # Each runner is called exactly once, so the third entry is the SVR
    # result rather than a repeated KNN run.
    for runner in (run_linear, run_knn, run_svr):
        res = runner(X_train, X_test, Y_train, Y_test,num,column)
        # Give every result its own list objects.
        res.update({"close":list(close)})
        res.update({"date":list(dates)})
        result.append(res)
    return result

In [None]:
# Columns selected from every input CSV before modelling (the main loop
# does df[necessary_columns]); includes 'Date', 'Close Price' and all
# columns listed in columns_to_predict.
necessary_columns = ["Date","Close Price","Previous 360 days UB","Min Inc % in 180 days","Next 60 days LB","Previous 720 days UB","No. of Trades GR","CP % LV 180 days","Max Inc % in 180 days","Next 1080 days LB","CP % BA 180 days","Next Day Low Price GR","Max Dec % in 90 days","Expenditure GR","CP % HV 90 days","Min Dec % in 365 days","Max Dec % in 365 days","CP % HV 7 days","CP % BA 7 days","Avg Inc % in 365 days","Min Inc % in 90 days","Avg Inc % in 180 days","Total Turnover (Rs.) GR","Low Price GR","Previous 1080 days UB","CP % HV 180 days","Next 180 days UB","No.of Shares GR","Previous 60 days UB","CP % BA 90 days","Avg Inc % in 90 days","Sequential Increase %","WAP GR","CP % BA 30 days","Avg Dec % in 180 days","Previous 720 days LB","EPS GR","Deliverable Quantity GR","Next 360 days UB","CP % HV 365 days","Spread Close-Open GR","Min Dec % in 180 days","Next 30 days LB","Sequential Increase","Previous 360 days LB","Alpha GR","CP % LV 365 days","Dividend Value GR","Sequential Decrease","Next 360 days LB","Avg Dec % in 365 days","Net Profit GR","CP % LV 7 days","CP % HV 30 days","% Deli. Qty to Traded Qty GR","Min Inc % in 365 days","Sequential Decrease %","Beta GR","Next 30 days UB","High Price GR","Spread High-Low GR","Income GR","Max Dec % in 180 days","Previous 30 days UB","Next 90 days UB","Next 90 days LB","Next 1080 days UB","Open Price GR","Next 720 days LB","Max Inc % in 365 days","Previous 90 days LB","Previous 90 days UB","Next 60 days UB","Avg Dec % in 90 days","Previous 30 days LB","Previous 1080 days LB","Next Day Open Price GR","Next Day High Price GR","CP % BA 365 days","Max Inc % in 90 days","Revenue GR","CP % LV 30 days","Min Dec % in 90 days","Next 180 days LB","Previous 180 days LB","Close Price GR","CP % LV 90 days","Previous 60 days LB","Previous 180 days UB","Next 720 days UB","Next Day Close Price GR"]
# Target columns: one "LB" and one "UB" column per horizon (in days),
# ordered by horizon with LB before UB.
columns_to_predict = [
    f"Next {horizon} days {bound}"
    for horizon in (30, 60, 90, 180, 360, 720, 1080)
    for bound in ("LB", "UB")
]

In [None]:
# Names of all entries in the input directory, in ascending order.
security_codes = sorted(os.listdir("../input/newdss"))

In [None]:
%%time
# For every input file: load it, run all models for every target column in
# parallel threads, and write the collected results to a CSV file.
for name in security_codes:
    try:
        print(name)
        df = pd.read_csv("../input/newdss/" + name)
        # Reverse the row order and renumber the index from 0.
        # NOTE(review): presumably the files are stored newest-first — confirm.
        df = df.iloc[::-1].reset_index(drop=True)
        df['Date'] = pd.to_datetime(df['Date'])
        df = df[necessary_columns]
        combs = [(df, column) for column in columns_to_predict]
        # The context manager shuts the pool down after each file, so worker
        # threads do not pile up across iterations.
        with ThreadPool(multiprocessing.cpu_count()) as pool:
            result = pool.starmap(run_models,combs)
        resultdf = pd.DataFrame(result)
        # Output file name is the input name without its first two characters.
        resultdf.to_csv(name[2:],index=False)
    except Exception:
        # Best effort: report the failure and continue with the next file.
        # Catching Exception (not a bare except) keeps KeyboardInterrupt and
        # SystemExit able to stop the run.
        traceback.print_exc()
