In [36]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
%matplotlib inline

In [37]:
# Read the INFY.NS CSV export into a DataFrame; the path is relative to the
# notebook's working directory.
INFY_CSV_PATH = "complete_data_set_v1/INFY.NS.csv"
infy_dataset = pd.read_csv(INFY_CSV_PATH)

In [38]:
# Convert the 'Date' column from text to datetime64 values.
# NOTE(review): the format string only covers year and month; confirm it
# matches how dates are actually written in the CSV.
parsed_dates = pd.to_datetime(infy_dataset['Date'], format='%Y-%m')
infy_dataset['Date'] = parsed_dates

In [39]:
# Force every price/volume column to a numeric dtype. Values that cannot be
# parsed become NaN (errors='coerce') so they can be dropped in a later step.
numeric_columns = ['Adj Close', 'Volume', 'Close', 'Low', 'High', 'Open']
for column_name in numeric_columns:
    infy_dataset[column_name] = pd.to_numeric(
        infy_dataset[column_name], errors='coerce'
    )

In [40]:
# Discard every row that has at least one missing value
# (dropna defaults: axis=0, how='any').
infy_dataset = infy_dataset.dropna()

In [41]:
# Build the prediction target: for each row, the 'Adj Close' value found
# `forecast_out` rows later (intended as the closing price 30 days ahead).
forecast_out = 30  # how far into the future to predict
future_adj_close = infy_dataset['Adj Close'].shift(-forecast_out)
infy_dataset['PriceNextMonth'] = future_adj_close
# The last `forecast_out` rows have no future price after the shift, so
# they are removed.
infy_dataset = infy_dataset[:-forecast_out]

In [42]:
# Pairwise correlation of the numeric columns only. The datetime 'Date'
# column is excluded explicitly: older pandas dropped non-numeric columns
# silently, while pandas >= 2.0 no longer does so inside DataFrame.corr().
infy_dataset.select_dtypes(include=[np.number]).corr()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,PriceNextMonth
Open,1.0,0.997877,0.997125,0.99566,0.968154,-0.082128,0.874661
High,0.997877,1.0,0.996697,0.997984,0.969741,-0.0711,0.876141
Low,0.997125,0.996697,1.0,0.998256,0.970066,-0.116717,0.875029
Close,0.99566,0.997984,0.998256,1.0,0.971249,-0.100262,0.876217
Adj Close,0.968154,0.969741,0.970066,0.971249,1.0,-0.08972,0.92866
Volume,-0.082128,-0.0711,-0.116717,-0.100262,-0.08972,1.0,-0.074321
PriceNextMonth,0.874661,0.876141,0.875029,0.876217,0.92866,-0.074321,1.0


In [43]:
# Keep only 'Adj Close' as the feature; the other price columns are dropped
# because they are highly correlated with it.
adj_close_values = infy_dataset['Adj Close'].values
# scikit-learn expects a 2-D feature matrix, so reshape to (n_samples, 1).
X = adj_close_values.reshape(-1, 1)
# Feature scaling is currently disabled:
# X = preprocessing.scale(X)

In [44]:
# Target vector: a standalone NumPy copy of the 'PriceNextMonth' column.
y = infy_dataset['PriceNextMonth'].values.copy()

In [45]:
# Hold out 20% of the rows for testing. The seed is fixed so that the split,
# and therefore every score reported below, is the same on each
# Restart & Run All.
# NOTE(review): train_test_split shuffles rows by default; if the rows are in
# chronological order, a time-ordered split (shuffle=False) may be a fairer
# test of forecasting - confirm before relying on these scores.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [50]:
def evaluate_model(model,X_train, X_test, y_train, y_test):
    """Print three quality metrics for an estimator.

    Printed, in order:
      1. ``model.score`` on the test split;
      2. the mean of a 5-fold ``cross_val_score`` on the training split,
         followed by twice the standard deviation of the fold scores;
      3. the mean absolute error of ``model.predict`` on the test split.

    Parameters
    ----------
    model : estimator exposing ``score`` and ``predict``; it is scored
        without being fitted here, so it is expected to be fitted already.
    X_train, y_train : training features and targets (cross-validation only).
    X_test, y_test : held-out features and targets.

    Returns
    -------
    None. The metrics are only printed.
    """
    holdout_score = model.score(X_test, y_test)
    print("score: ", holdout_score)

    fold_scores = cross_val_score(model, X_train, y_train, cv=5)
    cv_summary = (fold_scores.mean(), fold_scores.std() * 2)
    print("cross_val_score: %0.2f (+/- %0.2f)" % cv_summary)

    test_predictions = model.predict(X_test)
    print("mean_absolute_error: ", mean_absolute_error(y_test, test_predictions))

In [51]:
# LinearRegression: fit an ordinary least-squares model on the training
# split (fit() returns the estimator itself), then report its metrics.
clf = LinearRegression().fit(X_train, y_train)
evaluate_model(clf, X_train, X_test, y_train, y_test)

score:  0.865207675534
cross_val_score: 0.86 (+/- 0.02)
mean_absolute_error:  47.1501303897


In [52]:
# RandomForestRegressor
# The forest is seeded so that its scores are identical on every run;
# without random_state each fit builds different trees. The import lives in
# the notebook's top import cell.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
evaluate_model(model, X_train, X_test, y_train, y_test)

score:  0.842358951868
cross_val_score: 0.85 (+/- 0.02)
mean_absolute_error:  51.426651235


In [49]:
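# We'll go with RandomForestRegressor as the base model. Note that in the runs
# shown above LinearRegression scored slightly better (score 0.865 vs 0.842,
# mean absolute error 47.15 vs 51.43).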