In [ ]:
# importing required packages
import pandas_profiling
import numpy as np
import pandas as pd # data processing, CSV file IO
import matplotlib.pyplot as plt # Plotting Figures
import seaborn as sns
from pandas.plotting import scatter_matrix

from sklearn.model_selection import train_test_split # Tarihe göre ayırdım, buna gerek yok
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

import datetime
import pandas_datareader.data as web
from pandas import Series, DataFrame
import sklearn
%matplotlib inline
from matplotlib import style
import matplotlib as mpl

import sklearn.preprocessing

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

############ 1 -> EXPLORATORY DATA ANALYSIS (Introduction) ############

# Load the MIGROS price history and keep only rows with a positive traded
# volume whose row label is above 1500. The explicit .copy() makes df an
# independent frame, so the column assignments below do not trigger pandas'
# SettingWithCopyWarning.
df = pd.read_csv('migros.csv')
df = df[(df.Volume > 0) & (df.index > 1500)].copy()

# Second CSV; its 'Dollar-TRY' column is attached to df at the end of this cell
dolarf = pd.read_csv('dolar.csv')

# Drop the Open/High/Low price columns
df = df.drop(columns=['Open', 'High', 'Low'])
# Unparseable dates become NaT instead of raising (errors='coerce')
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d', errors='coerce')
df['Volume'] = df['Volume'].astype(int)

# Indicators
#
# NOTE(review): every indicator below is shifted with a NEGATIVE period, which
# moves values to EARLIER rows: row t holds a statistic computed from rows up
# to t+5 (MA10, EMA) or t+10 (ROC, Volatility). Used as model features these
# columns contain future prices (look-ahead). The original EMA comment said
# "yesterday's EMA", which would be shift(1) -- confirm the intended direction.

# 10-row moving average of Close, shifted 5 rows back
df['MA10'] = df.Close.rolling(window=10).mean().shift(-5)

# Exponential moving average (span 10, needs at least 5 observations), shifted 5 rows back
df['EMA'] = df.Close.ewm(span=10, min_periods=5, adjust=True).mean().shift(-5)

# Rate of change: 10-row percentage change of Close, shifted 10 rows back
df['ROC'] = df.Close.pct_change(10).shift(-10)

# Volatility: 10-row rolling standard deviation of Close, shifted 10 rows back
df['Volatility'] = df.Close.rolling(window=10).std().shift(-10)

# Classification as Up(1)/Down(0): a row counts as "Up" when Close changed by
# more than `threshold` versus the previous row. The first row has no previous
# value (Change is NaN) and therefore falls into class 0.
threshold = -0.01
df['Change'] = df.Close.diff()
df['Up/Down'] = np.where(df.Change > threshold, 1, 0)

# Dollar rate
# NOTE(review): this assignment aligns on the row label, not on the date --
# confirm that the rows of dolar.csv line up with the rows of migros.csv.
df['Dolar'] = dolarf['Dollar-TRY']

df.head() # Shows first 5 rows of the dataframe

In [ ]:
# Display the last 500 rows of the prepared frame
df.tail(500)

In [ ]:
# Column dtypes, non-null counts and memory usage
df.info()

In [ ]:
# Close price after 2019-01-01 together with its two moving-average columns
df19 = df.loc[df['Date'] > "2019-01-01"]

plt.figure(figsize=(15, 10))
plt.grid(True)
for column, legend_label in (
    ('Close', 'Close'),
    ('MA10', 'MA 10 Days'),
    ('EMA', 'Exponential MA 10 Days'),
):
    plt.plot(df19['Date'], df19[column], label=legend_label)
plt.legend(loc=2)

In [ ]:
# Number of rows and columns
print(df.shape)

In [ ]:
# Column labels of the prepared frame
print(df.columns)

In [ ]:
# General information about the frame (dtypes, non-null counts, memory usage).
# info() writes its report itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
df.info()

In [ ]:
# Basic statistics of each numerical column: number of non-missing values,
# mean, standard deviation, min/max, median and the 0.25 / 0.75 quartiles.
df.describe()

In [ ]:
# Summary statistics and distribution plot of the Volume column
volume = df['Volume']
print(volume.describe())
plt.figure(figsize=(10, 10))
sns.distplot(volume, color='g', bins=300, hist_kws={'alpha': 0.4});

In [ ]:
# Summary statistics and distribution plot of the Close column
close = df['Close']
print(close.describe())
plt.figure(figsize=(10, 10))
sns.distplot(close, color='g', bins=300, hist_kws={'alpha': 0.4});

In [ ]:
# Numeric subset of the frame.
# 'number' selects every numeric dtype, so integer columns are kept even on
# platforms where astype(int) / np.where produce int32 instead of int64.
# (The former `list(set(df.dtypes.tolist()))` line computed a value that was
# neither stored nor displayed and has been removed.)
df_num = df.select_dtypes(include='number')
df_num.head()

In [ ]:
# Histogram of every numeric column; the trailing semicolon suppresses the
# textual repr of the returned axes array in the cell output.
df_num.hist(figsize=(16, 20), bins=50, xlabelsize=8, ylabelsize=8);

In [ ]:
# Find the features that are strongly correlated with Close and store them in
# precious_features_list.
# Close's correlation with itself (always 1.0) is removed by label. The earlier
# positional `[:-1]` slice assumed the target was the LAST column; here it
# dropped the last feature instead and left Close in the result.
df_num_corr = df_num.corr()['Close'].drop('Close')
precious_features_list = df_num_corr[abs(df_num_corr) > 0.5].sort_values(ascending=False)
print("There is {} strongly correlated values with Close:\n{}".format(len(precious_features_list), precious_features_list))

In [ ]:
# Scatter every numeric column against Close, five columns per pairplot row
numeric_cols = df_num.columns
cols_per_row = 5
for first_col in range(0, len(numeric_cols), cols_per_row):
    sns.pairplot(
        data=df_num,
        x_vars=numeric_cols[first_col:first_col + cols_per_row],
        y_vars=['Close'],
    )

In [ ]:
# Feature relationships
# A pairplot of all numeric features would be slow and hard to read, so the
# pairwise correlation matrix is shown instead, keeping only the cells with a
# correlation of at least 0.5 or at most -0.4 (everything else is masked).

corr = df_num.corr()
is_strong = (corr >= 0.5) | (corr <= -0.4)

plt.figure(figsize=(12, 10))
sns.heatmap(
    corr.where(is_strong),
    cmap='viridis',
    vmax=1.0,
    vmin=-1.0,
    linewidths=0.1,
    annot=True,
    annot_kws={"size": 8},
    square=True,
);

In [ ]:
# Shows how Close is related to the Volume feature. 
# We'll do this using a crosstab contingency table and also through visual analysis with Seaborn
#pd.crosstab(df['Close'], df['Volume'], margins=True)

In [ ]:
# Plots a correlation matrix of the numeric columns.
# df also holds the datetime 'Date' column; older pandas dropped non-numeric
# columns from corr() silently, pandas >= 2.0 no longer does. Selecting the
# numeric columns explicitly gives the same matrix on every version.
corr = df.select_dtypes(include='number').corr()

ax = sns.heatmap(
    corr, vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)

# Rotate the column labels so they do not overlap
ax.set_xticklabels(
    ax.get_xticklabels(), rotation=45, horizontalalignment='right'
);

# Blue means positive, red means negative
# The stronger the color, the larger the correlation magnitude

In [ ]:
# We can use pandas_profiling for more detailed report
# pandas_profiling.ProfileReport(df)

In [ ]:
# Plot a histogram for every column of the dataframe, i.e. the frequency of
# the values in each column. Each subplot gets its own x and y axis.
# NOTE(review): the magic and the pyplot import below repeat what the first
# cell already does; they are redundant when the notebook is run top to bottom.
%matplotlib inline
import matplotlib.pyplot as plt
sns.set()  # switch to seaborn's default plot theme
df.hist(sharex = False, sharey = False, xlabelsize = 12, ylabelsize = 12, figsize=(20, 20))

In [ ]:
# Closing price over the whole prepared frame.
# 'Date' stays a regular column (it is not set as the index), so it is passed
# explicitly as the x axis.
dates = df['Date']
closes = df['Close']

plt.figure(figsize=(24, 12))
plt.plot(dates, closes, label='Close Price history')

In [ ]:
# info() prints its own report and returns None; wrapping it in print() only
# added a stray "None" line to the output.
df.info()
df.head()

# Linear Regression

In [ ]:
# Chronological split on the row label: labels up to and including 3841 form
# the training set, later labels form the test set.
#
# Rows whose indicator columns are NaN are dropped explicitly. The rolling
# windows and negative shifts leave NaN at both ends of df; forward-filling
# cannot repair the leading ones, and scikit-learn estimators reject NaN in
# both fit() and predict(). The former `df.index > 20` condition was meant to
# discard those rows but never removed anything, because df only holds row
# labels above 1500.
indicator_cols = ['MA10', 'EMA', 'ROC', 'Volatility']
dataset = df[df.index < 3842].dropna(subset=indicator_cols)
testset = df[df.index > 3841].dropna(subset=indicator_cols)

In [ ]:
# Summary statistics of the training rows
dataset.describe()

In [ ]:
# Per column: True if the training rows contain at least one missing value
dataset.isnull().any()

In [ ]:
# Forward-fill gaps in the training rows.
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated in
# recent pandas releases.
dataset = dataset.ffill()

# The same indicator columns feed the training and the test matrix
linear_features = ['MA10', 'EMA', 'ROC', 'Volatility']

X_linear_train = dataset[linear_features].values
y_linear_train = dataset['Close'].values

X_linear_test = testset[linear_features].values
y_linear_test = testset['Close'].values

# A random 80/20 split is not used; the data is split by row label instead
# (row label 3841 corresponds to 2019-11-04):
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [ ]:
# Distribution of the training-set closing price.
# tight_layout() only adjusts what has already been drawn, so it has to run
# after the plot; called on the still-empty figure it had no effect.
plt.figure(figsize=(15, 10))
sns.distplot(dataset['Close'])
plt.tight_layout()

In [ ]:
# Fit ordinary least squares on the training matrix and predict the test rows
regressor = LinearRegression().fit(X_linear_train, y_linear_train)
y_linear_pred = regressor.predict(X_linear_test)

# Actual vs. predicted close per test row, plus the residual (actual - predicted)
predictions = pd.DataFrame({
    'Actual': y_linear_test,
    'Predicted': y_linear_pred,
    'Diff': y_linear_test - y_linear_pred,
})

df1 = predictions
df1

In [ ]:
# Bar chart of actual, predicted and residual values per test row
ax = df1.plot(kind='bar', figsize=(10, 8))
ax.grid(which='major', linestyle='-', linewidth='0.5', color='green')
ax.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()

In [ ]:
# Error metrics of the linear model on the test rows; RMSE is derived from
# the mean squared error.
mae = metrics.mean_absolute_error(y_linear_test, y_linear_pred)
mse = metrics.mean_squared_error(y_linear_test, y_linear_pred)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

In [ ]:
# Real vs. predicted close over the test rows (x axis is the row position)
for series, colour, legend_label in (
    (y_linear_test, 'red', 'Real Price'),
    (y_linear_pred, 'green', 'Predicted Price'),
):
    plt.plot(series, color=colour, label=legend_label)

plt.title('MIGROS Stock Price Prediction')
plt.xlabel('Time')
plt.ylabel('MIGROS Stock Price')
plt.legend()
plt.show()

# Logistic Regression

In [ ]:
# Same indicator columns as the linear model; the target is the binary
# Up/Down column instead of the closing price.
logistic_features = ['MA10', 'EMA', 'ROC', 'Volatility']

X_logistic_train = dataset[logistic_features].values
y_logistic_train = dataset['Up/Down'].values

X_logistic_test = testset[logistic_features].values
y_logistic_test = testset['Up/Down'].values

# Note: this rebinds `regressor`, which held the linear model before
regressor = LogisticRegression(solver='lbfgs').fit(X_logistic_train, y_logistic_train)
y_logistic_pred = regressor.predict(X_logistic_test)

# Actual vs. predicted class per test row
log_predictions = pd.DataFrame({
    'Actual': y_logistic_test,
    'Predicted': y_logistic_pred,
})

df2 = log_predictions
df2

In [ ]:
cm = metrics.confusion_matrix(y_logistic_test, y_logistic_pred)

# Rows are the actual classes and columns the predicted classes, both in
# sorted label order (0 = Down, 1 = Up), so the array is laid out as
#   [[TN, FP],
#    [FN, TP]]

cm

# Other

In [None]:
# Date range for the download
start = datetime.datetime(2000, 1, 1)
end = datetime.datetime(2019, 1, 30)

# Fetch MGROS.IS quotes through pandas_datareader's 'yahoo' source.
# NOTE: this rebinds `df`, replacing the CSV-based frame used by the cells above.
# NOTE(review): requires network access -- confirm the 'yahoo' source still works.
df = web.DataReader("MGROS.IS", 'yahoo', start, end)
df.tail()  # not the last expression of the cell, so nothing is displayed

# Alternative kept for reference: load the data from the local CSV instead
# df = pd.read_csv('migros.csv')
# df.head() # Shows first 5 rows of the dataframe

# Closing price and its 100-row moving average
close_px = df['Close']
mavg = close_px.rolling(window=100).mean()

# Default figure size for all following matplotlib figures
mpl.rc('figure', figsize=(8, 7))
mpl.__version__  # bare expression in the middle of the cell; not displayed

# Adjusting the style of matplotlib
style.use('ggplot')

close_px.plot(label='MIGROS')
mavg.plot(label='mavg')
plt.legend()

In [None]:
# Simple return per row: close divided by the previous row's close, minus one
previous_close = close_px.shift(1)
rets = close_px / previous_close - 1
rets.plot(label='return')

In [None]:
# Competitors: closing prices of five tickers (MGROS.IS itself included) over
# the same start/end range, one column per ticker.
# NOTE(review): requires network access -- confirm the 'yahoo' source still works.
dfcomp = web.DataReader(['BIZIM.IS', 'CRFSA.IS', 'BIMAS.IS', 'SOKM.IS', 'MGROS.IS'],'yahoo',start=start,end=end)['Close']
dfcomp.tail()

In [None]:
# Row-to-row returns of every ticker and their pairwise correlation
retscomp = dfcomp.pct_change()
corr = retscomp.corr()

# Show the correlation matrix itself. The earlier `corr.corr()` plotted the
# correlation OF the correlation matrix, which is not the quantity the tick
# labels describe.
plt.matshow(corr)
plt.xticks(range(len(corr.columns)), corr.columns)
plt.yticks(range(len(corr.columns)), corr.columns)
plt.colorbar()
plt.show()

In [None]:
# Row-to-row returns of every ticker and their pairwise correlation
# (computed up front so the heatmap and the printout use the same values)
retscomp = dfcomp.pct_change()
corr = retscomp.corr()

# Heatmap of the return correlations
plt.imshow(corr, cmap='hot', interpolation='none')
plt.colorbar()
plt.xticks(range(len(corr)), corr.columns)
plt.yticks(range(len(corr)), corr.columns);

print(corr)
print(retscomp.columns)

# Start a new figure: without it the scatter below is drawn onto the heatmap's
# axes and the two plots overlap.
plt.figure()
plt.scatter(retscomp['MGROS.IS'], retscomp['BIMAS.IS'])
plt.xlabel('Returns MGROS')
plt.ylabel('Returns BIMAS')

In [None]:
# Pairwise scatter plots of the ticker returns, with a kernel density estimate
# of each ticker on the diagonal
scatter_matrix(retscomp, diagonal='kde', figsize=(10, 10));

In [None]:
# Mean return (x) against standard deviation of returns (y), one point per ticker
plt.scatter(retscomp.mean(), retscomp.std())
plt.xlabel('Expected returns')
plt.ylabel('Risk')

# Label every point with its ticker, offset from the marker and joined by an arrow
for label, x, y in zip(retscomp.columns, retscomp.mean(), retscomp.std()):
    box_style = {'boxstyle': 'round,pad=0.5', 'fc': 'yellow', 'alpha': 0.5}
    arrow_style = {'arrowstyle': '->', 'connectionstyle': 'arc3,rad=0'}
    plt.annotate(
        label,
        xy=(x, y),
        xytext=(20, -20),
        textcoords='offset points',
        ha='right',
        va='bottom',
        bbox=box_style,
        arrowprops=arrow_style,
    )

# Regression frame: Close and Volume plus two derived percentage columns
high_low_range = df['High'] - df['Low']
open_to_close = df['Close'] - df['Open']

dfreg = df.loc[:, ['Close', 'Volume']]
dfreg['HL_PCT'] = high_low_range / df['Close'] * 100.0       # high-low range as % of close
dfreg['PCT_change'] = open_to_close / df['Open'] * 100.0     # open-to-close move as % of open

dfreg

In [None]:
# Replace missing values with a sentinel (rows are NOT dropped)
dfreg = dfreg.fillna(value=-99999)

# Forecast horizon: 1 percent of the rows, rounded up.
# np.ceil is used because the notebook never imports `math`.
forecast_out = int(np.ceil(0.01 * len(dfreg)))

# The label is the Close value `forecast_out` rows ahead
forecast_col = 'Close'
dfreg['label'] = dfreg[forecast_col].shift(-forecast_out)

# Feature matrix: every column except the label. The keyword form is required
# because the positional axis argument of drop() was removed in pandas 2.0.
X = np.array(dfreg.drop(columns=['label']))

# Standardize each feature column (zero mean, unit variance)
X = sklearn.preprocessing.scale(X)

# The last `forecast_out` rows have no label yet: keep them for forecasting
# (X_lately) and use the earlier rows for model fitting and evaluation.
X_lately = X[-forecast_out:]
X = X[:-forecast_out]

# Labels for the rows kept in X
y = np.array(dfreg['label'])
y = y[:-forecast_out]

In [None]:
# Model Generation
# Hold out 20% of the rows for scoring (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

print(dfreg.head())

# Every model is fitted on the TRAINING part only. Fitting on the full X, y
# (as before) put the held-out rows into the training data, so the scores
# computed on X_test later were optimistically biased.

# Linear regression
clfreg = LinearRegression(n_jobs=-1)
clfreg.fit(X_train, y_train)

# Ridge regression on degree-2 polynomial features
clfpoly2 = make_pipeline(PolynomialFeatures(2), Ridge())
clfpoly2.fit(X_train, y_train)

# Ridge regression on degree-3 polynomial features
clfpoly3 = make_pipeline(PolynomialFeatures(3), Ridge())
clfpoly3.fit(X_train, y_train)

# kNN

In [None]:
# k-nearest-neighbours regressor (k = 2), fitted on the training split
clfknn = KNeighborsRegressor(n_neighbors=2)
clfknn.fit(X_train, y_train)

# score() of each regressor on the held-out split
confidencereg = clfreg.score(X_test, y_test)
confidencepoly2 = clfpoly2.score(X_test, y_test)
confidencepoly3 = clfpoly3.score(X_test, y_test)
confidenceknn = clfknn.score(X_test, y_test)

for score in (confidencereg, confidencepoly2, confidencepoly3, confidenceknn):
    print(score)

# Forecast the unlabeled tail with kNN and prepare an empty Forecast column
forecast_set = clfknn.predict(X_lately)
dfreg['Forecast'] = np.nan

In [None]:
# Append one row per forecast value, dated on consecutive calendar days after
# the last existing row. Every column is NaN except the last one, which
# receives the forecast.
# NOTE: running this cell more than once appends the forecast rows again.
one_day = datetime.timedelta(days=1)

last_date = dfreg.iloc[-1].name
last_unix = last_date
next_unix = last_unix + one_day

for forecast_value in forecast_set:
    next_date = next_unix
    next_unix = next_unix + one_day
    dfreg.loc[next_date] = [np.nan] * (len(dfreg.columns) - 1) + [forecast_value]

# Last 500 rows of the observed close and of the forecast on shared axes
for column in ('Close', 'Forecast'):
    dfreg[column].tail(500).plot()
plt.legend(loc=4)
plt.xlabel('Date')
plt.ylabel('Price')
plt.show()

In [None]:
# ROC AUC of a logistic-regression direction classifier
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot

# `y` holds continuous future prices, but LogisticRegression and ROC AUC need
# class labels. The target used here is the price direction: 1 when the label
# (the Close `forecast_out` rows ahead) is above the current Close, else 0.
# The first len(y) rows of dfreg are the rows X and y were built from.
# NOTE(review): confirm this is the intended classification target.
current_close = dfreg['Close'].values[:len(y)]
y_direction = (y > current_close).astype(int)

# Separate names keep the regression split (X_train, y_train, ...) untouched
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(
    X, y_direction, test_size=0.2, random_state=0)

# generate a no skill prediction (constant score for every sample)
ns_probs = [0 for _ in range(len(y_test_cls))]

# fit a model
model = LogisticRegression(solver='lbfgs')
model.fit(X_train_cls, y_train_cls)

# predict probabilities and keep those of the positive outcome only
lr_probs = model.predict_proba(X_test_cls)[:, 1]

# calculate scores (the earlier code referenced an undefined name `testy`)
ns_auc = roc_auc_score(y_test_cls, ns_probs)
lr_auc = roc_auc_score(y_test_cls, lr_probs)
print('No Skill: ROC AUC=%.3f' % ns_auc)
print('Logistic: ROC AUC=%.3f' % lr_auc)

# The trailing `X = X.reshape(X.shape[1:])` was removed: X is 2-D, so the
# reshape to a 1-D shape of its column count raised ValueError.