In [None]:
import numpy as np 
import pandas as pd
import plotly as py
import plotly.graph_objs as go
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected = True)
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)
#########################################################
# Load the three CSV files of the stock-exchange dataset:
#   df   - indexData.csv      (cleaned and plotted in the EDA section)
#   df_p - indexProcessed.csv (used in the Machine Learning section)
#   info - indexInfo.csv      (merged onto df by 'Index' for the EDA)
df = pd.read_csv('../input/stock-exchange-data/indexData.csv')
df_p = pd.read_csv('../input/stock-exchange-data/indexProcessed.csv')
info = pd.read_csv('../input/stock-exchange-data/indexInfo.csv')

# Basic information

In [None]:
# Display the whole index-metadata table loaded from indexInfo.csv.
info

In [None]:
# Preview the first rows of the data loaded from indexData.csv.
df.head()

In [None]:
# Column dtypes and non-null counts; run before the NaN rows are dropped.
df.info()

In [None]:
# Preview the first rows of the index-metadata table.
info.head()

In [None]:
# Discard every row that contains a missing value, then renumber the rows
# 0..n-1 so that positional and label-based lookups agree afterwards.
df = df.dropna().reset_index(drop = True)

In [None]:
# (rows, columns) of df at this point in the notebook.
df.shape

In [None]:
# Day-over-day change features for the raw index data.
#
# The previous version filled these columns one row at a time with chained
# indexing (df[col][k] = value). That is slow on a frame of this size, and the
# chained write is not guaranteed to reach df: it only raises a
# SettingWithCopyWarning (silenced by the global warnings filter) and is a
# no-op once pandas Copy-on-Write is active. The vectorized form below yields
# the same values.
new_features = ['change_day', 'change_day_to_day', '%_change_day_to_day', 'change_volume', '%_change_volume']

# True where the previous row belongs to the same index. The first row of the
# frame and the first row of every index block compare False and keep 0.0.
same_index = df['Index'].eq(df['Index'].shift())
prev_close = df['Close'].shift()
prev_volume = df['Volume'].shift()

# Volume changes are only computed when both rows report a non-zero volume.
has_volume = same_index & df['Volume'].ne(0) & prev_volume.ne(0)

# NOTE(review): change_day (Close - Open) does not need the previous row, yet
# it is left at 0.0 on the first row of each index to preserve the original
# output. Confirm whether that is intended.
df['change_day'] = (df['Close'] - df['Open']).where(same_index, 0.0)
df['change_day_to_day'] = (df['Close'] - prev_close).where(same_index, 0.0)
df['%_change_day_to_day'] = (((df['Close'] / prev_close) * 100) - 100).where(same_index, 0.0)
df['change_volume'] = (df['Volume'] - prev_volume).where(has_volume, 0.0)
df['%_change_volume'] = (((df['Volume'] / prev_volume) * 100) - 100).where(has_volume, 0.0)

# The original columns were explicitly float; keep that dtype.
df[new_features] = df[new_features].astype('float')

# Parse the dates and derive the calendar year.
df['Date'] = pd.to_datetime(df['Date'])
df['year'] = df['Date'].dt.year

df.head()

In [None]:
# Distinct values of the 'Exchange' column in the index metadata.
info['Exchange'].unique()

# EDA

In [None]:
# Attach the index metadata (including the 'Exchange' column used by the
# charts) to every price row; rows are matched on the 'Index' column.
dfMerge = pd.merge(df, info, on = 'Index')

In [None]:
def _style_spines(ax):
    """Hide the top and right spines of ``ax`` and thicken the bottom and left ones."""
    for side in ('right', 'top'):
        ax.spines[side].set_visible(False)
    for side in ('bottom', 'left'):
        ax.spines[side].set_linewidth(1.3)


def _draw_series(ax, data, column, ylabel, color):
    """Plot ``data[column]`` against ``data['Date']`` on ``ax`` in the shared chart style."""
    # Dotted horizontal guide lines, added before the line plot as in the
    # original code.
    ax.grid(color = 'gray', linestyle = ':', axis = 'y', alpha = 0.8, zorder = 0, dashes = (1,7))
    sns.lineplot(x = "Date", y = column, data = data, color = color, linewidth = 0.5, ax = ax)
    ax.set_ylabel(ylabel, size = 14, fontname = 'monospace')
    ax.set_xlabel('')
    plt.setp(ax.get_yticklabels(), size = 12, fontname = 'monospace')
    _style_spines(ax)


def eda():
    """Draw the exploratory charts for every exchange listed in ``info``.

    For each exchange two figures are shown:

    1. the close price over time, titled with the exchange name;
    2. the day-to-day percentage change of the close price (left) and of the
       volume (right), in the top row of a 2x2 grid.

    Reads the notebook-level frames ``info`` (column ``Exchange``) and
    ``dfMerge`` (columns ``Exchange``, ``Date``, ``Close``,
    ``%_change_day_to_day`` and ``%_change_volume``). Returns nothing; the
    figures are shown and then closed.
    """
    colors = {'New York Stock Exchange': '#41729F',
             'NASDAQ': '#5885AF',
             'Hong Kong Stock Exchange': '#274472',
             'Shanghai Stock Exchange': '#C3E0E5',
             'Tokyo Stock Exchange': '#145DA0',
             'Euronext': '#0C2D48',
             'Shenzhen Stock Exchange': '#2E8BC0',
             'Toronto Stock Exchange': '#B1D4E0',
             'National Stock Exchange of India': '#BFD7ED',
             'Frankfurt Stock Exchange': '#60A3D9',
             'Korea Exchange': '#0074B7',
             'SIX Swiss Exchange': '#003B73',
             'Taiwan Stock Exchange': '#0E86D4',
             'Johannesburg Stock Exchange': '#68BBE3'}

    # drop_duplicates keeps the first-seen order and guarantees an exchange
    # that appears on several metadata rows is only plotted once.
    for exchange in info['Exchange'].drop_duplicates():
        # Filter once per exchange instead of once per chart.
        exchange_rows = dfMerge[dfMerge['Exchange'] == exchange]
        # Exchanges missing from the palette get None, i.e. the default color.
        color = colors.get(exchange)

        # Figure 1: close price.
        fig, ax = plt.subplots(figsize = (15, 7))
        ax.set_title(exchange, size = 35, y = 1.03, fontname = 'monospace')
        _draw_series(ax, exchange_rows, 'Close', 'Close price', color)
        plt.show()
        plt.close(fig)

        # Figure 2: percentage changes, placed in the top row of a 2x2 grid
        # (the bottom row is left empty, as in the original layout).
        fig = plt.figure(figsize = (15, 15))
        _draw_series(fig.add_subplot(221), exchange_rows, '%_change_day_to_day', 'Price changes %', color)
        _draw_series(fig.add_subplot(222), exchange_rows, '%_change_volume', 'Volume changes %', color)

        # White text just below the figure; appears to be a spacing hack that
        # pads the bottom of the rendered output.
        fig.text(0.5, -0.001, 'whitespace', color = 'white')
        plt.show()
        plt.close(fig)

In [None]:
# Render the per-exchange charts defined in eda().
eda()

# **Machine Learning**

In [None]:
# Column dtypes and non-null counts of the data loaded from indexProcessed.csv.
df_p.info()

In [None]:
# Drop rows with missing values and renumber the rows 0..n-1.
df_p = df_p.dropna().reset_index(drop = True)

# Day-over-day change features for the processed data.
#
# The previous version filled these columns one row at a time with chained
# indexing (df_p[col][k] = value). That is slow on a frame of this size, and
# the chained write is not guaranteed to reach df_p: it only raises a
# SettingWithCopyWarning (silenced by the global warnings filter) and is a
# no-op once pandas Copy-on-Write is active. The vectorized form below yields
# the same values.
new_features = ['change_day', 'change_day_to_day', '%_change_day_to_day', 'change_volume', '%_change_volume']

# True where the previous row belongs to the same index. The first row of the
# frame and the first row of every index block compare False and keep 0.0.
same_index = df_p['Index'].eq(df_p['Index'].shift())
prev_close = df_p['Close'].shift()
prev_volume = df_p['Volume'].shift()

# Volume changes are only computed when both rows report a non-zero volume.
has_volume = same_index & df_p['Volume'].ne(0) & prev_volume.ne(0)

# NOTE(review): change_day (Close - Open) does not need the previous row, yet
# it is left at 0.0 on the first row of each index to preserve the original
# output. Confirm whether that is intended.
df_p['change_day'] = (df_p['Close'] - df_p['Open']).where(same_index, 0.0)
df_p['change_day_to_day'] = (df_p['Close'] - prev_close).where(same_index, 0.0)
df_p['%_change_day_to_day'] = (((df_p['Close'] / prev_close) * 100) - 100).where(same_index, 0.0)
df_p['change_volume'] = (df_p['Volume'] - prev_volume).where(has_volume, 0.0)
df_p['%_change_volume'] = (((df_p['Volume'] / prev_volume) * 100) - 100).where(has_volume, 0.0)

# The original columns were explicitly float; keep that dtype.
df_p[new_features] = df_p[new_features].astype('float')

df_p['Date'] = pd.to_datetime(df_p['Date'])

df_p.head()

In [None]:
# Keep only the calendar year of each row: add it as 'year', then remove the
# datetime 'Date' column.
df_p = df_p.assign(year = df_p['Date'].dt.year).drop(columns = 'Date')

In [None]:
from sklearn.preprocessing import LabelEncoder  # maps labels to integer codes

# Fit the encoder on the index labels, then overwrite the column with the
# integer codes. `le` stays available for mapping the codes back later.
le = LabelEncoder().fit(df_p['Index'])
df_p['Index'] = le.transform(df_p['Index'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible. The target is 'CloseUSD'; every other column is a feature.
# NOTE(review): the features still include 'Close' (and the change columns
# derived from it). If CloseUSD is just Close converted to USD, this leaks
# the target into the inputs - confirm.
# NOTE(review): the split is random, not chronological, although the rows
# are dated observations - confirm that this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    df_p.drop(columns = 'CloseUSD'),
    df_p['CloseUSD'],
    test_size = 0.2,
    random_state = 42,
)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the training rows and predict the
# held-out rows.
reg = LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

# Score the predictions against the held-out targets.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# One fitted coefficient per feature column.
print('Coefficients: \n', reg.coef_)
print('Mean squared error: %.2f' % mse)
# Coefficient of determination: 1 is a perfect prediction.
print('Coefficient of determination: %.2f' % r2)