# Stocks Data Analysis

### Importing necessary libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
    #for filename in filenames:
       # print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import datetime
import matplotlib.pyplot as plt
%matplotlib inline

### Download the Data

In [None]:
# Analysis window; both bounds are inclusive for label-based slicing.
start = '2012-01-01'
end = '2017-01-01'


def _load_prices(csv_path):
    """Load one ticker's price CSV, restricted to ``start``..``end``.

    The first CSV column becomes the index and is parsed as dates, so the
    returned frame carries a ``DatetimeIndex``.  Without ``parse_dates`` the
    labels stay plain strings (``dayfirst`` alone does nothing), the window
    slice is only a lexicographic string comparison, and frames whose index
    is converted to datetimes later no longer align with the ones that were
    not converted.

    Args:
        csv_path: Path to the CSV file of a single ticker.

    Returns:
        A new DataFrame with the rows dated between ``start`` and ``end``.
    """
    prices = pd.read_csv(csv_path, index_col=0, parse_dates=True)
    # Sort before slicing: the window bounds need not be present in the
    # index, and label slicing with absent bounds needs a monotonic index.
    return prices.sort_index().loc[start:end].copy()


tesla = _load_prices('../input/stock-market-dataset/stocks/TSLA.csv')
# NOTE(review): on NASDAQ the symbol FORD appears to belong to Forward
# Industries, while Ford Motor Company trades as "F" - confirm this is the
# intended file before drawing conclusions about Ford.
ford = _load_prices('../input/stock-market-dataset/stocks/FORD.csv')
gm = _load_prices('../input/stock-market-dataset/stocks/GM.csv')

### Overview of performance

In [None]:
# Draw the 'Open' column of all three companies on one shared set of axes.
price_ax = tesla['Open'].plot(label="TESLA", figsize=(20, 8))
for legend_name, frame in (("FORD", ford), ("GM", gm)):
    frame['Open'].plot(ax=price_ax, label=legend_name)
price_ax.set_ylabel('Stock Prices')
price_ax.set_title("Stock Prices of Tesla, Ford and GM")
price_ax.legend()

In [None]:
# Same layout as the price chart, this time for the 'Volume' column.
volume_ax = tesla['Volume'].plot(label="TESLA", figsize=(20, 8))
ford['Volume'].plot(ax=volume_ax, label="FORD")
gm['Volume'].plot(ax=volume_ax, label="GM")
volume_ax.set_ylabel('Volume Traded')
volume_ax.set_title("Volume Traded by Tesla, Ford and GM")
volume_ax.legend()

Ford had a big spike in traded volume towards the end of 2013. Apparently some aggressive move (the exact cause is not pinned down here) made the Ford stock price fall, and the volume traded increased as a result.

In [None]:
# Show, as a one-row frame, the day on which Ford's volume peaked.
peak_volume_pos = ford['Volume'].argmax()
ford.iloc[[peak_volume_pos]]

In [None]:
# Plot 'Open' for rows 1175-1199 (positional window).
# NOTE(review): presumably chosen to bracket the volume spike found above - confirm.
ford['Open'].iloc[1175:1200].plot()

### Market Cap

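To understand how the market values a company, we would like to look at its market cap (share price × number of shares outstanding).

We do not have the market cap in our current data, so we use `Volume * Open` instead. This is the approximate dollar value traded each day — not the true market cap — but it is a useful visual indicator of how much money flows through each stock.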

In [None]:
# Add a 'Total Traded' column (Volume times Open) to each company's frame.
for frame in (tesla, ford, gm):
    frame['Total Traded'] = frame['Volume'].mul(frame['Open'])

tesla.head()

In [None]:
# Overlay the 'Total Traded' series of the three companies.
traded_ax = tesla['Total Traded'].plot(label="TESLA", figsize=(20, 8))
for legend_name, frame in (("FORD", ford), ("GM", gm)):
    frame['Total Traded'].plot(ax=traded_ax, label=legend_name)
traded_ax.set_ylabel('Total Traded')
traded_ax.set_title("Approx market cap of Tesla, Ford and GM")
traded_ax.legend()

Something unusual happened with Tesla stock that caused a spike in the total traded value around the end of 2013.

In [None]:
# Positional (0-based) row number of Tesla's largest 'Total Traded' value.
tesla_total_traded = tesla['Total Traded']
tesla_total_traded.argmax()

In [None]:
# Show, as a one-row frame, the day with Tesla's largest 'Total Traded' value.
peak_traded_pos = tesla['Total Traded'].argmax()
tesla.iloc[[peak_traded_pos]]

Share prices soared for the reason described in the article below.

[https://money.cnn.com/2014/02/25/investing/tesla-record-high/](https://money.cnn.com/2014/02/25/investing/tesla-record-high/)

### Moving Average

Data can sometimes be very noisy, so we use a moving average to smooth the curve over large intervals and make the underlying trend easier to trust.

In [None]:
# Plot GM's 'Open' for rows 1000-1099 (positional window) to see the raw series up close.
gm.iloc[1000:1100]['Open'].plot()

In [None]:
# 50-row rolling mean of GM's 'Open', drawn together with the raw series.
gm['MA50'] = gm['Open'].rolling(window=50).mean()
ma_ax = gm['MA50'].plot(label='MA50', figsize=(20, 8))
gm['Open'].plot(ax=ma_ax)
ma_ax.legend()

### Correlation and Scatter Matrix

all stocks we are currently analysing are of the automobile industry, so they will be correlated in some way.

we can use a scatter matrix plot to see this.

In [None]:
from pandas.plotting import scatter_matrix

In [None]:
# Side-by-side 'Open' columns of the three companies, one column per company.
car_comp = pd.concat(
    [tesla['Open'], gm['Open'], ford['Open']],
    axis=1,
    keys=['Tesla', 'GM', 'Ford'],
)
car_comp.head()

In [None]:
# Pairwise scatter plots of the 'Open' columns, with 50-bin histograms on the diagonal.
pd.plotting.scatter_matrix(car_comp, figsize=(10, 10), hist_kwds=dict(bins=50))

scatter plots help us in finding out if any of the variables have linear correlation between each other.

in the above plots we cannot see any plot to be even somewhat linearly correlated.

### Japanese CandleStick Charts

Helps us in reading changes in stock or currency prices or items like that.

[https://en.wikipedia.org/wiki/Candlestick_chart](https://en.wikipedia.org/wiki/Candlestick_chart)

In [None]:
pip install mplfinance

In [None]:
from mplfinance.original_flavor import candlestick_ohlc
from matplotlib.dates import DateFormatter, date2num, WeekdayLocator, DayLocator, MONDAY

# Candlestick chart of Ford for January 2012.
#
# Give every frame a DatetimeIndex, not only Ford.  Converting Ford alone
# would leave Tesla/GM with a different index type, and later cells that
# combine the frames (pd.concat(axis=1), shared-axis plots) would then fail
# to line the rows up.  pd.to_datetime is a no-op on an index that already
# holds datetimes.
for frame in (ford, tesla, gm):
    frame.index = pd.to_datetime(frame.index)

ford_reset = ford.loc['2012-01':'2012-01'].reset_index()
# The candlestick rows carry Matplotlib date numbers as their x position.
ford_reset['date_ax'] = ford_reset['Date'].map(date2num)
ohlc_columns = ['date_ax', 'Open', 'High', 'Low', 'Close']
ford_values = [tuple(row) for row in ford_reset[ohlc_columns].to_numpy()]

mondays = WeekdayLocator(MONDAY)
alldays = DayLocator()
weekFormatter = DateFormatter('%b %d')
dayFormatter = DateFormatter('%d')  # kept for optional minor-tick labels; not attached below

fig, ax = plt.subplots(figsize=(18, 10))
# Attach the locators/formatter; without this the x axis shows the raw date
# numbers instead of readable dates.
ax.xaxis.set_major_locator(mondays)
ax.xaxis.set_minor_locator(alldays)
ax.xaxis.set_major_formatter(weekFormatter)
candlestick_ohlc(ax, ford_values, width=0.6, colorup='g', colordown='r')

### Daily Percentage Change

# $r_t = \frac{p_t}{p_{t-1}} - 1$

$ r_t $ is the return at time t

$ p_t $ is the price at time t

In [None]:
# Daily return: each 'Close' divided by the previous row's 'Close', minus one.
for frame in (tesla, ford, gm):
    close = frame['Close']
    frame['returns'] = close.div(close.shift(1)).sub(1)
tesla.head()

### Volatility

The width of the histogram tells us how volatile a particular stock is.

The narrower the histogram, the more stable the stock price is.

In [None]:
# Histogram of Ford's daily returns using 50 bins.
ford_returns = ford['returns']
ford_returns.hist(bins=50, figsize=(10, 6))

variance is the square of std deviation

In [None]:
# Histogram of GM's daily returns using 50 bins.
gm.loc[:, 'returns'].hist(figsize=(10, 6), bins=50)

In [None]:
# Histogram of Tesla's daily returns using 50 bins.
hist_options = {'bins': 50, 'figsize': (10, 6)}
tesla['returns'].hist(**hist_options)

In [None]:
# Overlay the three return histograms (100 bins each, half transparent)
# on one set of axes, zoomed to returns between -0.2 and 0.2.
overlay_ax = ford['returns'].hist(bins=100, label='FORD', alpha=0.5, figsize=(20, 10))
overlay_ax.set_xlim((-0.2, 0.2))
for legend_name, frame in (('GM', gm), ('TESLA', tesla)):
    frame['returns'].hist(ax=overlay_ax, bins=100, label=legend_name, alpha=0.5)
overlay_ax.legend()

from the above histogram we can infer that Ford stocks are the most volatile among the three stocks.

to see the histogram more clearly we can use a kde curve.

In [None]:
# Kernel density estimate of each company's returns on shared axes.
density_ax = tesla['returns'].plot.kde(label='Tesla', figsize=(20, 10), xlim=(-0.2, 0.2))
ford['returns'].plot.kde(ax=density_ax, label='Ford')
gm['returns'].plot.kde(ax=density_ax, label='GM')
density_ax.legend()

"kernel", kde shows us a normalized version of the data in the histogram

KDE (kernel density estimate) shows an estimate of the probability density function of the returns.

through the kde we can see that FORD has the widest curve, thus it is the most volatile

### Box Plots

In [None]:
# One 'returns' column per company, then a box plot of each column.
box_df = pd.concat(
    [tesla['returns'], ford['returns'], gm['returns']],
    axis=1,
    keys=['Tesla', 'Ford', 'GM'],
)
box_df.plot.box(figsize=(20, 8))

+ find more about box plots

from the above box plot we can see that for the given interval, ford is more volatile than both tesla and gm.

In [None]:
# Pairwise scatter plots of the daily returns, with 50-bin histograms on the diagonal.
pd.plotting.scatter_matrix(box_df, alpha=0.5, figsize=(12, 12), hist_kwds=dict(bins=50))

If the points in the Ford–Tesla panel fell along a line, the two stocks might be linearly correlated, but that is not the case: we cannot see any correlation in the above graph.

## Cumulative Return

## $i_t = (1+r_t)i_{t-1}$

draw tables explaining cumulative return

In [None]:
# Cumulative return: running product of (1 + daily return) for each company.
for frame in (tesla, ford, gm):
    growth_factor = frame['returns'].add(1)
    frame['cumulative return'] = growth_factor.cumprod()
gm.head()

In [None]:
# Cumulative return of all three companies on shared axes.
growth_ax = tesla['cumulative return'].plot(label='Tesla')
ford['cumulative return'].plot(ax=growth_ax, label='Ford')
gm['cumulative return'].plot(ax=growth_ax, label='GM')
growth_ax.legend()

In [None]:
# Display the Ford frame, including the derived columns
# ('Total Traded', 'returns', 'cumulative return') assigned in earlier cells.
ford

In [None]:
# Ford's cumulative return on a chart of its own.
ford_growth = ford['cumulative return']
ford_growth.plot.line(label='FORD', figsize=(12, 6))

In [None]:
# Cumulative return of GM and Tesla only; Ford is deliberately left off
# this chart (it is plotted separately).
pair_ax = gm['cumulative return'].plot(label='GM', figsize=(12, 6))
tesla['cumulative return'].plot(ax=pair_ax, label='Tesla')
pair_ax.legend()